def __init__(self, filename):
    # .xls is not structured similarly to .narrowPeak
    if filename.endswith(".xls"):
        self.df = pd.read_csv(filename, comment="#", sep="\t")
        for x in ['chr', 'start', 'end', 'abs_summit', '-log10(pvalue)',
                  'fold_enrichment', '-log10(qvalue)', 'name']:
            # we could also have 'pileup' depending on the user's options
            assert x in self.df.columns
        self.df['stop'] = self.df['end']
    elif filename.endswith("narrowPeak"):
        self.df = pd.read_csv(filename, header=None, sep='\t')
        self.df.columns = ['chr', 'start', 'stop', 'name', 'score', 'NA',
                           'fold_enrichment', '-log10(pvalue)',
                           '-log10(qvalue)', 'abs_summit_from_start']
        self.df['end'] = self.df['stop']
    elif filename.endswith("broadPeak"):
        self.df = pd.read_csv(filename, header=None, sep='\t')
        self.df.columns = ['chr', 'start', 'stop', 'name', 'score', 'NA',
                           'fold_enrichment', '-log10(pvalue)',
                           '-log10(qvalue)']
        self.df['end'] = self.df['stop']
    self.df['length'] = self.df['stop'] - self.df['start']
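# Standalone sketch of the narrowPeak branch above, assuming a MACS2 output
# file named "sample_peaks.narrowPeak" is available locally (the file name is
# an assumption, not part of the source). Whatever the flavour (.xls,
# narrowPeak, broadPeak), the constructor ends up with 'start', 'stop', 'end'
# and 'length' columns.
import pandas as pd

peaks = pd.read_csv("sample_peaks.narrowPeak", header=None, sep="\t")
peaks.columns = ['chr', 'start', 'stop', 'name', 'score', 'NA',
                 'fold_enrichment', '-log10(pvalue)', '-log10(qvalue)',
                 'abs_summit_from_start']
peaks['length'] = peaks['stop'] - peaks['start']
print(peaks[['chr', 'start', 'stop', 'length']].head())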
def __init__(self, names="names.dmp", nodes="nodes.dmp", verbose=True):
    if verbose:
        print("Reading %s" % names)
    self.df_name = pd.read_csv(names, sep='\t', header=None)
    self.df_name = self.df_name[[0, 2, 6]]
    self.df_name.columns = ["taxon", "name", "scname"]

    # This provides a faster lookup table to search for scientific
    # names given a taxon. We can drop rows that are not scientific names
    # and set the taxons as index
    _subdf = self.df_name.query("'scientific name' in scname")
    self._subdf = _subdf.set_index("taxon")

    # Here, this is for general purpose (slower if we were to use
    # this for the get_scientific_name method)
    self._group_name = self.df_name.groupby('taxon').groups

    if verbose:
        print("Reading %s" % nodes)
    self.df_nodes = pd.read_csv(nodes, sep='\t', header=None)
    self.df_nodes = self.df_nodes[[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24]]
    self.df_nodes.columns = ['taxon', 'parent_taxon', 'rank',
                             0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    self._df_nodes_taxon = self.df_nodes.copy()
    self._df_nodes_taxon.set_index('taxon', inplace=True)
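# Standalone sketch of the names.dmp lookup built above, assuming a standard
# NCBI "names.dmp" file in the current directory (the path is an assumption).
# It shows why the constructor keeps a scientific-name-only table indexed by
# taxon: lookups become a simple .loc call.
import pandas as pd

df_name = pd.read_csv("names.dmp", sep="\t", header=None)
df_name = df_name[[0, 2, 6]]
df_name.columns = ["taxon", "name", "scname"]

# keep only the 'scientific name' rows and index them by taxon
subdf = df_name.query("'scientific name' in scname").set_index("taxon")
print(subdf.loc[9606, "name"])  # should print "Homo sapiens" for a standard dump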
def __init__(self, filename):
    self.filename = filename
    self.df = pd.read_csv(self.filename, sep='\t', comment="#", header=None)

    self.metadata = {}
    with open(self.filename, "r") as fin:
        line = fin.readline()
        while line.startswith('#'):
            key, value = line.split(maxsplit=1)
            key = key[1:]  # skip the # character
            try:
                self.metadata[key] = int(value.strip())
            except:
                self.metadata[key] = value.strip()
            if key.endswith("%"):
                try:
                    self.metadata[key] = float(value.strip())
                except:  # pragma: no cover
                    pass
            line = fin.readline()

    try:
        ncounts = self.metadata['N_Count']
        self.metadata['N_Count'] = int(ncounts.split('\t')[0])
        self.metadata['N_Count%'] = float(ncounts.split('\t')[1])
    except Exception as err:  # pragma: no cover
        print(err)
        pass

    self.df.columns = ['Pos'] + self.metadata['Pos'].split('\t')
    del self.metadata['Pos']
    self.df.set_index('Pos', inplace=True)
def __init__(self, data):
    """.. rubric:: constructor

    :param data: it can be a csv filename created by
        sequana.freebayes_vcf_filter or a
        :class:`freebayes_vcf_filter.Filtered_freebayes` object.
    """
    super().__init__()
    self.title = "Variant Calling Report"
    try:
        with open(data, "r") as fp:
            self.filename = data
            line = fp.readline()
            if line.startswith("# sequana_variant_calling"):
                string_dict = line.split(";")[-1].strip()
                try:
                    self.filter_dict = ast.literal_eval(string_dict)
                except SyntaxError:
                    self.filter_dict = None
            self.df = pd.read_csv(fp)
    except FileNotFoundError:
        msg = ("The csv file is not present. Please check that the file "
               "exists.")
        raise FileNotFoundError(msg)
    except TypeError:
        self.df = data.df
        self.filter_dict = data.vcf.filters_params
    self.create_report_content()
    self.create_html("variant_calling.html")
def _parse_data(self):
    taxonomy = {}
    logger.info("Reading kraken data")
    columns = ["status", "taxon", "length"]

    # we select only columns 0, 2, 3 to save memory, which is required on
    # very large files
    try:
        # each call to concat in the for loop below will take time and
        # increase with chunk position. For 15M reads, this has a big cost,
        # so a chunksize of 1M is better than 1000 and still reasonable
        # in terms of memory
        reader = pd.read_csv(self.filename, sep="\t", header=None,
                             usecols=[0, 2, 3], chunksize=1000000)
    except pd.parser.CParserError:
        raise NotImplementedError
        # this section handles the --only-classified-output case when no
        # classified read is found; it is unreachable while the
        # NotImplementedError above is raised
        self.unclassified = N  # size of the input data set
        self.classified = 0
        self._df = pd.DataFrame([], columns=columns)
        self._taxons = self._df.taxon
        return

    for chunk in reader:
        try:
            self._df
            self._df = pd.concat([self._df, chunk])
        except AttributeError:
            self._df = chunk

    self._df.columns = columns

    count = sum(self._df.taxon == 1)
    if count:
        logger.warning("Found %s taxons with root ID (1)" % count)

    # This gives the list of taxons as index and their amount.
    # Above, we selected only columns 0, 2, 3 but the columns are still
    # labelled 0, 2, 3 in the df
    self._taxons = self._df.groupby("taxon").size()
    try:
        self._taxons.drop(0, inplace=True)
    except:
        pass  # 0 may not be there
    self._taxons.sort_values(ascending=False, inplace=True)

    category = self.df.groupby("status").size()
    if 'C' in category.index:
        self.classified = category['C']
    else:
        self.classified = 0
    if 'U' in category.index:
        self.unclassified = category['U']
    else:
        self.unclassified = 0
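# Minimal sketch of the chunked-read pattern used in _parse_data above,
# assuming a tab-separated kraken output named "kraken.out" (the file name is
# an assumption). Here the chunks are concatenated in one go rather than
# inside the loop as in the original.
import pandas as pd

reader = pd.read_csv("kraken.out", sep="\t", header=None,
                     usecols=[0, 2, 3], chunksize=1000000)
df = pd.concat(reader)
df.columns = ["status", "taxon", "length"]

# taxon counts sorted by abundance, plus classified/unclassified totals
taxons = df.groupby("taxon").size().sort_values(ascending=False)
classified = (df["status"] == "C").sum()
unclassified = (df["status"] == "U").sum()
print(taxons.head(), classified, unclassified)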
def __init__(self, directory=".", prefix=""):
    self.prefix = prefix
    self.directory = directory
    self.sample_name = "undefined"

    # low quality isoforms
    filename = "all.polished_lq.fastq"
    self.lq_isoforms = self.get_file(filename)
    if self.lq_isoforms:
        logger.info("Reading {}".format(filename))
        self.lq_sequence = FastQ(self.lq_isoforms)

    # high quality isoforms
    filename = "all.polished_hq.fastq"
    self.hq_isoforms = self.get_file(filename)
    if self.hq_isoforms:
        logger.info("Reading {}".format(filename))
        self.hq_sequence = FastQ(self.hq_isoforms)

    # General info
    filename = "file.csv"
    self.csv = self.get_file(filename)
    if self.csv:
        logger.info("Reading {}".format(filename))
        self.data = pd.read_csv(self.csv)

    # CCS fasta sequence
    #self.ccs = self.get_file("-ccs.tar.gz")
    filename = "ccs.fasta"
    self.ccs = self.get_file(filename, noprefix=True)
    if self.ccs:
        logger.info("Reading {}".format(filename))
        self.ccs = FastA(self.ccs)
def check_and_save_input_tables(self, sep_counts, sep_design):
    try:
        self.outdir.mkdir()
    except:
        pass

    counts = pd.read_csv(self.usr_counts, sep=sep_counts,
                         index_col="Geneid", comment="#").sort_index(axis=1)

    design = RNADesign(self.usr_design, sep=sep_design)
    design = design.df.set_index("label").sort_index()

    if list(counts.columns) != list(design.index):
        logger.error("Counts columns and design rows do not match (after sorting).")
        logger.error(counts.columns)
        logger.error(design.index)
        sys.exit(1)

    counts.to_csv(self.counts_filename)
    design.to_csv(self.design_filename)

    return counts, design
def get_df(self):
    import pandas as pd
    data = {}
    for sample, filename in zip(self.sample_names, self.filenames):
        df = pd.read_csv(filename)
        df = df.groupby("kingdom")['percentage'].sum()

        # if a taxon is obsolete, the kingdom is empty.
        # We will set the kingdom as Unclassified and raise a warning
        # if the count is > 5%
        if " " in df.index:
            percent = df.loc[" "]
            if percent > 5:
                logger.warning(
                    "Found {}% of taxons in obsolete category".format(percent))
            if "Unclassified" in df.index:
                df.loc['Unclassified'] += df.loc[' ']
                df.drop(" ", inplace=True)
            else:
                df.loc['Unclassified'] = df.loc[' ']
                df.drop(" ", inplace=True)
        data[sample] = df

    df = pd.DataFrame(data)
    #df.to_json(output.data)
    df = df.sort_index(ascending=False)
    return df
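# Standalone sketch of the per-sample aggregation above, assuming a csv with
# "kingdom" and "percentage" columns called "sample_kingdom.csv" (assumed
# name). The " " index entry corresponds to obsolete taxons with an empty
# kingdom; it is merged into Unclassified.
import pandas as pd

series = pd.read_csv("sample_kingdom.csv").groupby("kingdom")["percentage"].sum()
if " " in series.index:
    series.loc["Unclassified"] = series.get("Unclassified", 0) + series.loc[" "]
    series = series.drop(" ")
print(series)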
def __init__(self, filename):
    self.df = pd.read_csv(filename, sep='\t', header=None)
    self.df.columns = ['filename', 'num_reads', 'estimated_fragment_length',
                       'corr', 'phantom_peak', 'corr_phantom_peak',
                       'argmin_corr', 'min_corr', 'NSC', 'RSC', 'quality_tag']
def read_align(self, readfile):
    self.df = pd.read_csv(readfile, sep='\t', header=None)
    self.df.columns = ['ref', 'start', 'end', 'dummy', 'quality', 'strand']

    from pylab import median
    self.read_length = round(median(self.df['end'] - self.df['start']))
    self.chromosomes = self.df['ref'].unique()
def __init__(self, filename="full_table_testbusco.tsv"): """.. rubric:: constructor :filename: a valid BUSCO input file (full table). See example in sequana code source (testing) """ self.df = pd.read_csv(filename, sep="\t", skiprows=4)
class Taxonomy(object):
    from sequana import sequana_data  # must be local
    df = pd.read_csv(sequana_data("test_taxon_rtd.csv"), index_col=0)

    def get_lineage_and_rank(self, x):
        # Note that we add the name as well here
        ranks = ['kingdom', 'phylum', 'class', 'order', 'family',
                 'genus', 'species', 'name']
        return [(self.df.loc[x][rank], rank) for rank in ranks]
def _read_csv(self, input_filename):
    """Read the csv generated by :class:`GenomeCov` and create the
    :class:`ChromosomeCov` list.
    """
    # set regexes to get important information about the previous analysis
    re_threshold = re.compile("thresholds:([\d,\.-]+)")
    re_window_size = re.compile("\swindow_size:(\d+)")
    re_circular = re.compile("circular:(\w+)")
    re_gc_window_size = re.compile("gc_window_size:(\d+)")
    re_genbank = re.compile("genbank:([\{0}\w\.\-]+)".format(os.sep))
    re_chrom = re.compile("^# ([\w\-\.]+):")
    re_gaussian = re.compile("(\[\{.+\}\])")

    with open(input_filename, "r") as fp:
        line = fp.readline()

        # check if the file was generated by sequana_coverage
        if not line.startswith("# sequana_coverage"):
            return None

        # get thresholds
        thresholds = re_threshold.findall(line)[0]
        thresholds = [float(f) for f in thresholds.split(',')]
        self.thresholds = DoubleThresholds(*thresholds)

        # get window size
        self.window_size = int(re_window_size.search(line).group(1))

        # get circular
        circular = re_circular.search(line).group(1)
        self.circular = False if circular == "False" else True

        # get gc_window_size
        gc = re_gc_window_size.search(line)
        if gc:
            self.gc_window_size = int(gc.group(1))

        # get genbank
        gb = re_genbank.search(line)
        if gb and not self.genbank_filename:
            self.genbank_filename = gb.group(1)

        # get gaussians for each chromosome
        gaussians_dict = dict()
        for line in fp:
            chrom = re_chrom.search(line)
            if chrom:
                gaussians = re_gaussian.search(line)
                gaussians = ast.literal_eval(gaussians.group(1))
                gaussians_dict[chrom.group(1)] = gaussians
            else:
                break
        df = pd.read_csv(fp, header=None, names=line.strip().split(","))

    chr_list = self._set_chr_list(df)

    # Add gaussians and range information
    for chrom in chr_list:
        chrom.set_gaussians(gaussians_dict[chrom.chrom_name])
        if self.circular:
            chrom.range = [None, None]
        else:
            mid = int(self.window_size / 2)
            chrom.range = [mid, -mid]
        chrom.mixture_fitting = mixture.EM(
            chrom.df['scale'][chrom.range[0]:chrom.range[1]])
    return chr_list
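# Small sketch of the header parsing performed in _read_csv above. The header
# line below is illustrative only; it mimics the format written by
# sequana_coverage but is not taken from a real file.
import re

line = "# sequana_coverage thresholds:-4.0,4.0,0.5,0.5 window_size:20001 circular:False"

thresholds = [float(x) for x in
              re.compile(r"thresholds:([\d,\.-]+)").findall(line)[0].split(",")]
window_size = int(re.compile(r"\swindow_size:(\d+)").search(line).group(1))
circular = re.compile(r"circular:(\w+)").search(line).group(1) != "False"
print(thresholds, window_size, circular)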
def read(self, filename):
    df = pd.read_csv(filename, sep='\t', header=None)
    df.columns = ['filename', 'num_reads', 'estimated_fragment_length',
                  'corr', 'phantom_peak', 'corr_phantom_peak',
                  'argmin_corr', 'min_corr', 'NSC', 'RSC', 'quality_tag']
    self.df = pd.concat([self.df, df])
    self.df.reset_index(inplace=True, drop=True)
def scanner(self):
    data = {}
    # shlex removes all blank lines and splits on carriage returns;
    # strip is also applied
    rawdata = shlex.split(open(self.filename, "r"))
    for line in rawdata:
        # sometimes, IEM will store the ;;; at the end
        # so we can get [HEADER];;;;;;;;;;;
        if line.startswith('[') and "]" in line:
            line = line.strip(";").strip(",").strip()
            currentkey = line.replace("[", "").replace("]", "")
            data[currentkey] = []
        else:
            data[currentkey].append(line)

    for key in data.keys():
        data[key] = "\n".join(data[key])

    for this in ["Header", "Reads", "Settings", "Data"]:
        if this not in data.keys():
            logger.warning("%s not found in the DesignExpMiSeq file" % this)

    self.data = data
    self.df = pd.read_csv(io.StringIO(data["Data"]))

    ncols = [8, 9, 10, 12]
    if self.df.shape[1] not in ncols:
        self.df = pd.read_csv(io.StringIO(data["Data"]), sep=";")
        if self.df.shape[1] not in ncols:
            logger.warning(
                "Data section must have 10 or 12 columns. Check the samplesheet")

    # Fixes https://github.com/sequana/sequana/issues/507
    self.df["Sample_ID"] = self.df["Sample_ID"].astype(str)

    self.df.rename(columns={"I7_Index_ID": "Index1_ID",
                            "index": "Index1_Seq",
                            "I5_Index_ID": "Index2_ID",
                            "index2": "Index2_Seq"}, inplace=True)
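# Minimal sketch of the [Data] section handling above: once the section text
# has been isolated, it is parsed as plain csv through io.StringIO. The sample
# sheet content below is a made-up two-sample example, not from the source.
import io
import pandas as pd

data_section = ("Sample_ID,Sample_Name,I7_Index_ID,index\n"
                "1,sampleA,A01,ACGTACGT\n"
                "2,sampleB,A02,TGCATGCA\n")
df = pd.read_csv(io.StringIO(data_section))
df["Sample_ID"] = df["Sample_ID"].astype(str)
df = df.rename(columns={"I7_Index_ID": "Index1_ID", "index": "Index1_Seq"})
print(df)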
def read(self, filename, tag):
    if filename.endswith(".csv"):
        data = pd.read_csv(filename)
        data = data.query("reference_name not in [-1, '-1']")
    else:
        sam = PacbioIsoSeqMappedIsoforms(filename)
        mapped, data = sam.plot_sirv_by_group(tag)
    self.rawdata.append(data)
    self.labels.append(tag)
def __init__(self, filename, sep="\s*,\s*", reference=None): self.filename = filename # \s to strip the white spaces self.df = pd.read_csv(filename, sep=sep, engine="python", comment="#", dtype={"label": str}) self.reference = reference
def hist_read_length(self, bins=100, fontsize=16):
    """ """
    filename = self.getfile("correction/*gkpStore/reads.txt")
    df = pd.read_csv(filename, header=None, sep="\t")
    df[2].hist(bins=bins)
    df.columns = ["ID", 1, "read_length", 3, 4]
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("Number of reads", fontsize=fontsize)
    pylab.xlim([0, pylab.xlim()[1]])
    return df
def hist_read_length2(self, fontsize=16):
    df = pd.read_csv(self.getfile(
        "correction/2-correction/*.original-expected-corrected-length.dat"),
        sep="\t", header=None)
    pylab.clf()
    df[1].hist(bins=100, alpha=0.5, density=True, label="original")
    df[2].hist(bins=100, alpha=0.5, density=True, label="expected")
    df[3].hist(bins=100, alpha=0.5, density=True, label="corrected")
    pylab.legend()
    pylab.xlabel("read length", fontsize=fontsize)
    pylab.ylabel("number of reads", fontsize=fontsize)
    return df
def __init__(self, filename, sep="\t", **kwargs): self.df = pd.read_csv(filename, sep=sep, **kwargs) # If there is a header, let us strip it from spaces try: self.df.columns = [x.strip() for x in self.df.columns] except: pass # let us strip strings from spaces for x in self.df.columns: try: self.df[x] = self.df[x].str.strip() except: pass
def _get_table(self, pattern):
    """Extract the complete (with all comparisons) table from an RNADiff
    analysis, or the normCounts table, depending on the pattern specified.
    """
    if pattern:
        table_files = [f for f in self._table_folder.glob(pattern)]
        if len(table_files) == 0:
            raise ValueError("Found no file for your pattern {}".format(pattern))
        elif len(table_files) != 1:
            print(table_files)
            raise ValueError(
                "Found more than 1 file with the pattern {}".format(pattern))
        return pd.read_csv(table_files[0], sep=self.SEP, index_col=0)
    else:
        table_files = [f for f in self._table_folder.glob("*.xls")]
        table = [f for f in table_files if re.match(pattern, str(f))]
        if len(table) == 1 and table[0].is_file():
            return pd.read_csv(table[0], sep=self.SEP, index_col=0)
        else:
            raise IOError(
                f"Cannot find a single proper table with pattern: {pattern} "
                f"from RNADiff: {table}"
            )
def __init__(self, filename): self.filename = filename self.df = pd.read_csv(filename, sep="\t", header=None) self.df.columns = [ "type", "label", "size", "3", "4", "5", "6", "7", "8" ] name = self.df['label'].apply(lambda x: x.rsplit(":", 1)[0]) start = self.df['label'].apply( lambda x: x.rsplit(":", 1)[1].split("-")[0]) end = self.df['label'].apply( lambda x: x.rsplit(":", 1)[1].split("-")[1]) self.df['start'] = start.astype(int) self.df['end'] = end.astype(int) self.df['name'] = name
def _get_df_with_taxon(self, dbname):
    # line 14500
    # >gi|331784|gb|K01711.1|MEANPCG[331784] Measles virus (strain Edmonston), complete genome
    df = self.get_taxonomy_biokit([int(x) for x in self.taxons.index])
    df['count'] = self.taxons.values
    df.reset_index(inplace=True)
    newrow = len(df)
    df.loc[newrow] = "Unclassified"
    df.loc[newrow, 'count'] = self.unclassified
    df.loc[newrow, 'index'] = -1
    df.rename(columns={"index": "taxon"}, inplace=True)
    df["percentage"] = df["count"] / df["count"].sum() * 100

    # Now get back all annotations from the database itself.
    filename = dbname + os.sep + "annotations.csv"
    if os.path.exists(filename):
        annotations = pd.read_csv(filename)
        annotations.set_index("taxon", inplace=True)

        df2 = annotations.reindex(df.taxon)[['ena', 'gi', 'description']]
        # There are duplicates somehow. Let us keep the first one for now
        df2 = df2.reset_index().drop_duplicates(subset="taxon",
                                                keep="first").set_index("taxon")
        self.df2 = df2
        self.df1 = df.set_index("taxon")
        df = pd.merge(self.df1, df2, left_index=True, right_index=True)
        df.reset_index(inplace=True)
        starter = ['percentage', 'name', 'ena', 'taxon', "gi", 'count']
        df = df[starter + [x for x in df.columns
                           if x not in starter and x != "description"]
                + ["description"]]

        df['gi'] = [int(x) for x in df['gi'].fillna(-1)]
        from easydev import precision
        df['percentage'] = [str(precision(x, 2)) for x in df['percentage']]
    else:
        starter = ['taxon', 'count', 'percentage']
        df = df[starter + [x for x in df.columns if x not in starter]]

    df.sort_values(by="percentage", inplace=True, ascending=False)
    return df
def plot_kmer(self, bins=100):
    filename = self.getfile("correction/0-mercounts/*.ms16.histogram")
    df = pd.read_csv(filename, header=None, sep="\t")
    df.columns = ["kmer", "count", "X", "Y"]

    # Save some data
    self.data['correction']['largest mercount'] = list(df['kmer'])[-1]
    self.data['correction']['unique mers'] = df['count'][0]
    self.data['correction']['distinct mers'] = df['count'].sum()
    self.data['correction'][""] = sum(df.kmer * df['count'])

    # X is just df['count'].cumsum() / df['count'].sum() (distinct kmers)
    # Y is (df['kmer'] * df['count']).cumsum() / (df['kmer'] * df['count']).sum(),
    # that is, total kmers
    pylab.plot(df.X, df.Y, label="distinct vs total")
    pylab.grid()
    pylab.legend()
    return df
def __init__(self, path, alpha=0.05, log2_fc=0, sep=",", condition="condition"):
    """A representation of the results of a single rnadiff comparison"""
    self.path = Path(path)
    self.name = self.path.stem.replace("_degs_DESeq2", "").replace("-", "_")

    self._alpha = alpha
    self._log2_fc = log2_fc

    self.df = pd.read_csv(self.path, index_col=0, sep=sep)
    self.condition = condition

    self.filt_df = self.filter()
    self.set_gene_lists()
def __init__(self, filename, alpha=0.05, log2_fc=0, pattern="*complete*xls",
             sep="\t"):
    """.. rubric:: constructor

    :param rnadiff_results: can be a folder for a simple comparison, or an
        output file containing the results of a rnadiff analysis
    :param alpha:
    :param out_dir:
    :param log2_fc: the log2 fold change to apply
    """
    if os.path.isdir(filename):
        filenames = glob.glob(filename + "/tables/" + pattern)
        if len(filenames) == 1:
            self.filename = filenames[0]
        else:
            raise IOError("Found several files with the {} pattern. Please be "
                          "more restrictive using the pattern argument".format(pattern))
    elif os.path.exists(filename):
        # must be a 'complete' file from a rnadiff analysis
        self.filename = filename
    else:
        raise TypeError("{} does not exist".format(filename))

    # Finally, read the data itself
    self.df = pd.read_csv(self.filename, sep=sep, index_col=0)

    # Just an alias to a subset of the dataframe
    normed = [x for x in self.df.columns if x.startswith('norm')]
    self.normcounts = self.df[normed]

    # some parameters/attributes
    self._alpha = alpha
    self._log2_fc = log2_fc

    # What are the sample names and condition names
    self.sample_names = [x.replace('norm.', '') for x in normed]
    self.condition_names = set([x[0:-1] for x in self.sample_names])
    self.set_colors()

    self._set_gene_lists_one_condition()
def get_taxons_from_gis(self, gis, filename="gi_taxid_nucl.dmp"):
    filename = self.taxon_path + os.sep + filename
    data = pd.read_csv(filename, chunksize=1000000, sep='\t', header=None)
    N = 560  # with time this number will be deprecated but good for now

    local_gis = gis[:]
    # We will find the GIs in an order that differs from the input gis list,
    # so we need to keep track of the order
    found_gis = []
    taxons = [32644] * len(gis)  # 32644 means unidentified

    # we search for the unique gis. Once found, we remove them from the
    # vector and keep going until the vector is empty or there are no more
    # chunks. A good sanity check is that the final gis vector should be
    # empty, meaning all have been found. We do not care about the order
    # of the final taxons vector as compared to the GI vector
    print("Scanning %s to look for %s GI numbers" % (filename, len(gis)))
    pb = Progress(N)
    for i, chunk in enumerate(data):
        chunk.set_index(0, inplace=True)
        # reindex keeps a NaN row for GIs absent from this chunk; dropna
        # then leaves only the GIs actually found
        chunk = chunk.reindex(local_gis).dropna()

        # keep the GI and Taxon found
        found_gis.extend([int(x) for x in list(chunk.index)])

        # update the remaining GIs and the taxons
        for gi, tax in zip(chunk.index, chunk.values):
            local_gis.remove(gi)
            index = gis.index(gi)
            taxons[index] = tax

        # no need to carry on if all GIs were found
        if len(local_gis) == 0:
            break
        pb.animate(i + 1)
    print("")

    taxons = [int(x) for x in taxons]
    return taxons
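# Standalone sketch of the chunked GI -> taxon lookup above, assuming a large
# two-column tab-separated mapping file "gi_taxid_nucl.dmp" (GI, taxid) and
# made-up GI numbers. The early exit once every requested GI has been resolved
# is the point of the loop.
import pandas as pd

wanted = [12345, 67890]                  # GIs to resolve (made-up values)
taxons = {gi: 32644 for gi in wanted}    # 32644 means "unidentified"
remaining = set(wanted)

for chunk in pd.read_csv("gi_taxid_nucl.dmp", sep="\t", header=None,
                         chunksize=1000000):
    chunk = chunk.set_index(0)
    found = chunk.loc[chunk.index.isin(remaining), 1]
    for gi, tax in found.items():
        taxons[gi] = int(tax)
        remaining.discard(gi)
    if not remaining:
        break
print(taxons)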
def __init__(self, directory=".", prefix="job-*"):
    self.prefix = prefix
    self.directory = directory

    # low quality isoforms
    self.lq_isoforms = self.get_file("lq_isoforms.fastq")
    if self.lq_isoforms:
        self.lq_sequence = FastQ(self.lq_isoforms)

    # high quality isoforms
    self.hq_isoforms = self.get_file("hq_isoforms.fastq")
    if self.hq_isoforms:
        self.hq_sequence = FastQ(self.hq_isoforms)

    # General info
    self.csv = self.get_file("-file.csv")
    if self.csv:
        self.data = pd.read_csv(self.csv)

    # CCS fasta sequence
    #self.ccs = self.get_file("-ccs.tar.gz")
    self.ccs = self.get_file("ccs.fasta", noprefix=True)
    if self.ccs:
        self.ccs = FastA(self.ccs)
def _get_histogram_data(self):
    """In cutadapt logs, an adapter section contains a histogram of matches
    that starts with a header and ends with a blank line
    """
    header = 'length\tcount\texpect\tmax.err\terror counts\n'
    with open(self.input_filename, 'r') as fin:
        # not too large so let us read everything
        data = fin.readlines()

    scanning_histogram = False
    adapters = []

    current_hist = header
    dfs = {}
    if "variable 5'/3'" in "\n".join(data):
        cutadapt_mode = "b"
    else:
        cutadapt_mode = "other"

    for this in data:
        # while we have not found a new adapter histogram section,
        # we keep going
        # !! What about 5' / 3' ?
        if this.startswith("==="):
            if 'read: Adapter' in this:
                # We keep "read: Adapter" because it may be the first
                # or the second read, so to avoid confusion we keep the
                # full name for now.
                name = this.replace("First read: Adapter ", "R1_")
                name = name.replace("Second read: Adapter ", "R2_")
                name = name.strip().strip("===")
                name = name.strip()
            elif "=== Adapter" in this:
                name = this.split("=== ")[1].split(" ===")[0]
                name = name.strip()
            else:
                pass

        if scanning_histogram is False:
            if this == header:
                # we found the beginning of a histogram
                scanning_histogram = True
            else:
                # we are somewhere in the log we do not care about
                pass
        elif scanning_histogram is True and len(this.strip()) != 0:
            # accumulate the histogram data
            current_hist += this
        elif scanning_histogram is True and len(this.strip()) == 0:
            # we found the end of the histogram
            # Could be a 5'/3' case; if so, another histogram is possible
            df = pd.read_csv(io.StringIO(current_hist), sep='\t')

            # reinitialise the variables
            if cutadapt_mode != "b":
                dfs[name] = df.set_index("length")
                current_hist = header
                scanning_histogram = False
            else:
                # there will be another histogram so keep scanning
                current_hist = header
                # If we have already found a histogram, this is
                # therefore the second one.
                if name in dfs:
                    if len(df):
                        dfs[name] = pd.concat([dfs[name], df.set_index("length")])
                    scanning_histogram = False
                    dfs[name] = dfs[name].reset_index().groupby("length").aggregate(sum)
                else:
                    dfs[name] = df.set_index("length")
                    scanning_histogram = True
        else:
            pass

    return dfs
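# Tiny sketch of the histogram parsing above: once the lines between the
# "length\tcount\t..." header and the next blank line have been accumulated
# into a string, pandas parses it directly. The histogram text is made up.
import io
import pandas as pd

hist_text = ("length\tcount\texpect\tmax.err\terror counts\n"
             "3\t110\t120.5\t0\t110\n"
             "4\t35\t30.1\t0\t35\n")
df = pd.read_csv(io.StringIO(hist_text), sep="\t").set_index("length")
print(df)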
################################ IMPORT ################################################################################################
import sys

from sequana.lazy import pandas as pd
from Bio import SeqIO

################################ PARAMETERS ############################################################################################
file_input = str(sys.argv[1])
col_name_pos = str(sys.argv[2])
file_output = str(sys.argv[3])
file_genbank = str(sys.argv[4])

################################ INPUT DATA ############################################################################################
df = pd.read_csv(file_input, sep=",")
df = df.sort_values(by=col_name_pos, ascending=True)

seq_records = SeqIO.parse(file_genbank, "genbank")
record = next(seq_records)

# for each position, check if there is a CDS annotation
rec_i = 0
b = 0
e = 0

# init table of all results
result_annot = []
header_df_results = ["CDS", " start", " end", " strand", " gene_ID",
                     " gene_name", " product", " note"
def __init__(self, filename):
    self.filename = filename
    self.df = pd.read_csv(filename, header=None, sep="\t")
    self.coverage_column = 12
def read(self):
    """Read a CSV file"""
    self.df = pd.read_csv(self.filename, sep=self.sep)
alpha = 0.6
nb_bars = 15
figsize = (14, 8)
max_chars = 60
fontsize = 24

if save_output:
    title = filename_output.replace(".png", "").replace("_", " ").replace(".txt", "")
else:
    title = f_input.replace("fof_", "").replace(".txt", "")

################################ INPUT DATA ##############################################################################################
df = pd.read_csv(f_input)

################################ EXECUTE ##############################################################################################
names_prod = collections.Counter(df.loc[:, "product"])
list_names = []
list_count = []
for k in names_prod.keys():
    if (str(k) != "nan") & (k != "None"):
        list_names.append(str(k))
        list_count.append(int(names_prod[k]))
        # print(names_prod[k])

list_names_sort = [n for (c, n) in sorted(zip(list_count, list_names), reverse=True)
def __init__(self, filename):
    super(ExpDesignMiSeq, self).__init__(filename)
    self.name = "ExpDesignMiSeq"

    data = {}
    # shlex removes all blank lines and splits on carriage returns;
    # strip is also applied
    rawdata = shlex.split(open(filename, "r"))
    for line in rawdata:
        # sometimes, IEM will store the ;;; at the end
        # so we can get [HEADER];;;;;;;;;;;
        if line.startswith('[') and "]" in line:
            line = line.strip(";").strip(",").strip()
            currentkey = line.replace("[", "").replace("]", "")
            data[currentkey] = []
        else:
            data[currentkey].append(line)

    for key in data.keys():
        data[key] = "\n".join(data[key])

    for this in ["Header", "Reads", "Settings", "Data"]:
        if this not in data.keys():
            logger.warning("%s not found in the DesignExpMiSeq file" % this)

    self.data = data
    self.df = pd.read_csv(io.StringIO(data["Data"]))
    if self.df.shape[1] not in [8, 10]:
        self.df = pd.read_csv(io.StringIO(data["Data"]), sep=";")
        if self.df.shape[1] not in [8, 10]:
            logger.warning("Data section must have 8 or 10 columns. "
                           "Check the samplesheet")

    # Fixes https://github.com/sequana/sequana/issues/507
    self.df["Sample_ID"] = self.df["Sample_ID"].astype(str)

    self.df.rename(columns={"I7_Index_ID": "Index1_ID",
                            "index": "Index1_Seq",
                            "I5_Index_ID": "Index2_ID",
                            "index2": "Index2_Seq"}, inplace=True)

    # The name of the Index_ID is not standard.
    # It depends on the experimentalist because a prefix may be added.
    # One known prefix is NF. We agreed that future prefixes must end with an
    # underscore so that they can be removed. Since an ID may contain letters
    # (e.g. S501), it would be impossible otherwise to split the prefix from
    # the index.
    self.df["Index1_ID"] = self.df["Index1_ID"].apply(
        lambda x: x.replace("NF", ""))
    self.df["Index1_ID"] = self.df["Index1_ID"].apply(
        lambda x: x.split("_", 1)[-1])
    try:
        self.df["Index1_ID"] = self.df["Index1_ID"].astype(int)
    except:
        pass

    if "Index2_ID" in self.df.columns:
        self.df["Index2_ID"] = self.df["Index2_ID"].apply(
            lambda x: x.replace("NF", ""))
        self.df["Index2_ID"] = self.df["Index2_ID"].apply(
            lambda x: x.split("_", 1)[-1])
        try:
            self.df["Index2_ID"] = self.df["Index2_ID"].astype(int)
        except:
            pass

    # Figure out the type of adapters if possible
    try:
        header = self.data['Header']
        assay = [x for x in header.split('\n') if x.startswith("Assay")]
        assay = assay[0]
        items = assay.split(',')
        if items[0] == "Assay":
            self.adapter_type = assay.split(",")[1]
        else:
            items = assay.split(';')
            self.adapter_type = assay.split(";")[1]
    except:
        pass

    self.check()