def __init__(self, input_files: Union[str, list],
             outq: multiprocessing.Queue,
             read_buffer: int = 100000,
             read_counter: multiprocessing.managers.ValueProxy = None,  # e.g. Manager().Value("i", 0)
             n_subprocesses: int = 1,
             statq: multiprocessing.Queue = None) -> None:

    # Input variables
    self.input_files = input_files
    self._multifile = self._is_multifile(input_files)

    if self._multifile:
        self._input_files_pysam = [FastxFile(f) for f in self.input_files]
    else:
        self._input_files_pysam = [FastxFile(self.input_files)]

    # Multiprocessing variables
    self.outq = outq
    self.statq = statq
    self.n_subprocesses = n_subprocesses

    # Reader variables
    self.read_buffer = read_buffer
    self.read_counter = read_counter

    super(FastqReaderProcess, self).__init__()
def readfx(fastx):
    if not file_exists(fastx):
        logger.critical("File Not Found: %s" % fastx)
        raise IOError(2, "No such file:", fastx)
    fx = None
    try:
        fx = FastxFile(fastx)
        for f in fx:
            yield f.name, f.sequence, f.quality
    finally:
        if fx is not None:
            fx.close()
def split_fastx(fname, output, chunksize=10000):
    """Split records in a fasta/q file into chunks of fixed length.

    :param fname: input filename.
    :param output: output filename.
    :param chunksize: (maximum) length of output records.
    """
    with open(output, 'w') as fout:
        with FastxFile(fname, persist=False) as fin:
            for rec in fin:
                name = rec.name
                seq = rec.sequence
                qual = rec.quality
                if rec.comment is None:
                    comment = 'chunk_length={}'.format(chunksize)
                else:
                    comment = '{} chunk_length={}'.format(rec.comment, chunksize)
                if qual is None:
                    for i, s in enumerate(chunks(seq, chunksize)):
                        chunk_name = '{}_chunk{}'.format(name, i)
                        fout.write(">{} {}\n{}\n".format(
                            chunk_name, comment, ''.join(s)))
                else:
                    for i, (s, q) in enumerate(zip(chunks(seq, chunksize),
                                                   chunks(qual, chunksize))):
                        chunk_name = '{}_chunk{}'.format(name, i)
                        fout.write('@{} {}\n{}\n+\n{}\n'.format(
                            chunk_name, comment, ''.join(s), ''.join(q)))
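The snippet above relies on a `chunks` helper that is not shown. A minimal sketch under that assumption, followed by a hypothetical call:

# Minimal sketch of the assumed `chunks` helper: yield consecutive slices
# of at most `size` characters (''.join() on a string slice is a no-op).
def chunks(seq, size):
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

# Hypothetical usage: split each record of reads.fastq into 5 kb pieces.
split_fastx("reads.fastq", "reads_chunked.fastq", chunksize=5000)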
def _base_content(filename, window_size, letters, circular=False):
    # DOC: see gc_content
    fasta = FastxFile(filename)
    checker = set(letters)
    chrom_gc_content = dict()
    for chrom in fasta:
        mid = int(window_size / 2)
        # Create gc_content array
        gc_content = np.empty(len(chrom.sequence))
        gc_content[:] = np.nan
        if circular:
            chrom.sequence = (chrom.sequence[-mid:] + chrom.sequence
                              + chrom.sequence[:mid])
            # Does not shift index of array
            mid = 0
        # Count first window content
        counter = Counter(chrom.sequence[0:window_size])
        gc_count = 0
        for letter in letters:
            gc_count += counter[letter]
        gc_content[mid] = gc_count
        # Slide the window one base at a time, updating the running count
        for i in range(1, len(chrom.sequence) - window_size + 1):
            if chrom.sequence[i - 1] in checker:
                gc_count -= 1
            if chrom.sequence[i + window_size - 1] in checker:
                gc_count += 1
            gc_content[i + mid] = gc_count
        chrom_gc_content[chrom.name] = gc_content / window_size
    return chrom_gc_content
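A hypothetical call against the function above; the filename and letter set are placeholders. Positions closer than half a window to either end stay NaN unless circular=True:

# Hypothetical usage: per-position GC fraction over a 101-base window.
gc = _base_content("genome.fasta", window_size=101, letters="GCgc")
for name, values in gc.items():
    print(name, values[50], values[51])  # first defined window centres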
def parseFasta(filepath: str, ref_dict: dict, target1: str, target2: str) -> dict:
    fasta = FastxFile(filepath)
    for ref in fasta:
        if target1 == ref.name or target2 == ref.name:
            ref_dict[ref.name] = ref.sequence
    return ref_dict
def run_fastq_qc(fastq_path, output):
    qualities = list()
    mean_qualities = list()
    lengths = list()

    with FastxFile(fastq_path) as fq:
        for rec in fq:
            # ONT calculation for "mean Q score"
            quals = np.fromiter(
                (ord(x) - 33 for x in rec.quality),
                dtype=int, count=len(rec.quality))
            mean_p = np.mean(np.power(10, quals / -10))
            mean_qualities.append(-10 * np.log10(mean_p))
            # all qualities
            qualities.extend(quals)
            lengths.append(len(quals))

    with open(os.path.join(output, "base_qual.txt"), "w") as f:
        f.write("\n".join(str(q) for q in qualities))
    with open(os.path.join(output, "read_qual.txt"), "w") as f:
        f.write("\n".join(str(q) for q in mean_qualities))
    with open(os.path.join(output, "lengths.txt"), "w") as f:
        f.write("\n".join(str(l) for l in lengths))
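A worked example of the "mean Q score" arithmetic used above: Phred quality Q maps to error probability p = 10**(-Q/10), and the read-level score is the Phred transform of the mean probability, not the mean of the Q values, so one bad base drags the score down sharply:

import numpy as np

quals = np.array([30, 10, 30])      # per-base Phred scores
p = np.power(10, quals / -10)       # [0.001, 0.1, 0.001]
print(-10 * np.log10(p.mean()))     # ~14.7, well below mean(quals) == 23.3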
def _method_pysam(self, quality_file=None, *args, **kwargs):
    from pysam import FastxFile
    if quality_file is None:
        _log.warning("No quality file provided. Please use --quality-file")
        with open(self.outfile, 'w') as fastq_out:
            for seq in FastxFile(self.infile):
                fastq_out.write("@{0} {1}\n{2}\n+\n{3}\n".format(
                    seq.name, seq.comment, seq.sequence,
                    len(seq.sequence) * "I"))
    else:
        # lengths must be equal and identifiers sorted in the same order
        with open(self.outfile, "w") as fastq_out:
            for seq, qual in zip(FastxFile(self.infile),
                                 FastxFile(quality_file)):
                assert seq.name == qual.name
                fastq_out.write("@{0} {1}\n{2}\n+\n{3}\n".format(
                    seq.name, seq.comment, seq.sequence, qual.sequence))
def _method_pysam(self, *args, **kwargs):
    from pysam import FastxFile
    if self.infile[1] is None:
        _log.error("No quality file provided. Please add a quality file path")
        sys.exit(1)
    else:
        # lengths must be equal and identifiers sorted in the same order
        with open(self.outfile, "w") as fastq_out:
            for seq, qual in zip(FastxFile(self.infile[0]),
                                 FastxFile(self.infile[1])):
                assert seq.name == qual.name
                if seq.comment:
                    fastq_out.write("@{0} {1}\n{2}\n+\n{3}\n".format(
                        seq.name, seq.comment, seq.sequence, qual.sequence))
                else:
                    fastq_out.write("@{0}\n{1}\n+\n{2}\n".format(
                        seq.name, seq.sequence, qual.sequence))
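The same pairing idea as a standalone sketch, outside the class. The filenames are hypothetical, and the quality file is assumed to already hold ASCII-encoded Phred strings in FASTA layout, as the method above assumes:

from pysam import FastxFile

# Hypothetical filenames; both files must list records in the same order.
with open("out.fastq", "w") as fq, \
        FastxFile("reads.fasta") as seqs, \
        FastxFile("reads.qual.fasta") as quals:
    for seq, qual in zip(seqs, quals):
        assert seq.name == qual.name  # zip() silently truncates otherwise
        fq.write("@{0}\n{1}\n+\n{2}\n".format(
            seq.name, seq.sequence, qual.sequence))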
def get_run_index(self, fout: str = None, sort_by: str = 'start_time'):
    self.fastx_index = pandas.DataFrame(
        [extract_read_data(read) for read in FastxFile(self.fastx)])
    if sort_by:
        self.fastx_index = self.fastx_index.sort_values(sort_by)
    if fout:
        # fout is an output path, so it is annotated as str rather than bool
        self.fastx_index.to_csv(fout, sep='\t', index=False)
    return self.fastx_index
def fx_filter(fpath, ids, output, column, sep):
    """ Filter reads by external file of read header names """
    ids_df = pandas.read_csv(ids, sep=sep, header=None)
    read_ids = set(
        str(read_id) for read_id in ids_df.iloc[:, column].tolist())
    with FastxFile(fpath) as fin, \
            get_output_handle(output) as fout:
        for read in fin:
            if read.name in read_ids:
                fout.write(str(read) + "\n")
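A hypothetical call, assuming ids.txt carries the read names in its first column:

# Hypothetical usage: keep only the reads named in column 0 of ids.txt.
fx_filter("reads.fastq", "ids.txt", "filtered.fastq", column=0, sep="\t")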
def readfx(fastx):
    """FASTX file reader.

    Args:
        fastx (str): file path to fasta or fastq file; supports gzip
            compressed files

    Yields:
        tuple: The tuple consists of the read name, the sequence, and
            quality scores if the file is a fastq.

    Raises:
        IOError: If `fastx` file does not exist.
    """
    if not os.path.exists(fastx):
        raise IOError(2, "No such file:", fastx)
    fx = None
    try:
        fx = FastxFile(fastx)
        for f in fx:
            yield f.name, f.sequence, f.quality
    finally:
        if fx is not None:
            fx.close()
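A hypothetical call; for FASTA input the quality element of each tuple is None:

# Hypothetical usage: stream a (possibly gzipped) FASTQ and count bases.
total_bases = 0
for name, seq, qual in readfx("reads.fastq.gz"):
    total_bases += len(seq)
print(total_bases)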
def explain_report(filtered_analysis, sequencefile, min_repeats, jobs=1):
    """Calculate fraction of reads explainable by each motif"""
    explained_analysis = filtered_analysis.copy()
    explained_analysis["bases_explained"], total_bases = 0.0, 0
    with FastxFile(sequencefile) as fastx:

        def get_number_of_masked_positions(sequence, motifs):
            n_masked_positions_per_motif = {}
            for motif in motifs:
                positions_to_mask = set()
                motifs_pattern = get_circular_pattern(
                    motif, repeats=min_repeats,
                )
                matcher = motifs_pattern.finditer(sequence, overlapped=True)
                for match in matcher:
                    positions_to_mask |= set(range(match.start(), match.end()))
                n_masked_positions_per_motif[motif] = len(positions_to_mask)
            return n_masked_positions_per_motif, len(sequence)

        with ThreadPoolExecutor(max_workers=jobs) as pool:
            workers = [
                pool.submit(
                    get_number_of_masked_positions,
                    entry.sequence,
                    set(filtered_analysis["motif"]),
                )
                for entry in fastx
            ]
            iterator = progressbar(
                as_completed(workers), total=len(workers),
                desc="Calculating fractions", unit="read",
            )
            for worker in iterator:
                n_masked_positions_per_motif, total_seq_bases = worker.result()
                for motif, n_pos in n_masked_positions_per_motif.items():
                    indexer = (
                        explained_analysis["motif"] == motif,
                        "bases_explained",
                    )
                    explained_analysis.loc[indexer] += n_pos
                total_bases += total_seq_bases
    return explained_analysis, total_bases
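The masking step above depends on overlapped matching, which the stdlib re module does not provide. A minimal illustration with the third-party regex package; the get_circular_pattern helper is not shown here, so a plain tandem-repeat pattern stands in for it:

import regex  # third-party "regex" package, not stdlib "re"

# Stand-in for get_circular_pattern: at least two tandem copies of "GA".
pattern = regex.compile("(?:GA){2,}")
positions_to_mask = set()
for match in pattern.finditer("TGAGAGAGAT", overlapped=True):
    positions_to_mask |= set(range(match.start(), match.end()))
print(sorted(positions_to_mask))  # [1, 2, 3, 4, 5, 6, 7, 8]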
def main(references: List[str], accession2taxid: str, output: str):
    """Write out a tsv file mapping reference names to taxids."""
    accession2taxid_df = pd.read_csv(
        accession2taxid, sep='\t', header=0, index_col=1)
    with open(output, 'w') as output_tsv:
        for ref in references:
            with FastxFile(ref) as fh:
                for entry in fh:
                    try:
                        taxid = accession2taxid_df.at[entry.name, 'taxid']
                    except KeyError:
                        print("Error: couldn't find taxid for {}".format(
                            entry.name))
                        sys.exit(1)
                    output_tsv.write('{name}\t{taxid}\n'.format(
                        name=entry.name, taxid=taxid))
def get_output_handle(fpath: str, fastx: bool = False, out: bool = True):
    if fpath == "-":
        if out:
            handle = sys.stdout
        else:
            handle = sys.stdin
    else:
        p = Path(fpath)
        if not p.parent.is_dir():
            raise NotADirectoryError(
                "Directory specified for output file does not exist: {}".format(
                    p.parent))
        if fastx:
            handle = FastxFile(p)
        else:
            handle = p.open("w")
    return handle
def _open_dataset(self):
    return FastxFile(self._urlpath)
class FastA(object):
    """Class to handle FastA files. Cannot be compressed"""

    def __init__(self, filename, verbose=False):
        if filename.endswith(".gz"):
            raise ValueError("Must be decompressed.")
        self._fasta = FastxFile(filename)
        self.filename = filename
        logger.info("Reading input fasta file...please wait")
        self._N = len([x for x in FastxFile(filename)])

    def __iter__(self):
        return self

    def __next__(self):  # python 3
        return self.next()

    def next(self):  # python 2
        try:
            d = next(self._fasta)
            return d
        except KeyboardInterrupt:
            # Allow developers to interrupt a loop over the reads that
            # would otherwise take too long or run forever
            self._fasta.close()
            self._fasta = FastxFile(self._fasta.filename)
        except:
            self._fasta.close()
            self._fasta = FastxFile(self._fasta.filename)
            raise StopIteration

    def __len__(self):
        return self._N

    def _get_names(self):
        return [this.name for this in self]
    names = property(_get_names)

    def _get_sequences(self):
        return [this.sequence for this in self]
    sequences = property(_get_sequences)

    def _get_comment(self):
        return [this.comment for this in self]
    comments = property(_get_comment)

    def _get_lengths(self):
        return [len(this.sequence) for this in self]
    lengths = property(_get_lengths)

    def get_lengths_as_dict(self):
        return dict(zip(self.names, self.lengths))

    def format_contigs_denovo(self, output_file, len_min=500):
        """Replace NODE with the project name and remove contigs with a
        length lower than len_min.

        :param str output_file: output file name.
        :param int len_min: minimal length of contigs.

        Example:

            from sequana import FastA
            contigs = FastA("denovo_assembly.fasta")
            contigs.format_contigs_denovo("path/to/file.fasta", len_min=500)

        Results are stored in "path/to/file.fasta".
        """
        # catch basename of file without extension
        project = os.path.basename(output_file).split(".")[0]
        # check if directory exists
        output_dir = os.path.dirname(output_file)
        try:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
        except FileNotFoundError:
            pass
        n = 1
        with open(output_file, "w") as fp:
            for contigs in self:
                if len(contigs.sequence) < len_min:
                    break
                name = ">{}_{} {}\n".format(project, n, contigs.name)
                sequence = "\n".join(
                    [contigs.sequence[i:min(i + 80, len(contigs.sequence))]
                     for i in range(0, len(contigs.sequence), 80)]) + "\n"
                fp.write(name + sequence)
                n += 1

    def select_random_reads(self, N=None, output_filename="random.fasta"):
        """Select random reads and save in a file

        :param int N: number of random unique reads to select; a set or
            list of indices can be used as well.
        :param str output_filename:
        """
        import numpy as np
        thisN = len(self)
        if isinstance(N, int):
            if N > thisN:
                N = thisN
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient iteration
            cherries = set(cherries[0:N])
        elif isinstance(N, set):
            cherries = N
        elif isinstance(N, list):
            cherries = set(N)
        fasta = FastxFile(self.filename)
        pb = Progress(thisN)  # since we scan the entire file
        with open(output_filename, "w") as fh:
            for i, read in enumerate(fasta):
                if i in cherries:
                    fh.write(read.__str__() + "\n")
                pb.animate(i + 1)
        return cherries

    def get_stats(self):
        from pylab import mean
        stats = {}
        stats["N"] = len(self.sequences)
        stats["mean_length"] = mean(self.lengths)
        return stats
logger.warning(f"Sample {sample} is not in the VCF [skipping]") else: s.add(sample) if len(s) == 0: logger.warning( "No valid samples found in {} - using all samples in VCF", samples_fname ) else: samples = s else: logger.info("Using all samples in VCF") logger.info(f"Loaded {len(samples)} samples") logger.info("Determining which strand each gene is on...") strand: Dict[str, str] = dict() for entry in FastxFile(fasta_ref): for field in entry.comment.rstrip().split(): if field.startswith("strand"): strand[entry.name] = field[7] if entry.name not in strand: raise ValueError(f"Couldn't find strand for {entry.name}") logger.info("Extracting consensus sequences with bcftools consensus...") with TemporaryDirectory() as tmpdirname: for sample in samples: outname = Path(tmpdirname) / f"{sample}.fa" args = ( "bcftools", "consensus", "-s", sample,
def get_seq_lens(fastx):
    """Get sequence lengths from fastx file"""
    return [len(r.sequence) for r in FastxFile(fastx)]
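A hypothetical call for a quick length summary:

# Hypothetical usage: summarise read lengths of a FASTQ.
lens = get_seq_lens("reads.fastq")
print(len(lens), min(lens), max(lens), sum(lens) / len(lens))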
def sequence_cleaner(fasta_q_file, min_length=0, percentage_n=100.0,
                     concatenate_duplicates=True, remove_ambiguous=False):
    """Read FASTA/FASTQ file and clean the file.

    Args:
        fasta_q_file (str): Path to FASTA/Q file.
        min_length (int): Sequences with length <= min_length are removed
            (default=0 - allows all lengths).
        percentage_n (float): % of N allowed (default=100).
        concatenate_duplicates (bool): Remove duplicates and keep one
            sequence (default=True).
        remove_ambiguous (bool): Remove any sequence with an ambiguous base
            (default=False).

    Returns:
        collections.defaultdict: Hash with clean sequences.
        int: # Sequences Processed.
        int: # Repeated Sequences.
        int: # Repeated Sequences (Reverse Complement).
        int: # Short Sequences.
        int: # High N Sequences.
    """
    hash_sequences = defaultdict(list)

    total_sequences_processed = 0
    total_repeated_sequences = 0
    total_repeated_sequences_rc = 0
    total_short_sequences = 0
    total_high_n_sequences = 0

    with FastxFile(fasta_q_file) as fh:
        for entry in fh:
            total_sequences_processed += 1

            sequence_id = entry.name
            sequence = entry.sequence.upper()

            found_ambiguous = False
            if remove_ambiguous:
                for base in sequence:
                    # found ambiguous base. Sequence is skipped
                    if base in AMBIGUOUS_BASES:
                        found_ambiguous = True
                        break

            if not found_ambiguous:
                # remove sequences that are shorter than or equal to `min_length`
                if len(sequence) <= min_length:
                    total_short_sequences += 1
                    continue
                # remove sequences that do not meet the allowed % of N
                elif (float(sequence.count("N")) /
                        float(len(sequence))) * 100 > percentage_n:
                    total_high_n_sequences += 1
                    continue
                elif concatenate_duplicates:
                    # repeated sequence - add sequence ID to hash
                    if sequence in hash_sequences:
                        hash_sequences[sequence].append(sequence_id)
                        total_repeated_sequences += 1
                    else:
                        rc = reverse_complement(sequence)
                        # check if the reverse complement is already in the
                        # hash; if so, add a modified ID and flag that the
                        # reverse complement was repeated
                        if rc in hash_sequences:
                            hash_sequences[rc].append("{}_RC".format(sequence_id))
                            total_repeated_sequences += 1
                            total_repeated_sequences_rc += 1
                        # otherwise this is the first time the sequence is seen
                        else:
                            hash_sequences[sequence].append(sequence_id)
                else:
                    hash_sequences[sequence_id].append(sequence)

    return (hash_sequences, total_sequences_processed,
            total_repeated_sequences, total_repeated_sequences_rc,
            total_short_sequences, total_high_n_sequences)
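A hypothetical call: keep reads of at least 50 bp with no more than 10% N, collapsing exact and reverse-complement duplicates into one entry each:

# Hypothetical usage of sequence_cleaner on a FASTQ file.
(clean, n_total, n_dup, n_dup_rc,
 n_short, n_high_n) = sequence_cleaner(
    "reads.fastq", min_length=49, percentage_n=10.0)
print(n_total, "processed;", len(clean), "kept")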
class FastA(object):
    """Class to handle FastA files. Cannot be compressed"""

    def __init__(self, filename, verbose=False):
        if filename.endswith(".gz"):  # pragma: no cover
            raise ValueError("Must be decompressed.")
        self._fasta = FastxFile(filename)
        self.filename = filename
        self._N = None

    def __iter__(self):
        return self

    def __next__(self):  # python 3
        return self.next()

    def next(self):  # python 2
        try:
            d = next(self._fasta)
            return d
        except KeyboardInterrupt:  # pragma: no cover
            # Allow developers to interrupt a loop over the reads that
            # would otherwise take too long or run forever
            self._fasta.close()
            self._fasta = FastxFile(self._fasta.filename)
        except:
            self._fasta.close()
            self._fasta = FastxFile(self._fasta.filename)
            raise StopIteration

    def __len__(self):
        if self._N is None:
            logger.info("Reading input fasta file...please wait")
            self._N = len([x for x in FastxFile(self.filename)])
        return self._N

    def _get_names(self):
        return [this.name for this in self]
    names = property(_get_names)

    def _get_sequences(self):
        return [this.sequence for this in self]
    sequences = property(_get_sequences)

    def _get_comment(self):
        return [this.comment for this in self]
    comments = property(_get_comment)

    def _get_lengths(self):
        return [len(this.sequence) for this in self]
    lengths = property(_get_lengths)

    def get_lengths_as_dict(self):
        return dict(zip(self.names, self.lengths))

    def format_contigs_denovo(self, output_file, len_min=500):
        """Replace NODE with the project name and remove contigs with a
        length lower than len_min.

        :param str output_file: output file name.
        :param int len_min: minimal length of contigs.

        Example:

            from sequana import FastA
            contigs = FastA("denovo_assembly.fasta")
            contigs.format_contigs_denovo("path/to/file.fasta", len_min=500)

        Results are stored in "path/to/file.fasta".
        """
        # catch basename of file without extension
        project = os.path.basename(output_file).split(".")[0]
        # check if directory exists
        output_dir = os.path.dirname(output_file)
        try:
            if not os.path.exists(output_dir):  # pragma: no cover
                os.makedirs(output_dir)
        except FileNotFoundError:  # pragma: no cover
            pass
        n = 1
        with open(output_file, "w") as fp:
            for contigs in self:
                if len(contigs.sequence) < len_min:
                    break
                name = ">{}_{} {}\n".format(project, n, contigs.name)
                sequence = "\n".join([
                    contigs.sequence[i:min(i + 80, len(contigs.sequence))]
                    for i in range(0, len(contigs.sequence), 80)
                ]) + "\n"
                fp.write(name + sequence)
                n += 1

    def filter(self, output_filename, names_to_keep=None,
               names_to_exclude=None):
        if names_to_exclude is None and names_to_keep is None:  # pragma: no cover
            logger.warning("No ids provided")
            return
        if names_to_exclude:
            with open(self.filename) as fin:
                with open(output_filename, "w") as fout:
                    skip = False
                    # iterate line by line; readlines may be faster but
                    # could cause memory issues on large files
                    for line in fin:
                        if line.startswith(">"):
                            if line[1:].split()[0] in names_to_exclude:
                                skip = True
                            else:
                                skip = False
                        if skip is False:
                            fout.write(line)
        elif names_to_keep:
            with open(self.filename) as fin:
                with open(output_filename, "w") as fout:
                    # iterate line by line; readlines may be faster but
                    # could cause memory issues on large files
                    skip = True
                    for line in fin:
                        if line.startswith(">"):
                            if line[1:].split()[0] in names_to_keep:
                                skip = False
                            else:
                                skip = True
                        if skip is False:
                            fout.write(line)

    def select_random_reads(self, N=None, output_filename="random.fasta"):
        """Select random reads and save in a file

        :param int N: number of random unique reads to select; a set or
            list of indices can be used as well.
        :param str output_filename:
        """
        import numpy as np
        thisN = len(self)
        if isinstance(N, int):
            if N > thisN:
                N = thisN
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient iteration
            cherries = set(cherries[0:N])
        elif isinstance(N, set):
            cherries = N
        elif isinstance(N, list):
            cherries = set(N)
        fasta = FastxFile(self.filename)
        pb = Progress(thisN)  # since we scan the entire file
        with open(output_filename, "w") as fh:
            for i, read in enumerate(fasta):
                if i in cherries:
                    fh.write(read.__str__() + "\n")
                pb.animate(i + 1)
        return cherries

    def get_stats(self):
        from pylab import mean
        stats = {}
        stats["N"] = len(self.sequences)
        stats["mean_length"] = mean(self.lengths)
        stats["total_length"] = sum(self.lengths)
        from sequana.stats import N50, L50
        stats["N50"] = N50(self.lengths)
        stats["L50"] = L50(self.lengths)
        stats["min_length"] = min(self.lengths)
        stats["max_length"] = max(self.lengths)
        return stats

    def summary(self, max_contigs=-1):
        from pylab import mean, argmax
        # used by sequana summary fasta
        summary = {"number_of_contigs": len(self.sequences)}
        summary["total_contigs_length"] = sum(self.lengths)
        summary["mean_contig_length"] = mean(self.lengths)
        summary["max_contig_length"] = max(self.lengths)
        summary["min_contig_length"] = min(self.lengths)
        N = 0
        lengths = self.lengths[:]
        positions = list(range(len(lengths)))
        stats = self.get_stats()
        print("#sample_name: {}".format(self.filename))
        print("#total length: {}".format(stats['total_length']))
        print("#N50: {}".format(stats['N50']))
        print("#Ncontig: {}".format(stats['N']))
        print("#L50: {}".format(stats['L50']))
        print("#max_contig_length: {}".format(stats['max_length']))
        print("#min_contig_length: {}".format(stats['min_length']))
        print("#mean_contig_length: {}".format(stats['mean_length']))
        print("contig name,length,count A,C,G,T,N")
        if max_contigs == -1:
            max_contigs = len(lengths) + 1
        while lengths and N < max_contigs:
            N += 1
            index = argmax(lengths)
            length = lengths.pop(index)
            position = positions.pop(index)
            sequence = self.sequences[position]
            name = self.names[position]
            print("{},{},{},{},{},{},{}".format(
                name, length,
                sequence.count('A'), sequence.count('C'),
                sequence.count('G'), sequence.count('T'),
                sequence.count('N')))

    def GC_content_sequence(self, sequence):
        GC = sequence.count('G') + sequence.count('g')
        GC += sequence.count('C') + sequence.count('c')
        return GC / len(sequence) * 100

    def GC_content(self):
        lengths = sum(self.lengths)
        GC = 0
        for seq in self.sequences:
            GC += seq.count('G') + seq.count('g')
            GC += seq.count('C') + seq.count('c')
        return GC / lengths * 100

    def reverse_and_save(self, filename):
        with open(filename, "w") as fout:
            for read in self:
                fout.write(">{}\t{}\n{}\n".format(
                    read.name, read.comment, read.sequence[::-1]))

    def save_ctg_to_fasta(self, ctgname, outname, max_length=-1):
        index = self.names.index(ctgname)
        with open("{}.fa".format(outname), "w") as fout:
            if max_length == -1:
                fout.write(">{}\n{}".format(outname, self.sequences[index]))
            else:
                fout.write(">{}\n{}".format(
                    outname, self.sequences[index][0:max_length]))

    def to_fasta(self, outfile, width=80):
        """Save the input FastA file into a new file

        The interest of this method is to wrap the sequence into 80
        characters. This is useful if the input file is not formatted
        correctly.
        """
        import textwrap
        with open(outfile, "w") as fout:
            for name, comment, seq in zip(self.names, self.comments,
                                          self.sequences):
                seq = "\n".join(textwrap.wrap(seq, width))
                if comment is None:
                    fout.write(">{}\n{}\n".format(name, seq))
                else:
                    fout.write(">{}\t{}\n{}\n".format(name, comment, seq))

    def to_igv_chrom_size(self, output):
        data = self.get_lengths_as_dict()
        with open(output, "w") as fout:
            for k, v in data.items():
                fout.write("{}\t{}\n".format(k, v))
def __init__(self, filename, verbose=False): if filename.endswith(".gz"): #pragma: no cover raise ValueError("Must be decompressed.") self._fasta = FastxFile(filename) self.filename = filename self._N = None
class FastA(object):
    """Class to handle FastA files. Cannot be compressed"""

    def __init__(self, filename, verbose=False):
        if filename.endswith(".gz"):
            raise ValueError("Must be decompressed.")
        self._fasta = FastxFile(filename)
        self._N = len([x for x in FastxFile(filename)])

    def __iter__(self):
        return self

    def __next__(self):  # python 3
        return self.next()

    def next(self):  # python 2
        try:
            d = next(self._fasta)
            return d
        except KeyboardInterrupt:
            # Allow developers to interrupt a loop over the reads that
            # would otherwise take too long or run forever
            self._fasta.close()
            self._fasta = FastxFile(self._fasta.filename)
        except:
            self._fasta.close()
            self._fasta = FastxFile(self._fasta.filename)
            raise StopIteration

    def __len__(self):
        return self._N

    def _get_names(self):
        return [this.name for this in self]
    names = property(_get_names)

    def _get_sequences(self):
        return [this.sequence for this in self]
    sequences = property(_get_sequences)

    def _get_comment(self):
        return [this.comment for this in self]
    comments = property(_get_comment)

    def _get_lengths(self):
        return [len(this.sequence) for this in self]
    lengths = property(_get_lengths)

    def format_contigs_denovo(self, output_file, len_min=500):
        """Replace NODE with the project name and remove contigs with a
        length lower than len_min.

        :param str output_file: output file name.
        :param int len_min: minimal length of contigs.

        Example:

            from sequana import FastA
            contigs = FastA("denovo_assembly.fasta")
            contigs.format_contigs_denovo("path/to/file.fasta", len_min=500)

        Results are stored in "path/to/file.fasta".
        """
        # catch basename of file without extension
        project = os.path.basename(output_file).split(".")[0]
        # check if directory exists
        output_dir = os.path.dirname(output_file)
        try:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
        except FileNotFoundError:
            pass
        n = 1
        with open(output_file, "w") as fp:
            for contigs in self:
                if len(contigs.sequence) < len_min:
                    break
                name = ">{}_{} {}\n".format(project, n, contigs.name)
                sequence = "\n".join([
                    contigs.sequence[i:min(i + 80, len(contigs.sequence))]
                    for i in range(0, len(contigs.sequence), 80)
                ]) + "\n"
                fp.write(name + sequence)
                n += 1
def __init__(self, filename, verbose=False): if filename.endswith(".gz"): raise ValueError("Must be decompressed.") self._fasta = FastxFile(filename) self._N = len([x for x in FastxFile(filename)])