コード例 #1
0
ファイル: fasta.py プロジェクト: wenliangz/sequana
 def __init__(self, filename, verbose=False):
     """Open an uncompressed FASTA file and count its records.

     :param str filename: path to the FASTA file; a name ending in
         ``.gz`` is rejected.
     :param bool verbose: currently unused by this method.
     :raises ValueError: if *filename* ends with ``.gz``.
     """
     if filename.endswith(".gz"):
         raise ValueError("Must be decompressed.")
     self._fasta = FastxFile(filename)
     self.filename = filename
     logger.info("Reading input fasta file...please wait")
     # Count with a second, short-lived reader so the main reader stays on
     # the first record. Records are streamed rather than collected in a
     # list, and the extra handle is closed as soon as the count is known.
     with FastxFile(filename) as counter:
         self._N = sum(1 for _ in counter)
コード例 #2
0
    def __init__(self,
                 input_files: Union[str, list],
                 outq: multiprocessing.Queue,
                 read_buffer: int = 100000,
                 read_counter: "multiprocessing.managers.ValueProxy" = None,
                 n_subprocesses: int = 1,
                 statq: multiprocessing.Queue = None) -> None:
        """Open the input file(s) and store the queues and settings.

        :param input_files: a single path, or a collection of paths when
            ``self._is_multifile`` reports a multi-file input.
        :param outq: queue stored as ``self.outq``.
        :param read_buffer: stored as ``self.read_buffer`` (presumably the
            number of reads per batch - confirm against the run loop).
        :param read_counter: optional shared counter stored as
            ``self.read_counter``.
        :param n_subprocesses: stored as ``self.n_subprocesses``.
        :param statq: optional queue stored as ``self.statq``.
        """
        # NOTE: the annotation of ``read_counter`` is a string on purpose.
        # Annotations are evaluated when the function is defined, so the
        # previous ``multiprocessing.Manager().Value`` started a manager
        # process as a side effect of merely importing the module.

        # Input variables
        self.input_files = input_files
        self._multifile = self._is_multifile(input_files)

        # Always keep a list of readers, even for a single input file.
        paths = self.input_files if self._multifile else [self.input_files]
        self._input_files_pysam = [FastxFile(path) for path in paths]

        # Multiprocessing variables
        self.outq = outq
        self.statq = statq
        self.n_subprocesses = n_subprocesses

        # Reader variables
        self.read_buffer = read_buffer
        self.read_counter = read_counter

        super(FastqReaderProcess, self).__init__()
コード例 #3
0
ファイル: recruit.py プロジェクト: BigelowLab/viruscope
def readfx(fastx):
    """Yield ``(name, sequence, quality)`` for every record of a FASTX file.

    This is a generator, so the existence check and the file opening only
    happen once the first item is requested.

    :param fastx: path to the FASTA/FASTQ file.
    :raises IOError: if ``file_exists`` reports that the path is missing.
    """
    if not file_exists(fastx):
        logger.critical("File Not Found: %s" % fastx)
        raise IOError(2, "No such file:", fastx)

    # Placeholder that is falsy, so the cleanup below is skipped when the
    # reader could not be created.
    reader = ""
    try:
        reader = FastxFile(fastx)
        for record in reader:
            yield record.name, record.sequence, record.quality
    finally:
        if reader:
            reader.close()
コード例 #4
0
ファイル: fasta.py プロジェクト: sequana/sequana
 def next(self):  # python 2
     """Return the next record of the underlying reader.

     When the reader is exhausted it is reopened, so the object can be
     iterated again, and ``StopIteration`` is raised.

     :raises StopIteration: when no record is left, or when reading the
         next record fails.
     :raises KeyboardInterrupt: re-raised after the reader has been rewound.
     """
     try:
         return next(self._fasta)
     except KeyboardInterrupt:  #pragma: no cover
         # Rewind, then let the interrupt propagate so the caller's loop
         # really stops. Swallowing it here handed ``None`` to the caller as
         # if it were a record and restarted the iteration from the top.
         self._fasta.close()
         self._fasta = FastxFile(self._fasta.filename)
         raise
     except Exception:
         # End of file (or a read error, kept as best-effort behavior):
         # rewind and signal exhaustion. ``Exception`` instead of a bare
         # ``except`` keeps SystemExit and GeneratorExit untouched.
         self._fasta.close()
         self._fasta = FastxFile(self._fasta.filename)
         raise StopIteration
コード例 #5
0
ファイル: fasta.py プロジェクト: sequana/sequana
 def next(self): # python 2
     """Return the next record of the underlying reader.

     When the reader is exhausted it is reopened, so the object can be
     iterated again, and ``StopIteration`` is raised.

     :raises StopIteration: when no record is left, or when reading the
         next record fails.
     :raises KeyboardInterrupt: re-raised after the reader has been rewound.
     """
     try:
         return next(self._fasta)
     except KeyboardInterrupt:
         # Rewind, then let the interrupt propagate so the caller's loop
         # really stops. Falling through to a ``return d`` here failed with
         # UnboundLocalError because ``d`` was never assigned.
         self._fasta.close()
         self._fasta = FastxFile(self._fasta.filename)
         raise
     except Exception:
         # End of file (or a read error, kept as best-effort behavior):
         # rewind and signal exhaustion. ``Exception`` instead of a bare
         # ``except`` keeps SystemExit and GeneratorExit untouched.
         self._fasta.close()
         self._fasta = FastxFile(self._fasta.filename)
         raise StopIteration
コード例 #6
0
def split_fastx(fname, output, chunksize=10000):
    """Split records in a fasta/q into fixed lengths.

    Each input record is cut into pieces of at most ``chunksize`` characters
    by the ``chunks`` helper; piece ``i`` of record ``name`` is written as
    ``name_chunk<i>``. Records without qualities are written as FASTA,
    records with qualities as four-line FASTQ.

    :param fname: input filename.
    :param output: output filename.
    :param chunksize: (maximum) length of output records.
    """
    with open(output, 'w') as fout, FastxFile(fname, persist=False) as fin:
        for rec in fin:
            name = rec.name
            if rec.comment is None:
                comment = 'chunk_length={}'.format(chunksize)
            else:
                comment = '{} chunk_length={}'.format(rec.comment, chunksize)

            seq_chunks = chunks(rec.sequence, chunksize)
            if rec.quality is None:
                for i, s in enumerate(seq_chunks):
                    chunk_name = '{}_chunk{}'.format(name, i)
                    fout.write(">{} {}\n{}\n".format(
                        chunk_name, comment, ''.join(s)))
            else:
                qual_chunks = chunks(rec.quality, chunksize)
                for i, (s, q) in enumerate(zip(seq_chunks, qual_chunks)):
                    chunk_name = '{}_chunk{}'.format(name, i)
                    # FASTQ needs the '+' separator on its own line with the
                    # quality string on the following line; writing the
                    # quality directly after '+' produced unreadable records.
                    fout.write('@{} {}\n{}\n+\n{}\n'.format(
                        chunk_name, comment, ''.join(s), ''.join(q)))
コード例 #7
0
def _base_content(filename, window_size, letters, circular=False):
    """Sliding-window fraction of ``letters`` for every record of a file.

    For each record a window of ``window_size`` characters is moved along
    the sequence one position at a time, and the number of characters that
    belong to ``letters`` is divided by ``window_size``. Positions for which
    no window is computed stay NaN.

    :param filename: path given to ``FastxFile``.
    :param window_size: width of the sliding window.
    :param letters: characters to count.
    :param circular: if True, the sequence is extended with its own tail and
        head (half a window each) before counting, and results are stored
        without the half-window shift.
    :return: dict mapping each record name to a numpy array with one value
        per position of the original sequence.
    """
    # DOC: see gc_content
    reader = FastxFile(filename)
    tracked = set(letters)
    content_by_chrom = dict()
    for chrom in reader:
        offset = int(window_size / 2)
        # One slot per base of the original sequence, NaN until filled
        profile = np.empty(len(chrom.sequence))
        profile[:] = np.nan
        if circular:
            chrom.sequence = (chrom.sequence[-offset:] + chrom.sequence +
                              chrom.sequence[:offset])
            # The padding already centres the windows: no index shift
            offset = 0
        seq = chrom.sequence

        # Content of the first window
        first_window = Counter(seq[0:window_size])
        running = sum(first_window[letter] for letter in letters)
        profile[offset] = running

        # Slide by one: drop the base leaving the window, add the one entering
        for start in range(1, len(seq) - window_size + 1):
            if seq[start - 1] in tracked:
                running -= 1
            if seq[start + window_size - 1] in tracked:
                running += 1
            profile[start + offset] = running
        content_by_chrom[chrom.name] = profile / window_size
    return content_by_chrom
コード例 #8
0
def parseFasta(filepath: str, ref_dict: dict, target1: str,
               target2: str) -> dict:
    """Add the sequences of two named records to ``ref_dict``.

    :param filepath: path of the FASTA/FASTQ file to scan.
    :param ref_dict: dictionary updated in place with ``name -> sequence``.
    :param target1: name of the first record to collect.
    :param target2: name of the second record to collect.
    :return: the same ``ref_dict`` object that was passed in.
    """
    # The context manager closes the reader once the scan is finished
    # instead of leaving the handle open until garbage collection.
    with FastxFile(filepath) as fasta:
        for ref in fasta:
            if target1 == ref.name or target2 == ref.name:
                ref_dict[ref.name] = ref.sequence
    return ref_dict
コード例 #9
0
def run_fastq_qc(fastq_path, output):
    """Write per-base qualities, per-read mean qualities and read lengths.

    Three text files with one value per line are written into ``output``:
    ``base_qual.txt``, ``read_qual.txt`` and ``lengths.txt``.

    :param fastq_path: path of the FASTQ file to analyse.
    :param output: directory that receives the three result files.
    """
    qualities = list()
    mean_qualities = list()
    lengths = list()

    with FastxFile(fastq_path) as fq:
        for rec in fq:
            # ONT calculation for "mean Q score": average the error
            # probabilities, then convert the average back to a Q value.
            # Quality characters are decoded with an offset of 33.
            quals = np.fromiter((ord(x) - 33 for x in rec.quality),
                                dtype=int,
                                count=len(rec.quality))
            mean_p = np.mean(np.power(10, quals / -10))
            mean_qualities.append(-10 * np.log10(mean_p))
            # all qualities
            qualities.extend(quals)
            lengths.append(len(quals))

    reports = (
        ("base_qual.txt", qualities),
        ("read_qual.txt", mean_qualities),
        ("lengths.txt", lengths),
    )
    for fname, values in reports:
        # The files must be opened for writing: the default read mode made
        # every ``write`` call fail.
        with open(os.path.join(output, fname), "w") as f:
            f.write("\n".join(str(value) for value in values))
コード例 #10
0
ファイル: fasta2fastq.py プロジェクト: sinamomken/bioconvert
 def _method_pysam(self, quality_file=None, *args, **kwargs):
     """Convert ``self.infile`` (FASTA) into ``self.outfile`` (FASTQ).

     Without a quality file every base gets the quality character ``I``.
     With a quality file, sequence and quality records are paired in file
     order, so both files must list the same identifiers in the same order.

     :param quality_file: optional path to the file holding the qualities.
     :raises AssertionError: if a paired sequence and quality record have
         different names.
     """
     from pysam import FastxFile

     def header(record):
         # A record without comment has ``comment`` unset; leave it out
         # rather than printing the literal text "None" after the name.
         if record.comment:
             return "@{0} {1}".format(record.name, record.comment)
         return "@{0}".format(record.name)

     if quality_file is None:
         _log.warning("No quality file provided. Please use --quality-file")
         with open(self.outfile, 'w') as fastq_out, \
                 FastxFile(self.infile) as sequences:
             for seq in sequences:
                 fastq_out.write("{0}\n{1}\n+\n{2}\n".format(
                     header(seq), seq.sequence, len(seq.sequence) * "I"))
     else:  # length must be equal and identifiers sorted similarly
         with open(self.outfile, "w") as fastq_out, \
                 FastxFile(self.infile) as sequences, \
                 FastxFile(quality_file) as qualities:
             for seq, qual in zip(sequences, qualities):
                 assert seq.name == qual.name
                 fastq_out.write("{0}\n{1}\n+\n{2}\n".format(
                     header(seq), seq.sequence, qual.sequence))
コード例 #11
0
ファイル: fasta.py プロジェクト: sequana/sequana
    def select_random_reads(self, N=None, output_filename="random.fasta"):
        """Select random reads and save in a file

        :param int N: number of random unique reads to select
            should provide a number but a list can be used as well.
            A set or list is taken as the positions (0-based, in file
            order) of the reads to keep.
        :param str output_filename:
        :return: the set of selected read positions.
        :raises TypeError: if N is not an int, a set or a list.
        """
        import numpy as np
        thisN = len(self)
        if isinstance(N, int):
            N = min(N, thisN)
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient iteration
            cherries = set(cherries[0:N])
        elif isinstance(N, set):
            cherries = N
        elif isinstance(N, list):
            cherries = set(N)
        else:
            # Fail before touching the output file; previously the default
            # N=None crashed on an unbound variable after the file had
            # already been truncated.
            raise TypeError(
                "N must be an int, a set or a list of read positions")

        pb = Progress(thisN)  # since we scan the entire file
        with FastxFile(self.filename) as fasta, \
                open(output_filename, "w") as fh:
            for i, read in enumerate(fasta):
                if i in cherries:
                    fh.write(str(read) + "\n")
                pb.animate(i + 1)
        return cherries
コード例 #12
0
    def _method_pysam(self, *args, **kwargs):
        """Merge a sequence file and a quality file into a FASTQ file.

        ``self.infile[0]`` is the sequence file and ``self.infile[1]`` the
        quality file. Records are paired in file order, so both files must
        list the same identifiers in the same order. The result is written
        to ``self.outfile``. The process exits with status 1 when no quality
        file is given.

        :raises AssertionError: if a paired sequence and quality record have
            different names.
        """
        from pysam import FastxFile
        if self.infile[1] is None:
            _log.error(
                "No quality file provided. Please add a quality file path ")
            sys.exit(1)

        # length must be equal and identifiers sorted similarly
        # All three handles are closed when the block exits, including the
        # two readers that were previously left open.
        with open(self.outfile, "w") as fastq_out, \
                FastxFile(self.infile[0]) as sequences, \
                FastxFile(self.infile[1]) as qualities:
            for seq, qual in zip(sequences, qualities):
                assert seq.name == qual.name
                if seq.comment:
                    fastq_out.write("@{0} {1}\n{2}\n+\n{3}\n".format(
                        seq.name, seq.comment, seq.sequence,
                        qual.sequence))
                else:
                    fastq_out.write("@{0}\n{1}\n+\n{2}\n".format(
                        seq.name, seq.sequence, qual.sequence))
コード例 #13
0
ファイル: utils.py プロジェクト: Wytamma/sketchy
    def get_run_index(self, fout: bool = False, sort_by: str = 'start_time'):
        """Build a table with one row per read of ``self.fastx``.

        Each read is converted by ``extract_read_data`` and the rows are
        stored as a DataFrame in ``self.fastx_index``.

        :param fout: when truthy, it is passed to ``to_csv`` as the
            destination of a tab-separated copy of the table.
        :param sort_by: column to sort by; a falsy value skips sorting.
        :return: the resulting DataFrame (also kept in ``self.fastx_index``).
        """
        rows = []
        for read in FastxFile(self.fastx):
            rows.append(extract_read_data(read))
        self.fastx_index = pandas.DataFrame(rows)

        if sort_by:
            self.fastx_index = self.fastx_index.sort_values(sort_by)

        if fout:
            self.fastx_index.to_csv(fout, sep='\t', index=False)

        return self.fastx_index
コード例 #14
0
def fx_filter(fpath, ids, output, column, sep):
    """ Filter reads by external file of read header names

    :param fpath: FASTA/FASTQ file to filter.
    :param ids: headerless table file holding the read names to keep.
    :param output: destination passed to ``get_output_handle``.
    :param column: position of the column of ``ids`` holding the names.
    :param sep: column separator of ``ids``.
    """
    table = pandas.read_csv(ids, sep=sep, header=None)

    # Names are compared as strings, whatever type pandas inferred
    wanted = {str(value) for value in table.iloc[:, column].tolist()}

    with FastxFile(fpath) as fin:
        with get_output_handle(output) as fout:
            for read in fin:
                if read.name not in wanted:
                    continue
                fout.write(str(read) + "\n")
コード例 #15
0
def readfx(fastx):
    """Iterate over the records of a FASTX file.

    The function is a generator: the path is checked and the file opened
    when the first item is requested, and the file is closed when the
    generator finishes or is discarded.

    Args:
        fastx (str): file path to fasta or fastq file; supports gzip compressed files

    Yields:
        tuple: read name, sequence, and quality scores (taken from the
            record's ``quality`` attribute) for each record.

    Raises:
        IOError: If `fastx` file does not exist.

    """
    if not os.path.exists(fastx):
        raise IOError(2, "No such file:", fastx)
    # Falsy placeholder: cleanup is skipped if the reader was never created
    reader = ""
    try:
        reader = FastxFile(fastx)
        yield from (
            (record.name, record.sequence, record.quality)
            for record in reader
        )
    finally:
        if reader:
            reader.close()
コード例 #16
0
ファイル: repeatfinder.py プロジェクト: LankyCyril/edgecase
def explain_report(filtered_analysis, sequencefile, min_repeats, jobs=1):
    """Calculate fraction of reads explainable by each motif

    For every read, and for every motif of ``filtered_analysis``, the
    positions covered by matches of the motif pattern are counted. The
    counts are added up per motif in a ``bases_explained`` column.

    :param filtered_analysis: table with a ``motif`` column; it is copied,
        not modified.
    :param sequencefile: FASTA/FASTQ file with the reads.
    :param min_repeats: passed to ``get_circular_pattern`` as ``repeats``.
    :param jobs: number of worker threads.
    :return: tuple ``(table with bases_explained, total number of bases)``.
    """
    explained_analysis = filtered_analysis.copy()
    explained_analysis["bases_explained"], total_bases = 0.0, 0
    # The motif set is identical for every read: build it once instead of
    # once per submitted job. Workers only iterate over it.
    motifs = set(filtered_analysis["motif"])

    def get_number_of_masked_positions(sequence, motifs):
        """Count, per motif, the positions of ``sequence`` it covers."""
        n_masked_positions_per_motif = {}
        for motif in motifs:
            positions_to_mask = set()
            motifs_pattern = get_circular_pattern(
                motif,
                repeats=min_repeats,
            )
            matcher = motifs_pattern.finditer(sequence, overlapped=True)
            for match in matcher:
                # a set, so overlapping matches count each position once
                positions_to_mask.update(range(match.start(), match.end()))
            n_masked_positions_per_motif[motif] = len(positions_to_mask)
        return n_masked_positions_per_motif, len(sequence)

    with FastxFile(sequencefile) as fastx:
        with ThreadPoolExecutor(max_workers=jobs) as pool:
            workers = [
                pool.submit(
                    get_number_of_masked_positions,
                    entry.sequence,
                    motifs,
                ) for entry in fastx
            ]
            iterator = progressbar(
                as_completed(workers),
                total=len(workers),
                desc="Calculating fractions",
                unit="read",
            )
            for worker in iterator:
                n_masked_positions_per_motif, total_seq_bases = worker.result()
                for motif, n_pos in n_masked_positions_per_motif.items():
                    indexer = (
                        explained_analysis["motif"] == motif,
                        "bases_explained",
                    )
                    explained_analysis.loc[indexer] += n_pos
                total_bases += total_seq_bases
    return explained_analysis, total_bases
コード例 #17
0
def main(references: List[str], accession2taxid: str, output: str):
    """Write out a tsv file mapping reference names to taxids.

    :param references: FASTA/FASTQ files whose record names are looked up.
    :param accession2taxid: tab-separated table with a header row; its
        second column is used as the lookup index and it must provide a
        ``taxid`` column.
    :param output: path of the tsv file to write (``name<TAB>taxid``).

    The process exits with status 1 at the first record name that is not
    found in the table.
    """
    accession2taxid_df = pd.read_csv(accession2taxid,
                                     sep='\t',
                                     header=0,
                                     index_col=1)

    # The context manager guarantees that the output is flushed and closed,
    # also when the lookup below aborts through ``sys.exit``.
    with open(output, 'w') as output_tsv:
        for ref in references:
            with FastxFile(ref) as fh:
                for entry in fh:
                    try:
                        taxid = accession2taxid_df.at[entry.name, 'taxid']
                    except KeyError:
                        print("Error: couldn't find taxid for {}".format(
                            entry.name))
                        sys.exit(1)
                    output_tsv.write('{name}\t{taxid}\n'.format(
                        name=entry.name, taxid=taxid))
コード例 #18
0
ファイル: utils.py プロジェクト: Wytamma/sketchy
def get_output_handle(fpath: str, fastx: bool = False, out: bool = True):
    """Return a file-like handle for ``fpath``.

    :param fpath: a file path, or ``"-"`` for a standard stream.
    :param fastx: if True (and ``fpath`` is a real path) return a
        ``FastxFile`` opened on the path instead of a plain text handle.
    :param out: True for an output handle (standard output, or the file
        opened for writing); False for an input handle (standard input, or
        the file opened for reading).
    :return: the selected handle.
    :raises NotADirectoryError: if the parent directory of ``fpath`` does
        not exist.
    """
    if fpath == "-":
        return sys.stdout if out else sys.stdin

    p = Path(fpath)
    if not p.parent.is_dir():
        raise NotADirectoryError(
            "Directory specified for output file does not exist: {}".
            format(p.parent))

    if fastx:
        return FastxFile(p)

    # An input handle must not be opened with "w": that truncated the very
    # file the caller wanted to read.
    return p.open("w" if out else "r")
コード例 #19
0
ファイル: datasources.py プロジェクト: nanoporetech/pore-c
 def _open_dataset(self):
     """Open the file at ``self._urlpath`` and return the ``FastxFile``."""
     reader = FastxFile(self._urlpath)
     return reader
コード例 #20
0
ファイル: fasta.py プロジェクト: wenliangz/sequana
class FastA(object):
    """Class to handle FastA files. Cannot be compressed

    The file is read through a ``FastxFile`` reader. An instance is its own
    iterator: looping over it yields the reader's records, and once the
    records run out the reader is reopened so that the instance can be
    looped over again. The ``names``, ``sequences``, ``comments`` and
    ``lengths`` properties each read the whole file.
    """
    def __init__(self, filename, verbose=False):
        """Open ``filename`` and count its records.

        :param str filename: path to the FASTA file; a name ending in
            ``.gz`` is rejected.
        :param bool verbose: currently unused.
        :raises ValueError: if *filename* ends with ``.gz``.
        """
        if filename.endswith(".gz"):
            raise ValueError("Must be decompressed.")
        self._fasta = FastxFile(filename)
        self.filename = filename
        logger.info("Reading input fasta file...please wait")
        # Count with a separate short-lived reader; stream the records
        # instead of storing them all, and close the handle afterwards.
        with FastxFile(filename) as counter:
            self._N = sum(1 for _ in counter)

    def __iter__(self):
        return self

    def __next__(self): # python 3
        return self.next()

    def _rewind(self):
        """Close the reader and reopen the file on its first record."""
        self._fasta.close()
        self._fasta = FastxFile(self._fasta.filename)

    def next(self): # python 2
        """Return the next record; rewind and stop when none is left.

        :raises StopIteration: when no record is left, or when reading the
            next record fails.
        :raises KeyboardInterrupt: re-raised after rewinding.
        """
        try:
            return next(self._fasta)
        except KeyboardInterrupt:
            # Let developers break a loop that takes too long: rewind, then
            # propagate the interrupt. Falling through to ``return d`` used
            # to fail with UnboundLocalError.
            self._rewind()
            raise
        except Exception:
            # End of file (or a read error, kept as best-effort behavior).
            self._rewind()
            raise StopIteration

    def __len__(self):
        return self._N

    def _get_names(self):
        return [this.name for this in self]
    names = property(_get_names)

    def _get_sequences(self):
        return [this.sequence for this in self]
    sequences = property(_get_sequences)

    def _get_comment(self):
        return [this.comment for this in self]
    comments = property(_get_comment)

    def _get_lengths(self):
        return [len(this.sequence) for this in self]
    lengths = property(_get_lengths)

    def get_lengths_as_dict(self):
        """Return a dict mapping each record name to its sequence length."""
        return dict(zip(self.names, self.lengths))

    def format_contigs_denovo(self, output_file, len_min=500):
        """Replace NODE with the project name and remove contigs with a length 
        lower than len_min.

        :param str output_file: output file name.
        :param int len_min: minimal length of contigs.

        Example:

            from sequana import FastA

            contigs = FastA("denovo_assembly.fasta")
            contigs.format_contigs_denovo("path/to/file.fasta", len_min=500)

        Results are stored in "path/to/file.fasta".
        """
        # catch basename of file without extension
        project = os.path.basename(output_file).split(".")[0]
        # create the output directory if needed (nothing to create when the
        # file name has no directory part)
        output_dir = os.path.dirname(output_file)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        n = 1
        with open(output_file, "w") as fp:
            for contigs in self:
                if len(contigs.sequence) < len_min:
                    # NOTE(review): writing stops at the first short contig,
                    # which presumably relies on the contigs being sorted
                    # by decreasing length - confirm.
                    break
                name = ">{}_{} {}\n".format(project, n, contigs.name)
                # wrap the sequence at 80 characters per line
                sequence = "\n".join(
                    contigs.sequence[i:i + 80]
                    for i in range(0, len(contigs.sequence), 80)) + "\n"
                fp.write(name + sequence)
                n += 1

    def select_random_reads(self, N=None, output_filename="random.fasta"):
        """Select random reads and save in a file

        :param int N: number of random unique reads to select
            should provide a number but a list can be used as well.
            A set or list is taken as the positions (0-based, in file
            order) of the reads to keep.
        :param str output_filename:
        :return: the set of selected read positions.
        :raises TypeError: if N is not an int, a set or a list.
        """
        import numpy as np
        thisN = len(self)
        if isinstance(N, int):
            N = min(N, thisN)
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient iteration
            cherries = set(cherries[0:N])
        elif isinstance(N, set):
            cherries = N
        elif isinstance(N, list):
            cherries = set(N)
        else:
            # Fail before the output file is truncated; the default N=None
            # used to crash on an unbound variable halfway through.
            raise TypeError(
                "N must be an int, a set or a list of read positions")

        pb = Progress(thisN) # since we scan the entire file
        with FastxFile(self.filename) as fasta, \
                open(output_filename, "w") as fh:
            for i, read in enumerate(fasta):
                if i in cherries:
                    fh.write(str(read) + "\n")
                pb.animate(i+1)
        return cherries

    def get_stats(self):
        """Return the number of records and their mean sequence length."""
        stats = {}
        # number of records in the file (was hard-coded to 2)
        stats["N"] = len(self)
        stats["mean_length"] = mean(self.lengths)
        return stats
コード例 #21
0
                logger.warning(f"Sample {sample} is not in the VCF [skipping]")
            else:
                s.add(sample)
        if len(s) == 0:
            logger.warning(
                "No valid samples found in {} - using all samples in VCF", samples_fname
            )
        else:
            samples = s
else:
    logger.info("Using all samples in VCF")
logger.info(f"Loaded {len(samples)} samples")

logger.info("Determining which strand each gene is on...")
# Maps each reference record name to its strand symbol.
strand: Dict[str, str] = dict()
with FastxFile(fasta_ref) as reference:
    for entry in reference:
        # A record without a header comment has no text to split; treat it
        # as empty so the ValueError below is raised for it, rather than an
        # AttributeError on None.
        for field in (entry.comment or "").rstrip().split():
            # The strand symbol is the character at index 7 of the field
            # (presumably fields look like "strand=+" - confirm). Fields
            # too short to hold it are ignored instead of raising
            # IndexError.
            if field.startswith("strand") and len(field) > 7:
                strand[entry.name] = field[7]
        if entry.name not in strand:
            raise ValueError(f"Couldn't find strand for {entry.name}")

logger.info("Extracting consensus sequences with bcftools consensus...")
with TemporaryDirectory() as tmpdirname:
    for sample in samples:
        outname = Path(tmpdirname) / f"{sample}.fa"
        args = (
            "bcftools",
            "consensus",
            "-s",
            sample,
コード例 #22
0
ファイル: util.py プロジェクト: kishwarshafin/pomoxis
def get_seq_lens(fastx):
    """Get sequence lengths from fastx file

    :param fastx: path to the FASTA/FASTQ file.
    :return: list with the sequence length of every record, in file order.
    """
    # Close the reader as soon as the lengths are collected.
    with FastxFile(fastx) as reads:
        return [len(read.sequence) for read in reads]
コード例 #23
0
def sequence_cleaner(fasta_q_file, min_length=0, percentage_n=100.0, concatenate_duplicates=True, remove_ambiguous=False):
    """Read a FASTA/FASTQ file and collect its sequences after filtering.

    Sequences are upper-cased before any test. A sequence containing an
    ambiguous base is skipped silently (no counter is incremented) when
    ``remove_ambiguous`` is set.

    Args:
        fasta_q_file (str): Path to FASTA/Q file.
        min_length (int): Sequences of this length or shorter are dropped
            (default=0).
        percentage_n (float): Maximum % of N allowed (default=100).
        concatenate_duplicates (bool): If True, identical sequences (also
            as reverse complement) share one entry keyed by the sequence
            and listing all IDs; if False, entries are keyed by sequence ID
            (default=True).
        remove_ambiguous (bool): Remove any sequence with an ambiguous base (default=False).

    Returns:
        collections.defaultdict: Hash with clean sequences.
        int: # Sequences Processed.
        int: # Repeated Sequences.
        int: # Repeated Sequences (Reverse Complement).
        int: # Short Sequences.
        int: # High N Sequences.

    """
    hash_sequences = defaultdict(list)
    n_processed = n_repeated = n_repeated_rc = n_short = n_high_n = 0

    with FastxFile(fasta_q_file) as records:
        for record in records:
            n_processed += 1
            read_id = record.name
            bases = record.sequence.upper()

            # found ambiguous base. Sequence is skipped
            if remove_ambiguous and any(base in AMBIGUOUS_BASES for base in bases):
                continue

            # remove sequences that are shorter or equal to `min_length`
            if len(bases) <= min_length:
                n_short += 1
                continue

            # remove sequences that do not meet the % N
            if (float(bases.count("N")) / float(len(bases))) * 100 > percentage_n:
                n_high_n += 1
                continue

            if not concatenate_duplicates:
                hash_sequences[read_id].append(bases)
                continue

            # repeated sequence - add sequence ID to hash
            if bases in hash_sequences:
                hash_sequences[bases].append(read_id)
                n_repeated += 1
                continue

            # check if reverse complement is already in hash; if so, add a
            # modified ID and flag that the reverse complement was repeated
            rc = reverse_complement(bases)
            if rc in hash_sequences:
                hash_sequences[rc].append("{}_RC".format(read_id))
                n_repeated += 1
                n_repeated_rc += 1
            else:
                # first time the sequence is seen - add it to hash
                hash_sequences[bases].append(read_id)

    return (hash_sequences, n_processed, n_repeated, n_repeated_rc,
            n_short, n_high_n)
コード例 #24
0
ファイル: fasta.py プロジェクト: sequana/sequana
class FastA(object):
    """Class to handle FastA files. Cannot be compressed

    The file is read through a ``FastxFile`` reader. An instance is its own
    iterator: looping over it yields the reader's records, and once the
    records run out the reader is reopened so that the instance can be
    looped over again. The ``names``, ``sequences``, ``comments`` and
    ``lengths`` properties each read the whole file, so methods below fetch
    them once and reuse the result.
    """
    def __init__(self, filename, verbose=False):
        """Open ``filename``; the record count is computed lazily.

        :param str filename: path to the FASTA file; a name ending in
            ``.gz`` is rejected.
        :param bool verbose: currently unused.
        :raises ValueError: if *filename* ends with ``.gz``.
        """
        if filename.endswith(".gz"):  #pragma: no cover
            raise ValueError("Must be decompressed.")
        self._fasta = FastxFile(filename)
        self.filename = filename
        self._N = None

    def __iter__(self):
        return self

    def __next__(self):  # python 3
        return self.next()

    def _rewind(self):
        """Close the reader and reopen the file on its first record."""
        self._fasta.close()
        self._fasta = FastxFile(self._fasta.filename)

    def next(self):  # python 2
        """Return the next record; rewind and stop when none is left.

        :raises StopIteration: when no record is left, or when reading the
            next record fails.
        :raises KeyboardInterrupt: re-raised after rewinding.
        """
        try:
            return next(self._fasta)
        except KeyboardInterrupt:  #pragma: no cover
            # Let developers break a loop that takes too long: rewind, then
            # propagate the interrupt. Swallowing it handed ``None`` to the
            # caller as a record and restarted the iteration.
            self._rewind()
            raise
        except Exception:
            # End of file (or a read error, kept as best-effort behavior).
            self._rewind()
            raise StopIteration

    def __len__(self):
        if self._N is None:
            logger.info("Reading input fasta file...please wait")
            # Stream the records of a short-lived reader; no list is built
            # and the handle is closed once the count is known.
            with FastxFile(self.filename) as counter:
                self._N = sum(1 for _ in counter)
        return self._N

    def _get_names(self):
        return [this.name for this in self]

    names = property(_get_names)

    def _get_sequences(self):
        return [this.sequence for this in self]

    sequences = property(_get_sequences)

    def _get_comment(self):
        return [this.comment for this in self]

    comments = property(_get_comment)

    def _get_lengths(self):
        return [len(this.sequence) for this in self]

    lengths = property(_get_lengths)

    def get_lengths_as_dict(self):
        """Return a dict mapping each record name to its sequence length."""
        return dict(zip(self.names, self.lengths))

    def format_contigs_denovo(self, output_file, len_min=500):
        """Replace NODE with the project name and remove contigs with a length 
        lower than len_min.

        :param str output_file: output file name.
        :param int len_min: minimal length of contigs.

        Example:

            from sequana import FastA

            contigs = FastA("denovo_assembly.fasta")
            contigs.format_contigs_denovo("path/to/file.fasta", len_min=500)

        Results are stored in "path/to/file.fasta".
        """
        # catch basename of file without extension
        project = os.path.basename(output_file).split(".")[0]
        # create the output directory if needed (nothing to create when the
        # file name has no directory part)
        output_dir = os.path.dirname(output_file)
        if output_dir and not os.path.exists(output_dir):  #pragma: no cover
            os.makedirs(output_dir)

        n = 1
        with open(output_file, "w") as fp:
            for contigs in self:
                if len(contigs.sequence) < len_min:
                    # NOTE(review): writing stops at the first short contig,
                    # which presumably relies on the contigs being sorted
                    # by decreasing length - confirm.
                    break
                name = ">{}_{} {}\n".format(project, n, contigs.name)
                # wrap the sequence at 80 characters per line
                sequence = "\n".join(
                    contigs.sequence[i:i + 80]
                    for i in range(0, len(contigs.sequence), 80)) + "\n"
                fp.write(name + sequence)
                n += 1

    def filter(self,
               output_filename,
               names_to_keep=None,
               names_to_exclude=None):
        """Copy the file to ``output_filename``, keeping selected records.

        The raw text file is processed line by line; a record is identified
        by the first word of its ``>`` header line.

        :param output_filename: file to write.
        :param names_to_keep: names of the records to copy.
        :param names_to_exclude: names of the records to drop; takes
            precedence over ``names_to_keep`` when both are given.
        """
        if names_to_exclude is None and names_to_keep is None:  #pragma: no cover
            logger.warning("No ids provided")
            return

        if names_to_exclude:
            def wanted(name):
                return name not in names_to_exclude
            # lines found before the first header are copied
            writing = True
        elif names_to_keep:
            def wanted(name):
                return name in names_to_keep
            # lines found before the first header are dropped
            writing = False
        else:
            # both selections are empty: nothing is written
            return

        # do not use readlines. may be slower but may cause memory issue
        with open(self.filename) as fin, open(output_filename, "w") as fout:
            for line in fin:
                if line.startswith(">"):
                    writing = wanted(line[1:].split()[0])
                if writing:
                    fout.write(line)

    def select_random_reads(self, N=None, output_filename="random.fasta"):
        """Select random reads and save in a file

        :param int N: number of random unique reads to select
            should provide a number but a list can be used as well.
            A set or list is taken as the positions (0-based, in file
            order) of the reads to keep.
        :param str output_filename:
        :return: the set of selected read positions.
        :raises TypeError: if N is not an int, a set or a list.
        """
        import numpy as np
        thisN = len(self)
        if isinstance(N, int):
            N = min(N, thisN)
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient iteration
            cherries = set(cherries[0:N])
        elif isinstance(N, set):
            cherries = N
        elif isinstance(N, list):
            cherries = set(N)
        else:
            # Fail before the output file is truncated; the default N=None
            # used to crash on an unbound variable halfway through.
            raise TypeError(
                "N must be an int, a set or a list of read positions")

        pb = Progress(thisN)  # since we scan the entire file
        with FastxFile(self.filename) as fasta, \
                open(output_filename, "w") as fh:
            for i, read in enumerate(fasta):
                if i in cherries:
                    fh.write(str(read) + "\n")
                pb.animate(i + 1)
        return cherries

    def get_stats(self):
        """Return a dict of summary statistics on the sequence lengths.

        Keys: ``N``, ``mean_length``, ``total_length``, ``N50``, ``L50``,
        ``min_length`` and ``max_length``.
        """
        from pylab import mean
        # read the file once and reuse the lengths for every statistic
        lengths = self.lengths
        stats = {}
        stats["N"] = len(lengths)
        stats["mean_length"] = mean(lengths)
        stats["total_length"] = sum(lengths)
        from sequana.stats import N50, L50
        stats["N50"] = N50(lengths)
        stats["L50"] = L50(lengths)
        stats["min_length"] = min(lengths)
        stats["max_length"] = max(lengths)
        return stats

    def summary(self, max_contigs=-1):
        """Print global statistics followed by one CSV line per contig.

        Contigs are listed by decreasing length (file order among equal
        lengths) with their counts of A, C, G, T and N.

        :param int max_contigs: maximum number of contigs to list; -1
            lists all of them.
        """
        # used by sequana summary fasta
        # Read names and sequences in one pass; the previous version
        # re-read the whole file twice for every printed contig.
        names, sequences = [], []
        for read in self:
            names.append(read.name)
            sequences.append(read.sequence)
        lengths = [len(sequence) for sequence in sequences]

        stats = self.get_stats()
        print("#sample_name: {}".format(self.filename))
        print("#total length: {}".format(stats['total_length']))
        print("#N50: {}".format(stats['N50']))
        print("#Ncontig: {}".format(stats['N']))
        print("#L50: {}".format(stats['L50']))
        print("#max_contig_length: {}".format(stats['max_length']))
        print("#min_contig_length: {}".format(stats['min_length']))
        print("#mean_contig_length: {}".format(stats['mean_length']))

        print("contig name,length,count A,C,G,T,N")
        limit = len(lengths) if max_contigs == -1 else max(max_contigs, 0)
        # stable sort: equal lengths keep their file order
        order = sorted(range(len(lengths)), key=lambda i: -lengths[i])
        for position in order[:limit]:
            sequence = sequences[position]
            print("{},{},{},{},{},{},{}".format(names[position],
                                                lengths[position],
                                                sequence.count('A'),
                                                sequence.count('C'),
                                                sequence.count('G'),
                                                sequence.count('T'),
                                                sequence.count('N')))

    def GC_content_sequence(self, sequence):
        """Return the percentage of G/C (either case) in ``sequence``."""
        GC = sequence.count('G') + sequence.count('g')
        GC += sequence.count('C') + sequence.count('c')
        return GC / len(sequence) * 100

    def GC_content(self):
        """Return the percentage of G/C (either case) over all sequences."""
        # one pass over the file for both the total length and the counts
        sequences = self.sequences
        lengths = sum(len(seq) for seq in sequences)
        GC = 0
        for seq in sequences:
            GC += seq.count('G') + seq.count('g')
            GC += seq.count('C') + seq.count('c')
        return GC / lengths * 100

    def reverse_and_save(self, filename):
        """Write every record to ``filename`` with its sequence reversed."""
        with open(filename, "w") as fout:
            for read in self:
                reversed_sequence = read.sequence[::-1]
                if read.comment is None:
                    # do not print the literal text "None" in the header
                    fout.write(">{}\n{}\n".format(read.name,
                                                  reversed_sequence))
                else:
                    fout.write(">{}\t{}\n{}\n".format(read.name,
                                                      read.comment,
                                                      reversed_sequence))

    def save_ctg_to_fasta(self, ctgname, outname, max_length=-1):
        """Save one contig into ``<outname>.fa`` under the name ``outname``.

        :param ctgname: name of the contig to extract.
        :param outname: output file name without extension, also used as
            the header of the saved record.
        :param int max_length: keep only this many leading bases; -1 keeps
            the whole sequence.
        :raises ValueError: if ``ctgname`` is not a record name.
        """
        index = self.names.index(ctgname)
        sequence = self.sequences[index]
        if max_length != -1:
            sequence = sequence[0:max_length]
        with open("{}.fa".format(outname), "w") as fout:
            fout.write(">{}\n{}".format(outname, sequence))

    def to_fasta(self, outfile, width=80):
        """Save the input FastA file into a new file

        The interest of this method is to wrap the sequence into 80 characters.
        This is useful if the input file is not formatted correctly.

        :param outfile: file to write.
        :param int width: maximum number of characters per sequence line.
        """
        import textwrap
        with open(outfile, "w") as fout:
            # single pass over the records instead of three full reads
            for read in self:
                seq = "\n".join(textwrap.wrap(read.sequence, width))
                if read.comment is None:
                    fout.write(">{}\n{}\n".format(read.name, seq))
                else:
                    fout.write(">{}\t{}\n{}\n".format(read.name,
                                                      read.comment, seq))

    def to_igv_chrom_size(self, output):
        """Write one ``name<TAB>length`` line per record to ``output``."""
        data = self.get_lengths_as_dict()
        with open(output, "w") as fout:
            for k, v in data.items():
                fout.write("{}\t{}\n".format(k, v))
コード例 #25
0
ファイル: fasta.py プロジェクト: sequana/sequana
 def __init__(self, filename, verbose=False):
     """Open an uncompressed FASTA file.

     The record count (``self._N``) is left unset here.

     :param str filename: path to the FASTA file; a name ending in
         ``.gz`` is rejected.
     :param bool verbose: currently unused by this method.
     :raises ValueError: if *filename* ends with ``.gz``.
     """
     is_gzipped = filename.endswith(".gz")
     if is_gzipped:  #pragma: no cover
         raise ValueError("Must be decompressed.")
     self._fasta, self.filename = FastxFile(filename), filename
     self._N = None
コード例 #26
0
ファイル: fasta.py プロジェクト: ranjit58/sequana
class FastA(object):
    """Class to handle FastA files. Cannot be compressed

    The instance is its own iterator over the records of the file. When the
    end of the file is reached the underlying reader is re-opened, so the
    object can be iterated again (this is what the ``names``, ``sequences``,
    ``comments`` and ``lengths`` properties rely on).

    """
    def __init__(self, filename, verbose=False):
        """Open *filename* and count its records.

        :param str filename: path to an uncompressed FastA file.
        :param bool verbose: not used by this constructor.
        :raises ValueError: if *filename* ends with ``.gz``.
        """
        if filename.endswith(".gz"):
            raise ValueError("Must be decompressed.")
        # Kept so the reader can be re-opened without depending on the
        # state of a reader that has already been closed.
        self.filename = filename
        self._fasta = FastxFile(filename)
        self._N = self._count_records()

    def _count_records(self):
        """Return the number of records, using a dedicated reader."""
        handle = FastxFile(self.filename)
        try:
            # Count lazily instead of materialising every record in a list.
            return sum(1 for _ in handle)
        finally:
            handle.close()

    def _rewind(self):
        """Close the current reader and re-open the file from the start."""
        self._fasta.close()
        self._fasta = FastxFile(self.filename)

    def __iter__(self):
        return self

    def __next__(self):  # python 3
        return self.next()

    def next(self):  # python 2
        """Return the next record.

        Whenever the underlying reader raises (end of file, Ctrl-C, or a
        parsing error) the file is rewound so the object stays reusable,
        and the original exception is re-raised. In particular a
        ``KeyboardInterrupt`` now propagates as such, and parsing errors
        are no longer silently turned into an early end of iteration.
        """
        try:
            return next(self._fasta)
        except (Exception, KeyboardInterrupt):
            self._rewind()
            raise

    def __len__(self):
        return self._N

    def _get_names(self):
        return [this.name for this in self]

    names = property(_get_names)

    def _get_sequences(self):
        return [this.sequence for this in self]

    sequences = property(_get_sequences)

    def _get_comment(self):
        return [this.comment for this in self]

    comments = property(_get_comment)

    def _get_lengths(self):
        return [len(this.sequence) for this in self]

    lengths = property(_get_lengths)

    def format_contigs_denovo(self, output_file, len_min=500):
        """Replace NODE with the project name and remove contigs with a length 
        lower than len_min.

        Kept contigs are renamed ``<project>_<n> <original name>`` where
        *project* is the basename of *output_file* up to its first dot and
        *n* counts the kept contigs from 1. Sequences are wrapped at 80
        characters per line.

        :param str output_file: output file name.
        :param int len_min: minimal length of contigs.

        Example:

            from sequana import FastA

            contigs = FastA("denovo_assembly.fasta")
            contigs.format_contigs_denovo("path/to/file.fasta", len_min=500)

        Results are stored in "path/to/file.fasta".
        """
        # catch basename of file without extension
        project = os.path.basename(output_file).split(".")[0]
        # Create the target directory if needed; an empty dirname means the
        # current directory, for which there is nothing to create.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        n = 1
        with open(output_file, "w") as fp:
            for contig in self:
                sequence = contig.sequence
                if len(sequence) < len_min:
                    # Skip rather than stop: the input is not guaranteed to
                    # be sorted by length, and reading to the end lets the
                    # iterator rewind for later use.
                    continue
                header = ">{}_{} {}\n".format(project, n, contig.name)
                body = "\n".join(
                    sequence[i:i + 80]
                    for i in range(0, len(sequence), 80)) + "\n"
                fp.write(header + body)
                n += 1
コード例 #27
0
ファイル: fasta.py プロジェクト: sequana/sequana
 def __len__(self):
     """Return the number of records, counting them on first use.

     The count is cached in ``self._N``; the file named by
     ``self.filename`` is only scanned while ``self._N`` is ``None``.
     """
     if self._N is None:
         logger.info("Reading input fasta file...please wait")
         handle = FastxFile(self.filename)
         try:
             # Count lazily: no need to hold every record in memory.
             self._N = sum(1 for _ in handle)
         finally:
             # The counting reader is separate from the iteration reader
             # and must not be left open.
             handle.close()
     return self._N
コード例 #28
0
ファイル: fasta.py プロジェクト: ranjit58/sequana
 def __init__(self, filename, verbose=False):
     """Open *filename* and count its records.

     :param str filename: path to a FastA file; names ending in ``.gz``
         are rejected.
     :param bool verbose: not used by this constructor.
     :raises ValueError: if *filename* ends with ``.gz``.
     """
     if filename.endswith(".gz"):
         raise ValueError("Must be decompressed.")
     self._fasta = FastxFile(filename)
     # Count with a second, short-lived reader so the main one stays at
     # the start of the file; count lazily and always close it.
     counter = FastxFile(filename)
     try:
         self._N = sum(1 for _ in counter)
     finally:
         counter.close()