Ejemplo n.º 1
0
    def to_chunks(self, chunk_path='chunk{:03d}.tsv.gz', chunk_size=1e07):
        """Split MGnify sequences file into chunks
        Given a .fa[.gz] file, makes chunks of <chunk_size> entries and
        persists each of them through `self.write_chunk`, naming it according
        to its chunk index.
        Note that <chunk_size> refers to the number of entries, not to the
        number of lines in output chunk, hence chunk sizes are heterogeneous.

        Args
        chunk_path (str)    String containing the path of a generic chunk file,
                            e.g. `chunk{:d}.fa.gz` (must be formattable)
        chunk_size (int)    Maximum number of fasta entries to be stored in
                            each chunk (truncated to an integer)

        Return
        (int)               Number of chunks written, zero if the input file
                            holds no entry

        Raise
        (FileNotFoundError) If given chunk path is not valid
        (ValueError)        If chunk size is lower than one
        """
        # Default chunk size is a float literal: work with an integer count
        chunk_size = int(chunk_size)
        # A chunk must hold at least one entry
        if chunk_size < 1:
            raise ValueError('Chunk size must be at least one entry')
        # Get output directory
        chunks_dir = os.path.dirname(chunk_path)
        # Case an output directory is given and does not exist: an empty
        # directory name means current working directory, nothing to create
        if chunks_dir and not os.path.exists(chunks_dir):
            # Attempt to make a new output directory
            os.mkdir(chunks_dir)
        # Initialize current chunk (batch of fasta sequences entries)
        seq_batch = list()
        # Initialize number of chunks written, also index of next chunk
        num_chunks = 0
        # Open file for reading
        with open_file(self.path) as file:
            # Loop through every entry in input file
            for entry in fasta_iter(file):
                # Save current entry
                seq_batch.append(entry)
                # Case current batch is full
                if len(seq_batch) == chunk_size:
                    # Persist chunk to disk
                    self.write_chunk(chunk_path,
                                     num_chunks,
                                     seq_batch,
                                     sep='\n')
                    # Update number of chunks written
                    num_chunks += 1
                    # Reinitialize chunk content
                    seq_batch = list()
            # Persist last (partial) chunk, if any
            if seq_batch:
                # Persist chunk to disk
                self.write_chunk(chunk_path, num_chunks, seq_batch, sep='\n')
                # Update number of chunks written
                num_chunks += 1
        # Return number of chunks
        return num_chunks
Ejemplo n.º 2
0
    def from_aln(cls, in_path):
        """Load MSA from .aln file

        Each header line has the form <accession>/<begin>-<end>, followed by
        residues. Lines without a header are appended to the entry opened by
        the most recent header; lines found before any header are skipped.
        A header repeating an already seen accession restarts that entry.

        Args
        in_path (str)   Path to MSA .aln file (even if gzipped)

        Return
        (MSA)           Loaded MSA object, with attributes `acc`, `aln`, `beg`
                        and `end` set to numpy arrays
        """
        # Compile header pattern once: accession, begin, end, then padding
        header_re = re.compile(r'^([a-zA-Z0-9]+)/(\d+)-(\d+)[ ]*')
        # Compile pattern matching everything which is neither a letter nor
        # a gap character (either dot or minus)
        non_residue_re = re.compile(r'[^\-\.a-zA-Z]+')
        # Initialize entries dictionary (key is accession)
        entries = dict()
        # Initialize entry currently being filled (none before first header)
        current = None
        # Read file
        with open_file(in_path, 'r', 'rt') as in_file:
            # Loop through each line in file
            for line in in_file:
                # Check if current line is header
                is_header = header_re.search(line)
                # If header, retrieve it
                if is_header:
                    # Retrieve accession
                    acc = is_header.group(1)
                    # Instantiate new entry and make it the current one
                    current = entries[acc] = {
                        'acc': acc,
                        'beg': int(is_header.group(2)),
                        'end': int(is_header.group(3)),
                        'res': list()
                    }
                    # Remove header from line
                    line = line[is_header.end():]
                # Case no entry has been opened yet
                elif current is None:
                    continue  # Nothing to append residues to
                # Retrieve residues: remove all non-letter, non-gap characters
                current['res'] += list(non_residue_re.sub('', line))
        # Get accession (keys), in order of first appearance
        acc = list(entries)
        # Get residues matrix
        res = [entries[k]['res'] for k in acc]
        # Init new MSA object
        msa = cls()
        # Store current data (cast to numpy array); builtin int and np.str_
        # replace np.int and np.unicode_ aliases, removed from recent NumPy
        msa.acc = np.array(acc, dtype=np.str_)
        msa.aln = np.array(res, dtype=np.str_)
        msa.beg = np.array([entries[k]['beg'] for k in acc], dtype=int)
        msa.end = np.array([entries[k]['end'] for k in acc], dtype=int)
        # Return newly created MSA object
        return msa
Ejemplo n.º 3
0
 def get_length(self):
     """Count the entries in the underlying fasta file

     Return
     (int)   Number of entries yielded by `fasta_iter` over `self.path`
     """
     # Open underlying file
     with open_file(self.path) as file:
         # Consume the entries iterator, adding one unit per entry
         return sum(1 for _ in fasta_iter(file))
Ejemplo n.º 4
0
    def search(self, sequences_acc, ret_length=False, verbose=False):
        """Retrieve sequences residues
        Takes a list of sequences accessions and searches for the associated
        entries by scanning underlying fasta file headers.

        Args
        sequences_acc (list)    List of sequences accession numbers whose
                                residues must be found in given fasta file
        ret_length (bool)       Whether to return the length of the searched
                                target dataset (disables early stopping
                                criterion, hence the whole file is scanned)
        verbose (bool)          Whether to print out verbose log

        Return
        (dict(str: str))        Dictionary containing sequences accession
                                numbers as keys and fasta entries as values
        (int)                   Number of entries in the scanned file, only
                                returned if <ret_length> is set
        """
        # Cast sequences accession numbers to set
        sequences_acc = set(sequences_acc)
        # Initialize output dict(sequence acc: fasta entry) and length
        sequences, length = dict(), 0
        # Verbose out
        if verbose:
            print('Reading sequences file', self.path)
        # Open file with defined file handler
        with open_file(self.path) as file:
            # Define fasta entries iterator
            tqdm_iter = tqdm(
                fasta_iter(file),  # Input iterator
                disable=(not verbose),  # Set verbose
                file=sys.stdout  # Force printing to stdout
            )
            # Loop through each entry in input fasta file
            for entry in tqdm_iter:
                # Only the header (first line) is needed: do not assume that
                # residues fit on exactly one line
                header = entry.partition('\n')[0]
                # Get accession number from header
                acc = re.search(r'^>(\S+)', header).group(1)
                # Case accession is one of the searched ones
                if acc in sequences_acc:
                    # Store entry
                    sequences[acc] = entry
                # Case length must be returned: count entry, never stop early
                if ret_length:
                    length += 1
                # Case all sequences have been found
                elif len(sequences) == len(sequences_acc):
                    break  # Early stopping
        # Case length must be returned
        if ret_length:
            return sequences, length
        # Case only sequences must be returned
        return sequences
Ejemplo n.º 5
0
    def search(self, cluster_names, verbose=False):
        """Retrieve sequences accession numbers
        Scans the clusters file at `self.path` line by line and collects, for
        each requested cluster name, the sequences accession numbers found on
        the lines starting with that name.

        Args
        cluster_names (list(str))   Names of the clusters whose sequences
                                    members must be retrieved
        verbose (bool)              Whether to print out verbose log

        Return
        (dict(str:list))            Dictionary whose keys are the requested
                                    cluster names and whose values are lists
                                    of sequences accession numbers, empty for
                                    clusters never found in file
        """
        # Deduplicate requested names
        wanted = set(cluster_names)
        # Every requested cluster starts with no members
        members = {name: [] for name in wanted}
        # Verbose out
        if verbose:
            print('Reading clusters file', self.path)
        # Open dataset file
        with open_file(self.path) as file:
            # Loop through every line in file, showing progress if verbose
            for line in tqdm(file, disable=(not verbose), file=sys.stdout):
                # Match cluster name and sequence accession
                match = re.search(r'^([a-zA-Z0-9]+)\s+([a-zA-Z0-9]+)', line)
                # Case line format matches (other lines are ignored)
                if match:
                    # Retrieve cluster name and sequence accession
                    cluster, accession = match.group(1), match.group(2)
                    # Case cluster is one of the searched ones
                    if cluster in wanted:
                        # Store found sequence accession
                        members[cluster].append(accession)
        # Return dict (cluster name: sequences accession numbers)
        return members
Ejemplo n.º 6
0
 def get_longest(self):
     """Find the entry with the highest number of residues

     Every entry is split on newline into exactly two parts, header and
     residues; any other number of parts makes the unpacking raise a
     ValueError.

     Return
     (str)   Longest entry, the first one found in case of ties, empty
             string if no entry has at least one residue
     (int)   Number of residues in the longest entry
     (int)   Number of entries scanned
     """
     # Initialize best entry found so far and its number of residues
     best_entry, best_size = '', 0
     # Initialize number of entries, stays zero if file holds none
     count = 0
     # Open inner dataset file path
     with open_file(self.path) as file:
         # Loop through each file entry, keeping track of entries count
         for count, entry in enumerate(fasta_iter(file), start=1):
             # Split current entry in header and residues
             _, residues = entry.split('\n')
             # Get number of residues in current entry
             size = len(residues)
             # Case current entry is strictly longer than best one
             if size > best_size:
                 # Update best entry and its number of residues
                 best_entry, best_size = entry, size
     # Return longest entry, its length and the number of entries
     return best_entry, best_size, count
Ejemplo n.º 7
0
import os
import sys

sys.path.append(os.path.abspath(__file__ + "../../.."))

from src.utils import next_game, open_file
from src.map import Map

if __name__ == "__main__":
    # Play rounds until next_game() returns a falsy value
    keep_playing = True
    while keep_playing:
        # Ask where the map comes from: "1" for a file, "2" for the keyboard
        first_step = input(
            "How do you want to give map me? File(1) or Keyboard(2)? ")
        # A fresh map is created for every round, whatever the answer
        treasure = Map()
        from_file_chosen = first_step == "1"
        # Any answer other than "1" or "2" skips straight to next_game()
        if from_file_chosen or first_step == "2":
            if from_file_chosen:
                print("Please provide me the path to the file ")
                fl_name = sys.stdin.readline().strip()
                # Feed the map with whatever open_file returns for the path
                treasure.from_file(open_file(fl_name))
            else:
                print(
                    "Please enter 25 numbers (each of them should be between 11 and 55) "
                )
                treasure.from_keyboard()
            # Both input modes end with the hunt on the loaded map
            treasure.hunt_treasure()
        keep_playing = next_game()