Ejemplo n.º 1
0
    def __init__(self,
                 inst_dir=conf_constants.hmmer_inst_dir,
                 tmp_dir=None,
                 config_path=None,
                 logger=None):
        """Store the HMMER handler settings and initialise the parent class.

        :param inst_dir: HMMER installation directory; the default is the
            value ``conf_constants.hmmer_inst_dir`` held when this method
            was defined
        :param tmp_dir: working directory name; when None it is built as
            ``generate_random_string(10) + "_hmm_tmp"``
        :param config_path: forwarded to the parent ``__init__``
        :param logger: stored on the instance unchanged (may be None)
        """
        self.logger = logger
        self.inst_dir = inst_dir
        # A fresh name is generated only when the caller did not supply one.
        self.tmp_dir = (generate_random_string(10) + "_hmm_tmp"
                        if tmp_dir is None else tmp_dir)

        # The parent initializer runs last, after all attributes are set.
        super(HmmerHandler, self).__init__(config_path=config_path)
Ejemplo n.º 2
0
    def __init__(self, seqs_order, matr, short_to_full_seq_names=None, **kwargs):
        """Store a matrix together with the ordering of its sequences.

        :param seqs_order: stored unchanged as ``self.seqs_order``
        :param matr: the matrix; must be a ``numpy.ndarray`` (checked with
            an ``assert``, so the check disappears under ``python -O``)
        :param short_to_full_seq_names: name mapping; a new empty dict is
            used when None
        :param kwargs: optional settings read by key: ``aln_name``,
            ``aln_type``, ``calc_method``, ``logger`` (each defaults to
            None), ``emboss_inst_dir`` (defaults to
            ``self.default_emboss_inst_dir``) and ``tmp_dir`` (defaults to
            ``generate_random_string(10) + "_dm_tmp"``)
        """
        self.seqs_order = seqs_order
        assert isinstance(matr, np.ndarray), "ERROR: value for 'matr' argument should be numpy.ndarray"
        self.matr = matr
        if short_to_full_seq_names is None:
            short_to_full_seq_names = dict()
        self.short_to_full_seq_names = short_to_full_seq_names

        # Plain optional attributes that all fall back to None.
        for optional_key in ("aln_name", "aln_type", "calc_method"):
            setattr(self, optional_key, kwargs.get(optional_key, None))

        # Both fallbacks are computed up front, even when the key is given.
        fallback_emboss_dir = self.default_emboss_inst_dir
        fallback_tmp_dir = generate_random_string(10) + "_dm_tmp"
        self.emboss_inst_dir = kwargs.get("emboss_inst_dir", fallback_emboss_dir)
        self.tmp_dir = kwargs.get("tmp_dir", fallback_tmp_dir)
        self.logger = kwargs.get("logger", None)
Ejemplo n.º 3
0
    def __init__(self,
                 inst_dir=None,
                 tmp_dir=None,
                 config_path=None,
                 logger=None):
        """Store the BLAST handler settings and initialise the parent class.

        :param inst_dir: BLAST installation directory; when None,
            ``conf_constants.blast_inst_dir`` is read at call time
        :param tmp_dir: working directory name; when None it is built as
            ``generate_random_string(10) + "_blast_tmp"``
        :param config_path: forwarded to the parent ``__init__``
        :param logger: stored on the instance unchanged (may be None)
        """
        chosen_tmp_dir = (generate_random_string(10) + "_blast_tmp"
                          if tmp_dir is None else tmp_dir)

        self.inst_dir = (conf_constants.blast_inst_dir
                         if inst_dir is None else inst_dir)
        self.tmp_dir = chosen_tmp_dir
        self.logger = logger

        # The parent initializer runs last, after all attributes are set.
        super(BlastHandler, self).__init__(config_path=config_path)
Ejemplo n.º 4
0
    def _sub_mult_aln(self,
                      mult_aln_dict,
                      aln_type=None,
                      states_seq=None,
                      aln_name="mult_aln",
                      tmp_dir=None,
                      short_to_full_seq_names=None,
                      emboss_inst_dir=None,
                      hmmer_inst_dir=None,
                      kaks_calculator_exec_path=None,
                      config_path=None,
                      logger=None):
        """Create a new ``MultAln`` that inherits this object's settings.

        Every argument left as None falls back to the attribute of the same
        name on ``self``, with three exceptions: ``states_seq`` falls back
        to a new empty list, ``tmp_dir`` to
        ``generate_random_string(10) + "_mult_aln_tmp"`` and
        ``short_to_full_seq_names`` to a copy of
        ``self.short_to_full_seq_names``.

        :param mult_aln_dict: passed straight to ``MultAln``
        :param aln_name: passed straight to ``MultAln``
        :return: the new ``MultAln`` with ``short_to_full_seq_names``
            assigned after construction
        """
        aln_type = self.aln_type if aln_type is None else aln_type
        states_seq = list() if states_seq is None else states_seq
        tmp_dir = (generate_random_string(10) + "_mult_aln_tmp"
                   if tmp_dir is None else tmp_dir)
        # Copy so the new alignment never shares the mapping with self.
        names_map = (self.short_to_full_seq_names.copy()
                     if short_to_full_seq_names is None
                     else short_to_full_seq_names)
        emboss_inst_dir = (self.emboss_inst_dir
                           if emboss_inst_dir is None else emboss_inst_dir)
        hmmer_inst_dir = (self.hmmer_inst_dir
                          if hmmer_inst_dir is None else hmmer_inst_dir)
        kaks_calculator_exec_path = (self.kaks_calculator_exec_path
                                     if kaks_calculator_exec_path is None
                                     else kaks_calculator_exec_path)
        config_path = self.config_path if config_path is None else config_path
        logger = self.logger if logger is None else logger

        sub_aln = MultAln(
            mult_aln_dict=mult_aln_dict,
            aln_type=aln_type,
            aln_name=aln_name,
            tmp_dir=tmp_dir,
            states_seq=states_seq,
            emboss_inst_dir=emboss_inst_dir,
            hmmer_inst_dir=hmmer_inst_dir,
            kaks_calculator_exec_path=kaks_calculator_exec_path,
            config_path=config_path,
            logger=logger,
        )
        # The constructor takes no name mapping, so it is attached here.
        sub_aln.short_to_full_seq_names = names_map
        return sub_aln
Ejemplo n.º 5
0
 def load_from_dict(cls, in_dict, low_memory=False, **kwargs):
     """Build an instance from a mapping of sequence ids to sequences.

     Sequences are stored as fixed-width byte strings. The width is 1000,
     or the length of the longest sequence when that is larger.

     :param in_dict: mapping ``sequence id -> sequence``
     :param low_memory: when true the array is an ``np.memmap`` backed by
         a file on disk, otherwise an in-memory array
     :param kwargs: ``dat_path`` - path of the memmap backing file (used
         only when ``low_memory`` is true); by default
         ``"." + generate_random_string(10) + "_seqs_dict.dat"``
     :return: ``cls(seqs_order=..., seqs_array=..., low_memory=...)`` where
         ``seqs_order`` maps each id to its index in ``seqs_array``
     """
     # Fix the item width once, before allocating. Widening with astype()
     # inside the fill loop recopied the whole array each time, and on a
     # memmap produced an array no longer backed by the .dat file.
     item_len = max([1000] + [len(in_dict[seq_id]) for seq_id in in_dict])
     item_dtype = np.dtype("S%s" % item_len)
     if low_memory:
         dat_path = kwargs.get("dat_path")
         if dat_path is None:
             # Build the default name only when it is actually needed.
             dat_path = "." + generate_random_string(10) + "_seqs_dict.dat"
         seqs_array = np.memmap(dat_path,
                                dtype=item_dtype,
                                mode='w+',
                                shape=len(in_dict))
     else:
         seqs_array = np.zeros(len(in_dict), dtype=item_dtype)
     seqs_order = dict()
     for seq_i, seq_id in enumerate(in_dict):
         seqs_order[seq_id] = seq_i
         seqs_array[seq_i] = in_dict[seq_id]
     return cls(seqs_order=seqs_order,
                seqs_array=seqs_array,
                low_memory=low_memory)
Ejemplo n.º 6
0
    def __init__(self,
                 mult_aln_dict=None,
                 aln_type=None,
                 states_seq=None,
                 aln_name="mult_aln",
                 tmp_dir=None,
                 emboss_inst_dir=None,
                 hmmer_inst_dir=None,
                 kaks_calculator_exec_path=None,
                 config_path=None,
                 logger=None):
        """Store the alignment data and tool locations, then init the parent.

        :param mult_aln_dict: alignment mapping; any falsy value (None or an
            empty container) is replaced by a new empty dict
        :param aln_type: alignment type; when falsy it is obtained from
            ``detect_seqs_type(fasta_dict=self.mult_aln_dict)``
        :param states_seq: stored as given; a new empty list when None
        :param aln_name: stored as given
        :param tmp_dir: working directory name; when None it is built as
            ``generate_random_string(10) + "_mult_aln_tmp"``
        :param emboss_inst_dir: falls back to
            ``conf_constants.emboss_inst_dir`` when None
        :param hmmer_inst_dir: falls back to
            ``conf_constants.hmmer_inst_dir`` when None
        :param kaks_calculator_exec_path: falls back to
            ``conf_constants.kaks_calculator_exec_path`` when None
        :param config_path: forwarded to the parent ``__init__``
        :param logger: stored on the instance unchanged (may be None)
        """
        # Resolve tool locations first; conf_constants is read at call time.
        emboss_inst_dir = (conf_constants.emboss_inst_dir
                           if emboss_inst_dir is None else emboss_inst_dir)
        hmmer_inst_dir = (conf_constants.hmmer_inst_dir
                          if hmmer_inst_dir is None else hmmer_inst_dir)
        kaks_calculator_exec_path = (conf_constants.kaks_calculator_exec_path
                                     if kaks_calculator_exec_path is None
                                     else kaks_calculator_exec_path)

        self.short_to_full_seq_names = dict()
        self.mult_aln_dict = mult_aln_dict if mult_aln_dict else dict()
        self.states_seq = list() if states_seq is None else states_seq
        # Detection runs on the stored dict, so it must be assigned already.
        self.aln_type = (aln_type if aln_type
                         else detect_seqs_type(fasta_dict=self.mult_aln_dict))
        self._distance_matrix = None
        self.aln_name = aln_name
        self.tmp_dir = (generate_random_string(10) + "_mult_aln_tmp"
                        if tmp_dir is None else tmp_dir)
        self.emboss_inst_dir = emboss_inst_dir
        self.hmmer_inst_dir = hmmer_inst_dir
        self.kaks_calculator_exec_path = kaks_calculator_exec_path
        self.logger = logger
        super(MultAln, self).__init__(config_path=config_path)
Ejemplo n.º 7
0
 def get_sample(self, seqs, low_memory='auto', **kwargs):
     """Return a new ``SeqsDict`` holding only the requested sequences.

     :param seqs: sized iterable of sequence ids; each id is looked up
         with ``self[id]``
     :param low_memory: ``"auto"`` follows ``self.low_memory``; otherwise
         a true value puts the new array in an ``np.memmap`` and a false
         value in an in-memory array
     :param kwargs: ``dat_path`` - memmap backing file, read only in
         low-memory mode; defaults to
         ``"." + generate_random_string(10) + "_seqs_dict.dat"``
     :return: ``SeqsDict`` whose ``seqs_order`` maps each id to its
         position in ``seqs``
     """
     if low_memory == "auto":
         low_memory = bool(self.low_memory)

     if not low_memory:
         sample_array = np.zeros(len(seqs), dtype=self.seqs_array.dtype)
     else:
         backing_path = kwargs.get(
             "dat_path",
             "." + generate_random_string(10) + "_seqs_dict.dat")
         # The sample keeps the item dtype of the source array.
         sample_array = np.memmap(backing_path,
                                  dtype=self.seqs_array.dtype,
                                  mode='w+',
                                  shape=len(seqs))

     sample_order = dict()
     for position, seq_id in enumerate(seqs):
         sample_order[seq_id] = position
         sample_array[position] = self[seq_id]
     return SeqsDict(seqs_order=sample_order,
                     seqs_array=sample_array,
                     low_memory=low_memory)
Ejemplo n.º 8
0
 def load_from_file(cls,
                    seqs_path,
                    seqs_format="fasta",
                    low_memory='auto',
                    **kwargs):
     """Build an instance from a sequence file.

     Sequences are stored as fixed-width byte strings in an array that
     grows by ``read_chunk_size`` items at a time and is trimmed to the
     number of records read before it is handed to ``cls``.

     :param seqs_path: path of the file to read
     :param seqs_format: only ``"fasta"`` (case-insensitive) is parsed;
         any other value yields an instance with no sequences
     :param low_memory: ``'auto'`` asks
         ``cls._check_low_memory(seqs_path=...)``; otherwise a true value
         stores the sequences in an ``np.memmap``
     :param kwargs:
         ``read_chunk_size`` - array growth step, default 100 (the
         misspelled key ``read_chink_size`` is still accepted);
         ``exp_seq_len`` - initial item width, default 1000;
         ``restore_stops`` - when true, every ``"X"`` in a sequence is
         replaced by ``"*"``;
         ``dat_path`` - memmap backing file for low-memory mode
     :return: ``cls(seqs_order=..., seqs_array=..., low_memory=...)``
     """
     # The key used to be read only under the misspelled name, so passing
     # read_chunk_size had no effect. Accept both spellings.
     read_chunk_size = kwargs.get("read_chunk_size",
                                  kwargs.get("read_chink_size", 100))
     exp_seq_len = kwargs.get("exp_seq_len", 1000)
     restore_stops = kwargs.get("restore_stops", False)
     if low_memory == 'auto':
         low_memory = cls._check_low_memory(seqs_path=seqs_path)

     def _store(array, index, seq_parts):
         # Write one joined record at `index`, widening the items first if
         # the record does not fit. Returns the (possibly new) array.
         seq = "".join(seq_parts)
         if restore_stops:
             seq = seq.replace("X", "*")
         if len(seq) > array.itemsize:
             wider = np.dtype("S%s" % len(seq))
             if low_memory:
                 # Plain astype() on a memmap gives an array that is no
                 # longer backed by the .dat file.
                 array = np_memmap_astype(dat_path=array.filename,
                                          old_dtype=array.dtype,
                                          new_dtype=wider,
                                          shape=array.shape)
             else:
                 array = array.astype(wider)
         array[index] = seq
         return array

     def _grow(array):
         # Append room for another `read_chunk_size` items.
         if low_memory:
             return np.memmap(array.filename,
                              dtype=array.dtype,
                              mode='r+',
                              shape=array.shape[0] + read_chunk_size,
                              order='C')
         return np.concatenate(
             (array, np.zeros(read_chunk_size, dtype=array.dtype)))

     seq_i = 0
     seqs_order = dict()
     if low_memory:
         dat_path = kwargs.get(
             "dat_path",
             "." + generate_random_string(10) + "_seqs_dict.dat")
         seqs_array = np.memmap(dat_path,
                                dtype=np.dtype("S%s" % exp_seq_len),
                                mode='w+',
                                shape=read_chunk_size)
     else:
         seqs_array = np.zeros(read_chunk_size,
                               dtype=np.dtype("S%s" % exp_seq_len))
     if seqs_format.lower() == "fasta":
         seq_list = list()
         title = None
         # `with` closes the file even if parsing raises.
         with open(seqs_path) as fasta_f:
             for raw_line in fasta_f:
                 line = raw_line.strip()
                 if not line:
                     continue
                 if line[0] != ">":
                     seq_list.append(line)
                     continue
                 # A header line closes the previous record, if any.
                 if title:
                     seqs_order[title] = seq_i
                     seqs_array = _store(seqs_array, seq_i, seq_list)
                     seq_i += 1
                     # Grow only right after the last free slot is used,
                     # so the array is never extended twice in a row.
                     if seq_i % read_chunk_size == 0:
                         seqs_array = _grow(seqs_array)
                 title = line[1:]
                 seq_list = list()
         # The last record has no following header to close it.
         if title:
             seqs_order[title] = seq_i
             seqs_array = _store(seqs_array, seq_i, seq_list)
             seq_i += 1
     if seqs_array.shape[0] > seq_i:
         seqs_array = seqs_array[:seq_i]
     return cls(seqs_order=seqs_order,
                seqs_array=seqs_array,
                low_memory=low_memory)
Ejemplo n.º 9
0
def construct_mult_aln(seq_dict=None,
                       fasta_path=None,
                       method="MUSCLE",
                       aln_type=None,
                       muscle_exec_path=None,
                       mafft_exec_path=None,
                       msaprobs_exec_path=None,
                       emboss_inst_dir=None,
                       hmmer_inst_dir=None,
                       aln_name="mult_aln",
                       tmp_dir=None,
                       remove_tmp=True,
                       num_threads=None,
                       config_path=None,
                       logger=None,
                       **kwargs):
    """Align sequences with an external tool and load the result.

    :param seq_dict: sequences to align; written to
        ``<tmp_dir>/seqs_to_aln.fasta`` (stops replaced by "X") when
        ``fasta_path`` is not given
    :param fasta_path: path of a FASTA file to align; takes precedence
        over ``seq_dict``
    :param method: ``"MUSCLE"``, ``"MAFFT"`` or ``"MSAProbs"``
        (case-insensitive)
    :param aln_type: alignment type; when falsy it is taken from
        ``detect_seqs_type(fasta_path)``
    :param muscle_exec_path: falls back to ``conf_constants`` when None
    :param mafft_exec_path: falls back to ``conf_constants`` when None
    :param msaprobs_exec_path: falls back to ``conf_constants`` when None
    :param emboss_inst_dir: set on the result; ``conf_constants`` fallback
    :param hmmer_inst_dir: set on the result; ``conf_constants`` fallback
    :param aln_name: alignment name; the aligner output is written to
        ``<tmp_dir>/<aln_name>.fasta``
    :param tmp_dir: working directory; when None it is built as
        ``generate_random_string(10) + "_mult_aln_tmp"``
    :param remove_tmp: when true, ``tmp_dir`` is deleted before returning
    :param num_threads: thread count for MAFFT and MSAProbs; falls back to
        ``conf_constants.num_threads`` when None
    :param config_path: forwarded to ``MultAln.load_alignment``
    :param logger: receives progress messages; they are printed when no
        logger is given
    :param kwargs: ``op``/``gap_open_penalty`` and ``ep``/``gap_ext_penalty``
        are used for MAFFT; all kwargs are forwarded to
        ``MultAln.load_alignment``
    :return: the loaded alignment, or ``1`` when no input was supplied
    """
    def _report(message, warn=False):
        # Send a progress message to the logger, or print it without one.
        if logger:
            if warn:
                logger.warning(message)
            else:
                logger.info(message)
        else:
            print(message)

    def _run(tool_name, cmd):
        # NOTE(review): the command is a concatenated string run through
        # the shell, so paths with spaces or shell metacharacters break.
        # Kept as-is because the *_exec_path values may carry arguments.
        _report(tool_name + " is starting")
        subprocess.call(cmd, shell=True)
        _report(tool_name + " finished")

    if muscle_exec_path is None:
        muscle_exec_path = conf_constants.muscle_exec_path
    if mafft_exec_path is None:
        mafft_exec_path = conf_constants.mafft_exec_path
    if msaprobs_exec_path is None:
        msaprobs_exec_path = conf_constants.msaprobs_exec_path
    if emboss_inst_dir is None:
        emboss_inst_dir = conf_constants.emboss_inst_dir
    if hmmer_inst_dir is None:
        hmmer_inst_dir = conf_constants.hmmer_inst_dir
    if tmp_dir is None:
        tmp_dir = generate_random_string(10) + "_mult_aln_tmp"

    if not fasta_path and seq_dict:
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)
        fasta_path = os.path.join(tmp_dir, "seqs_to_aln.fasta")
        dump_fasta_dict(fasta_dict=seq_dict, fasta_path=fasta_path, replace_stops="X")  # maybe it is not good (* > X)
    if not fasta_path:
        # Was a Python 2 print statement, a SyntaxError on Python 3.
        _report("No sequences input", warn=True)
        return 1
    # The aligner writes into tmp_dir and it may be removed at the end, so
    # it must exist even when the caller supplied fasta_path directly.
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    if not aln_type:
        # The detected type used to be thrown away.
        aln_type = detect_seqs_type(fasta_path)
    out_fasta_path = os.path.join(tmp_dir, aln_name+".fasta")

    method_key = method.lower()
    if method_key in ("mafft", "msaprobs") and num_threads is None:
        num_threads = conf_constants.num_threads

    if method_key == "muscle":
        _run("MUSCLE",
             muscle_exec_path + " -in " + fasta_path + " -out " + out_fasta_path)
    elif method_key == "mafft":
        gap_open = kwargs.get("op", kwargs.get("gap_open_penalty", 1.53))
        gap_ext = kwargs.get("ep", kwargs.get("gap_ext_penalty", 0.123))
        _run("MAFFT",
             mafft_exec_path + " --auto --op " + str(gap_open) +
             " --ep " + str(gap_ext) + " --thread " + str(num_threads) +
             " " + fasta_path + " > " + out_fasta_path)
    elif method_key == "msaprobs":
        _run("MSAProbs",
             msaprobs_exec_path + " -num_threads " + str(num_threads) +
             " -v " + fasta_path + " > " + out_fasta_path)

    mult_aln = MultAln.load_alignment(aln_fasta_path=out_fasta_path,
                                      aln_type=aln_type,
                                      aln_name=aln_name,
                                      config_path=config_path,
                                      logger=logger,
                                      restore_stops=True,
                                      **kwargs)
    mult_aln.emboss_inst_dir = emboss_inst_dir
    mult_aln.hmmer_inst_dir = hmmer_inst_dir
    mult_aln.tmp_dir = tmp_dir
    if remove_tmp:
        shutil.rmtree(tmp_dir)
    return mult_aln