Example #1
0
 def generate_backbone(self):
     '''
     Sample a random backbone from the input sequences and align it with
     SATe.

     Reads ``self.options.sequence_file``, draws ``options().backbone_size``
     sequences at random (default: 20% of the taxa, at most 100), writes the
     backbone and the remaining (query) sequences to temporary FASTA files
     and runs a ``SateAlignJob`` on the backbone file.

     Afterwards ``options()`` is updated in place: ``placement_size``,
     ``alignment_file`` / ``tree_file`` (open handles on ``sate.fasta`` and
     ``sate.fasttree`` inside the output directory) and ``fragment_file``
     (path of the query file).

     Raises:
         ValueError: from ``random.sample`` when the requested backbone size
             exceeds the number of input sequences.
     '''
     _LOG.info("Reading input sequences: %s" %(self.options.sequence_file))
     sequences = MutableAlignment()
     sequences.read_file_object(self.options.sequence_file)
     if options().backbone_size is None:
         options().backbone_size = min(100, int(.20 * sequences.get_num_taxa()))
         _LOG.info("Backbone size set to: %d" %(options().backbone_size))

     # random.sample() requires a real sequence; a keys() view is rejected
     # on Python >= 3.11.  Sorting also makes the draw reproducible for a
     # fixed random seed.
     backbone_names = random.sample(sorted(sequences.keys()),
                                    options().backbone_size)
     backbone_sequences = sequences.get_hard_sub_alignment(backbone_names)
     # Whatever went into the backbone is no longer a query sequence.
     for name in list(backbone_sequences.keys()):
         sequences.pop(name)

     _LOG.info("Writing query and backbone set. ")
     query = get_temp_file("query", "backbone", ".fas")
     backbone = get_temp_file("backbone", "backbone", ".fas")
     _write_fasta(sequences, query)
     _write_fasta(backbone_sequences, backbone)

     _LOG.info("Generating sate backbone alignment and tree. ")
     satealignJob = SateAlignJob()
     moleculeType = options().molecule
     if moleculeType == 'amino':
         # The align job is given 'protein' instead of 'amino'.
         moleculeType = 'protein'
     satealignJob.setup(backbone, options().backbone_size,
                        self.options.outdir, moleculeType, options().cpu)
     satealignJob.run()
     satealignJob.read_results()

     options().placement_size = self.options.backbone_size
     # These handles are stored on the options object and deliberately left
     # open here.
     options().alignment_file = open(self.options.outdir + "/sate.fasta")
     options().tree_file = open(self.options.outdir + "/sate.fasttree")
     _LOG.info("Backbone alignment written to %s.\nBackbone tree written to %s" % (options().alignment_file, options().tree_file))
     options().fragment_file = query
Example #2
0
 def generate_backbone(self):
     '''
     Split the input into fragments and full-length sequences, sample a
     backbone from the full-length ones and align it with PASTA.

     When ``options().median_full_length`` is set, every sequence whose
     length lies outside ``median * (1 +/- options().backbone_threshold)``
     is moved into a separate fragment alignment.  A value of ``-1`` means
     the median is computed from the observed sequence lengths.

     The backbone size defaults to ``min(1000, number of remaining taxa)``
     and is clamped to the number of remaining sequences.  The backbone is
     written to a temporary FASTA file and handed to a ``PastaAlignJob``.

     Afterwards ``options()`` is updated in place (``placement_size``,
     ``alignment_file``, ``tree_file``).  If no query sequences are left,
     the backbone alignment is copied to the final output name and the
     process exits with status 0; otherwise the queries are written to a
     temporary file that becomes ``options().fragment_file``.
     '''
     _LOG.info("Reading input sequences: %s" %(self.options.sequence_file))
     sequences = MutableAlignment()
     sequences.read_file_object(self.options.sequence_file)
     fragments = MutableAlignment()
     if options().median_full_length is not None:
         if options().median_full_length == -1:
             seq_lengths = sorted(len(seq) for seq in sequences.values())
             lengths = len(seq_lengths)
             # Integer division so the index is valid on Python 3 as well.
             mid = lengths // 2
             if lengths % 2:
                 # Odd count: the median is the single middle element.
                 options().median_full_length = seq_lengths[mid]
             else:
                 # Even count: average the two middle elements.
                 options().median_full_length = (
                     seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0

         min_length = int(options().median_full_length *
                          (1 - options().backbone_threshold))
         max_length = int(options().median_full_length *
                          (1 + options().backbone_threshold))
         frag_names = [name for name in sequences
                       if len(sequences[name]) > max_length
                       or len(sequences[name]) < min_length]
         if frag_names:
             fragments = sequences.get_hard_sub_alignment(frag_names)
             for name in list(fragments.keys()):
                 sequences.pop(name)

     if options().backbone_size is None:
         options().backbone_size = min(1000, int(sequences.get_num_taxa()))
         _LOG.info("Backbone size set to: %d" %(options().backbone_size))
     if options().backbone_size > len(sequences.keys()):
         options().backbone_size = len(sequences.keys())

     # random.sample() requires a real sequence; a keys() view is rejected
     # on Python >= 3.11.  Sorting also makes the draw reproducible for a
     # fixed random seed.
     backbone_names = random.sample(sorted(sequences.keys()),
                                    options().backbone_size)
     backbone_sequences = sequences.get_hard_sub_alignment(backbone_names)
     for name in list(backbone_sequences.keys()):
         sequences.pop(name)

     _LOG.info("Writing backbone set. ")
     backbone = get_temp_file("backbone", "backbone", ".fas")
     _write_fasta(backbone_sequences, backbone)

     _LOG.info("Generating pasta backbone alignment and tree. ")
     pastaalignJob = PastaAlignJob()
     moleculeType = options().molecule
     if moleculeType == 'amino':
         # The align job is given 'protein' instead of 'amino'.
         moleculeType = 'protein'
     pastaalignJob.setup(backbone, options().backbone_size,
                         self.options.outdir, moleculeType, options().cpu)
     pastaalignJob.run()
     pastaalignJob.read_results()

     options().placement_size = self.options.backbone_size
     # These handles are stored on the options object and deliberately left
     # open here.
     options().alignment_file = open(self.options.outdir + "/pasta.fasta")
     options().tree_file = open(self.options.outdir + "/pasta.fasttree")
     _LOG.info("Backbone alignment written to %s.\nBackbone tree written to %s" % (options().alignment_file, options().tree_file))

     # NOTE(review): presumably this adds the fragments to the leftover
     # full-length sequences so both become queries -- confirm against
     # MutableAlignment.set_alignment.
     sequences.set_alignment(fragments)
     if len(sequences) == 0:
         _LOG.info("No query sequences to align.  Final alignment saved as %s" % self.get_output_filename("alignment.fasta"))
         shutil.copyfile(self.options.outdir + "/pasta.fasta", self.get_output_filename("alignment.fasta"))
         sys.exit(0)
     else:
         query = get_temp_file("query", "backbone", ".fas")
         options().fragment_file = query
         _write_fasta(sequences, query)
Example #3
0
    def read_and_divide_fragments(self, chunks, extra_frags={}):
        '''
        Load the fragment file into the root problem and split it into
        chunk files.

        Args:
            chunks: chunk count passed to ``divide_to_equal_chunks``.
            extra_frags: mapping of name -> sequence added to the fragments
                after reading; ``-`` characters are removed from each
                sequence first.

        Returns:
            A list with one entry per chunk: the path of the temporary
            FASTA file written for it, or ``None`` for a falsy chunk.

        Raises:
            ValueError: if a fragment name also occurs in the root
                problem's subalignment.
        '''
        chunk_cap = self.options.max_chunk_size
        _LOG.debug(
            "start reading fragment files and breaking to at least %s chunks but at most %s sequences "
            % (chunks, chunk_cap))
        root = self.root_problem
        root.fragments = MutableAlignment()
        root.fragments.read_file_object(self.options.fragment_file)

        # Fragment names must not collide with reference names.
        # (check contributed by Stefan Janssen, June 13th, 2018)
        reference_ids = set(root.subalignment.keys())
        fragment_ids = set(root.fragments.keys())
        shared_ids = reference_ids & fragment_ids
        if shared_ids:
            message = (
                "Your input fragment file contains %i sequences, whose names "
                "overlap with names in your reference. Please rename your inp"
                "ut fragments and re-start. Duplicate names are:\n  '%s'")
            raise ValueError(
                message % (len(shared_ids), "'\n  '".join(shared_ids)))

        for name, seq in extra_frags.items():
            root.fragments[name] = seq.replace("-", "")

        pieces = root.fragments.divide_to_equal_chunks(chunks, chunk_cap)
        chunk_paths = []
        for index, piece in enumerate(pieces):
            if piece:
                path = get_temp_file("fragment_chunk_%d" % index,
                                     "fragment_chunks", ".fasta")
                piece.write_to_path(path)
            else:
                # Falsy chunks get no file, only a placeholder entry.
                path = None
            chunk_paths.append(path)
        _LOG.debug("fragment files read and divided.")
        return chunk_paths
Example #4
0
    def read_and_divide_fragments(self, chunks, extra_frags={}):
        '''
        Load the fragment file into the root problem, validate the fragment
        names and split the fragments into chunk files.

        Args:
            chunks: chunk count passed to ``divide_to_equal_chunks``.
            extra_frags: mapping of name -> sequence added to the fragments
                after validation; ``-`` characters are removed from each
                sequence first.

        Returns:
            A list with one entry per chunk: the path of the temporary
            FASTA file written for it, or ``None`` for a falsy chunk.

        Raises:
            ValueError: if fragment names overlap with the root problem's
                subalignment and ``self.options.ignore_overlap`` is not
                set, or if any name read from the fragment file contains
                a space or tab.
        '''
        chunk_cap = self.options.max_chunk_size
        _LOG.debug(
            ("start reading fragment files and breaking to at least %s chunks"
             " but at most %s sequences ") % (chunks, chunk_cap))
        root = self.root_problem
        root.fragments = MutableAlignment()
        root.fragments.read_file_object(self.options.fragment_file)

        # Fragment names must not collide with reference names.
        # (check contributed by Stefan Janssen, June 13th, 2018)
        reference_ids = set(root.subalignment.keys())
        fragment_ids = set(root.fragments.keys())
        shared_ids = reference_ids & fragment_ids
        if shared_ids:
            if not self.options.ignore_overlap:
                message = (
                    "Your input fragment file contains %i sequences, whose names "
                    "overlap with names in your reference. Please rename your inp"
                    "ut fragments and re-start. Duplicate names are:\n  '%s'")
                raise ValueError(
                    message % (len(shared_ids), "'\n  '".join(shared_ids)))
            # Overlap is tolerated: keep only fragments absent from the
            # reference.
            _LOG.debug("Ignoring following %i query sequences present "
                       "in the backbone: \n '%s'" %
                       (len(shared_ids), "' , '".join(shared_ids)))
            root.fragments = root.fragments.get_soft_sub_alignment(
                fragment_ids - reference_ids)

        # Names with spaces or tabs would make hmmsearch fail.
        # (check contributed by Stefan Janssen, June 22nd, 2018)
        # This looks at every name read from the file, including any that
        # were dropped as overlaps just above.
        bad_ids = [
            frag_id for frag_id in fragment_ids
            if any(blank in frag_id for blank in (' ', '\t'))
        ]
        if bad_ids:
            message = (
                "Your input fragment file contains %i sequences, whose names "
                "contain either whitespaces: ' ' or tabulator '\\t' symbols. "
                "Please rename your input fragments and re-start. Affected "
                "names are:\n  '%s'")
            raise ValueError(message % (len(bad_ids), "'\n  '".join(bad_ids)))

        for name, seq in extra_frags.items():
            root.fragments[name] = seq.replace("-", "")

        pieces = root.fragments.divide_to_equal_chunks(chunks, chunk_cap)
        chunk_paths = []
        for index, piece in enumerate(pieces):
            if piece:
                path = get_temp_file("fragment_chunk_%d" % index,
                                     "fragment_chunks", ".fasta")
                piece.write_to_path(path)
            else:
                # Falsy chunks get no file, only a placeholder entry.
                path = None
            chunk_paths.append(path)
        _LOG.debug("fragment files read and divided.")
        return chunk_paths
Example #5
0
    def read_and_divide_fragments(self, chunks, extra_frags={}):
        '''
        Load the fragment file into the root problem, validate the fragment
        names and split the fragments into chunk files.

        Args:
            chunks: chunk count passed to ``divide_to_equal_chunks``.
            extra_frags: mapping of name -> sequence added to the fragments
                after validation; ``-`` characters are removed from each
                sequence first.

        Returns:
            A list with one entry per chunk: the path of the temporary
            FASTA file written for it, or ``None`` for a falsy chunk.

        Raises:
            ValueError: if a fragment name also occurs in the root
                problem's subalignment, or contains a space or tab.
        '''
        chunk_cap = self.options.max_chunk_size
        _LOG.debug(
            ("start reading fragment files and breaking to at least %s chunks"
             " but at most %s sequences ") % (chunks, chunk_cap))
        root = self.root_problem
        root.fragments = MutableAlignment()
        root.fragments.read_file_object(self.options.fragment_file)

        # Fragment names must not collide with reference names.
        # (check contributed by Stefan Janssen, June 13th, 2018)
        reference_ids = set(root.subalignment.keys())
        fragment_ids = set(root.fragments.keys())
        shared_ids = reference_ids & fragment_ids
        if shared_ids:
            message = (
                "Your input fragment file contains %i sequences, whose names "
                "overlap with names in your reference. Please rename your inp"
                "ut fragments and re-start. Duplicate names are:\n  '%s'")
            raise ValueError(
                message % (len(shared_ids), "'\n  '".join(shared_ids)))

        # Names with spaces or tabs would make hmmsearch fail.
        # (check contributed by Stefan Janssen, June 22nd, 2018)
        bad_ids = [
            frag_id for frag_id in fragment_ids
            if any(blank in frag_id for blank in (' ', '\t'))
        ]
        if bad_ids:
            message = (
                "Your input fragment file contains %i sequences, whose names "
                "contain either whitespaces: ' ' or tabulator '\\t' symbols. "
                "Please rename your input fragments and re-start. Affected "
                "names are:\n  '%s'")
            raise ValueError(message % (len(bad_ids), "'\n  '".join(bad_ids)))

        for name, seq in extra_frags.items():
            root.fragments[name] = seq.replace("-", "")

        pieces = root.fragments.divide_to_equal_chunks(chunks, chunk_cap)
        chunk_paths = []
        for index, piece in enumerate(pieces):
            if piece:
                path = get_temp_file(
                    "fragment_chunk_%d" % index, "fragment_chunks", ".fasta")
                piece.write_to_path(path)
            else:
                # Falsy chunks get no file, only a placeholder entry.
                path = None
            chunk_paths.append(path)
        _LOG.debug("fragment files read and divided.")
        return chunk_paths
Example #6
0
    def generate_backbone(self):
        '''
        Sample a random backbone from the input sequences and align it with
        SATe.

        Reads ``self.options.sequence_file``, draws
        ``options().backbone_size`` sequences at random (default: 20% of the
        taxa, at most 100), writes the backbone and the remaining (query)
        sequences to temporary FASTA files and runs a ``SateAlignJob`` on
        the backbone file.

        Afterwards ``options()`` is updated in place: ``placement_size``,
        ``alignment_file`` / ``tree_file`` (open handles on ``sate.fasta``
        and ``sate.fasttree`` inside the output directory) and
        ``fragment_file`` (path of the query file).

        Raises:
            ValueError: from ``random.sample`` when the requested backbone
                size exceeds the number of input sequences.
        '''
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        if options().backbone_size is None:
            options().backbone_size = min(100,
                                          int(.20 * sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))

        # random.sample() requires a real sequence; a keys() view is
        # rejected on Python >= 3.11.  Sorting also makes the draw
        # reproducible for a fixed random seed.
        backbone_names = random.sample(sorted(sequences.keys()),
                                       options().backbone_size)
        backbone_sequences = sequences.get_hard_sub_alignment(backbone_names)
        # Whatever went into the backbone is no longer a query sequence.
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing query and backbone set. ")
        query = get_temp_file("query", "backbone", ".fas")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(sequences, query)
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating sate backbone alignment and tree. ")
        satealignJob = SateAlignJob()
        moleculeType = options().molecule
        if moleculeType == 'amino':
            # The align job is given 'protein' instead of 'amino'.
            moleculeType = 'protein'
        satealignJob.setup(backbone,
                           options().backbone_size, self.options.outdir,
                           moleculeType,
                           options().cpu)
        satealignJob.run()
        satealignJob.read_results()

        options().placement_size = self.options.backbone_size
        # These handles are stored on the options object and deliberately
        # left open here.
        options().alignment_file = open(self.options.outdir + "/sate.fasta")
        options().tree_file = open(self.options.outdir + "/sate.fasttree")
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s" %
            (options().alignment_file, options().tree_file))
        options().fragment_file = query
Example #7
0
 def read_and_divide_fragments(self, chunks, extra_frags = {}):
     '''
     Load the fragment file into the root problem and split it into
     ``chunks`` temporary FASTA files.

     Args:
         chunks: number of chunks to create (an integer; it is formatted
             with ``%d`` and used as the loop bound).
         extra_frags: mapping of name -> sequence added to the fragments
             after reading; ``-`` characters are removed from each
             sequence first.

     Returns:
         A list with the paths of the ``chunks`` files written.
     '''
     _LOG.debug("start reading fragment files and breaking to chunks: %d" % chunks)
     self.root_problem.fragments = MutableAlignment()
     self.root_problem.fragments.read_file_object(self.options.fragment_file)
     # items()/range() instead of iteritems()/xrange(): same behaviour on
     # Python 2, and the method no longer breaks on Python 3.
     for (k, v) in extra_frags.items():
         self.root_problem.fragments[k] = v.replace("-", "")
     alg_chunks = self.root_problem.fragments.divide_to_equal_chunks(chunks)
     ret = []
     for i in range(chunks):
         temp_file = get_temp_file("fragment_chunk_%d" % i, "fragment_chunks", ".fasta")
         alg_chunks[i].write_to_path(temp_file)
         ret.append(temp_file)
     _LOG.debug("fragment files read and divided.")
     return ret
Example #8
0
 def read_and_divide_fragments(self, chunks, extra_frags={}):
     '''
     Load the fragment file into the root problem and split it into
     ``chunks`` temporary FASTA files.

     Args:
         chunks: number of chunks to create (an integer; it is formatted
             with ``%d`` and used as the loop bound).
         extra_frags: mapping of name -> sequence added to the fragments
             after reading; ``-`` characters are removed from each
             sequence first.

     Returns:
         A list with the paths of the ``chunks`` files written.
     '''
     _LOG.debug("start reading fragment files and breaking to chunks: %d" %
                chunks)
     self.root_problem.fragments = MutableAlignment()
     self.root_problem.fragments.read_file_object(
         self.options.fragment_file)
     # items()/range() instead of iteritems()/xrange(): same behaviour on
     # Python 2, and the method no longer breaks on Python 3.
     for (k, v) in extra_frags.items():
         self.root_problem.fragments[k] = v.replace("-", "")
     alg_chunks = self.root_problem.fragments.divide_to_equal_chunks(chunks)
     ret = []
     for i in range(chunks):
         temp_file = get_temp_file("fragment_chunk_%d" % i,
                                   "fragment_chunks", ".fasta")
         alg_chunks[i].write_to_path(temp_file)
         ret.append(temp_file)
     _LOG.debug("fragment files read and divided.")
     return ret
Example #9
0
def save_checkpoint(checkpoint_manager):
    '''
    Callback that is invoked periodically to save the current state of the
    system.

    Does nothing when ``checkpoint_manager.is_checkpointing`` is false (in
    particular, no further timer is scheduled).  Otherwise it:

    1. pickles ``checkpoint_manager.checkpoint_state`` (protocol 2) into a
       new gzip-compressed temporary file,
    2. appends "<dump path>, <timestamp>" to the file at
       ``checkpoint_manager.checkpoint_path``,
    3. removes the dump named on the previous last line of that file, and
    4. schedules itself again after ``options().checkpoint_interval`` via a
       daemon ``threading.Timer`` stored in ``checkpoint_manager.timer``.

    ``checkpoint_manager.saving`` is set to True while the dump is being
    written and back to False once it completed.
    '''
    # NOTE: this function is not bullet proof in terms of race conditions.
    # Nothing here is synchronised: the manager's lock is not taken, so the
    # checkpoint file and the dumps can be observed mid-update.
    if not checkpoint_manager.is_checkpointing:
        return

    checkpoint_manager.saving = True
    newTmpDest = get_temp_file("dump", "checkpoints")
    _LOG.info("Checkpoint is being updated: %s" % newTmpDest)

    # The last line of the checkpoint file names the dump to be replaced.
    with open(checkpoint_manager.checkpoint_path) as index_file:
        index_lines = index_file.readlines()
    oldTmpFile = index_lines[-1].split(",")[0] if index_lines else None

    checkpoint_manager.update_time()

    # The recursion limit is raised only for the duration of the dump
    # (presumably because the state can be deeply nested) and is restored
    # even if pickling fails.
    currenlimit = sys.getrecursionlimit()
    sys.setrecursionlimit(100000)
    try:
        with gzip.GzipFile(newTmpDest, 'wb') as picklefile:
            pickle.dump(checkpoint_manager.checkpoint_state, picklefile, 2)
    finally:
        sys.setrecursionlimit(currenlimit)

    with open(checkpoint_manager.checkpoint_path, "a") as index_file:
        index_file.write("%s, %s\n" % (newTmpDest, datetime.datetime.now()))
    # The old dump is deleted only after the new one has been recorded.
    if oldTmpFile is not None:
        os.remove(oldTmpFile)
    _LOG.info("Checkpoint Saved to: %s and linked in %s." %
              (newTmpDest, checkpoint_manager.checkpoint_path))
    checkpoint_manager.saving = False

    checkpoint_manager.timer = threading.Timer(
        options().checkpoint_interval,
        save_checkpoint,
        args=[checkpoint_manager])
    # The `daemon` attribute replaces the deprecated setDaemon().
    checkpoint_manager.timer.daemon = True
    checkpoint_manager.timer.start()
Example #10
0
def save_checkpoint(checkpoint_manager):
    '''
    Callback that is invoked periodically to save the current state of the
    system.

    Does nothing when ``checkpoint_manager.is_checkpointing`` is false (in
    particular, no further timer is scheduled).  Otherwise it:

    1. pickles ``checkpoint_manager.checkpoint_state`` (protocol 2) into a
       new gzip-compressed temporary file,
    2. appends "<dump path>, <timestamp>" to the file at
       ``checkpoint_manager.checkpoint_path``,
    3. removes the dump named on the previous last line of that file, and
    4. schedules itself again after ``options().checkpoint_interval`` via a
       daemon ``threading.Timer`` stored in ``checkpoint_manager.timer``.

    ``checkpoint_manager.saving`` is set to True while the dump is being
    written and back to False once it completed.
    '''
    # NOTE: this function is not bullet proof in terms of race conditions.
    # Nothing here is synchronised: the manager's lock is not taken, so the
    # checkpoint file and the dumps can be observed mid-update.
    if not checkpoint_manager.is_checkpointing:
        return

    checkpoint_manager.saving = True
    newTmpDest = get_temp_file("dump", "checkpoints")
    _LOG.info("Checkpoint is being updated: %s" % newTmpDest)

    # The last line of the checkpoint file names the dump to be replaced.
    with open(checkpoint_manager.checkpoint_path) as index_file:
        index_lines = index_file.readlines()
    oldTmpFile = index_lines[-1].split(",")[0] if index_lines else None

    checkpoint_manager.update_time()

    # The recursion limit is raised only for the duration of the dump
    # (presumably because the state can be deeply nested) and is restored
    # even if pickling fails.
    currenlimit = sys.getrecursionlimit()
    sys.setrecursionlimit(100000)
    try:
        with gzip.GzipFile(newTmpDest, 'wb') as picklefile:
            pickle.dump(checkpoint_manager.checkpoint_state, picklefile, 2)
    finally:
        sys.setrecursionlimit(currenlimit)

    with open(checkpoint_manager.checkpoint_path, "a") as index_file:
        index_file.write("%s, %s\n" % (newTmpDest, datetime.datetime.now()))
    # The old dump is deleted only after the new one has been recorded.
    if oldTmpFile is not None:
        os.remove(oldTmpFile)
    _LOG.info("Checkpoint Saved to: %s and linked in %s." % (
        newTmpDest, checkpoint_manager.checkpoint_path))
    checkpoint_manager.saving = False

    checkpoint_manager.timer = threading.Timer(
        options().checkpoint_interval, save_checkpoint,
        args=[checkpoint_manager])
    # The `daemon` attribute replaces the deprecated setDaemon().
    checkpoint_manager.timer.daemon = True
    checkpoint_manager.timer.start()
Example #11
0
    def generate_backbone(self):
        '''
        Split the input into fragments and full-length sequences, sample a
        backbone from the full-length ones and align it with PASTA.

        The input sequences are degapped first.  When
        ``options().median_full_length`` or ``options().full_length_range``
        is set, every sequence whose length lies outside the full-length
        window is moved into a separate fragment alignment.  The window is
        taken from ``full_length_range`` (two whitespace-separated
        integers) when given, otherwise it is
        ``median * (1 +/- options().backbone_threshold)``.  A
        ``median_full_length`` of ``-1`` means the median is computed from
        the observed sequence lengths.

        The backbone size defaults to ``min(1000, number of remaining
        taxa)`` and is clamped to the number of remaining sequences.  The
        backbone is written to a temporary FASTA file and handed to a
        ``PastaAlignJob``; the resulting alignment and tree are copied to
        the ``pasta.fasta`` / ``pasta.fasttree`` output files.

        Afterwards ``options()`` is updated in place (``placement_size``,
        ``alignment_file``, ``tree_file``).  If no query sequences are
        left, the backbone alignment is written out as the final result and
        the process exits with status 0; otherwise the queries are written
        to a temporary file that becomes ``options().fragment_file``.
        '''
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None
                or options().full_length_range is not None):
            if options().median_full_length == -1:
                seq_lengths = sorted(len(seq) for seq in sequences.values())
                lengths = len(seq_lengths)
                mid = lengths // 2
                if lengths % 2:
                    # Odd count: the median is the single middle element.
                    options().median_full_length = seq_lengths[mid]
                else:
                    # Even count: average the two middle elements.
                    options().median_full_length = (
                        seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0
            if options().full_length_range is not None:
                # An explicit range takes precedence over the median window.
                L = sorted(int(x) for x in options().full_length_range.split())
                min_length = L[0]
                max_length = L[1]
            else:
                min_length = int(options().median_full_length *
                                 (1 - options().backbone_threshold))
                max_length = int(options().median_full_length *
                                 (1 + options().backbone_threshold))
            _LOG.info(
                "Full length sequences are set to be from %d to %d character long"
                % (min_length, max_length))
            frag_names = [
                name for name in sequences if len(sequences[name]) > max_length
                or len(sequences[name]) < min_length
            ]
            if frag_names:
                _LOG.info("Detected %d fragmentary sequences" %
                          len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                for name in list(fragments.keys()):
                    sequences.pop(name)
        if options().backbone_size is None:
            options().backbone_size = min(1000, int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        if options().backbone_size > len(list(sequences.keys())):
            options().backbone_size = len(list(sequences.keys()))
        # Sampling from a sorted list keeps the draw reproducible for a
        # fixed random seed.
        sample = sorted(
            random.sample(sorted(sequences.keys()),
                          options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" % (sorted(backbone_sequences.keys())))
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if moleculeType == 'amino':
            # The align job is given 'protein' instead of 'amino'.
            moleculeType = 'protein'
        pastaalignJob.setup(backbone,
                            options().backbone_size, moleculeType,
                            options().cpu, **vars(options().pasta))
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()

        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

        options().placement_size = self.options.backbone_size
        # These handles are stored on the options object and deliberately
        # left open here.
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s" %
            (options().alignment_file, options().tree_file))
        # NOTE(review): presumably this adds the fragments to the leftover
        # full-length sequences so both become queries -- confirm against
        # MutableAlignment.set_alignment.
        sequences.set_alignment(fragments)
        if len(sequences) == 0:
            # Nothing to place: the backbone alignment is the final result.
            sequences = MutableAlignment()
            sequences.read_file_object(open(self.options.alignment_file.name))
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align.  Final alignment saved as %s" %
                self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)