Beispiel #1
0
    def align(self, input_):
        """Align sequences with clustalw and return the parsed alignment.

        The input is passed through ``self._fix_input``, written to a
        temporary fasta file and aligned by the clustalw executable (protein
        mode, BLOSUM pairwise matrix, fasta output). The output file is
        parsed, passed through ``self._fix_output`` and wrapped in an
        ``Alignment``. Both temporary files are removed afterwards.

        :param input_: the sequences to align, in the form accepted by
            ``self._fix_input``
        :returns: an ``Alignment`` built from the clustalw output
        :raises InitError: if the clustalw executable is not set
        :raises RuntimeError: if clustalw exits with a non-zero status
        """
        if self.clustalw_exe is None:
            raise InitError("clustalw executable is not set")

        input_ = self._fix_input(input_)

        input_path = None
        output_path = None
        try:
            # mkstemp creates the file atomically; mktemp only returns a name
            # that another process could claim before it is used.
            input_fd, input_path = tempfile.mkstemp()
            os.close(input_fd)
            output_fd, output_path = tempfile.mkstemp()
            os.close(output_fd)

            write_fasta(input_path, input_)

            cmd = [self.clustalw_exe, '-TYPE=PROTEIN', '-OUTPUT=FASTA',
                   '-PWMATRIX=BLOSUM', '-OUTFILE=%s' % output_path, '-INFILE=%s' % input_path]

            p = subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            # communicate() drains both pipes while waiting. wait() can block
            # forever once the child fills a pipe buffer that nobody reads.
            _, stderr_data = p.communicate()

            if p.returncode != 0:
                # 'replace' keeps a non-ascii byte in stderr from masking the
                # real failure with a UnicodeDecodeError.
                raise RuntimeError("%s for %s" % (stderr_data.decode('ascii', 'replace'),
                                                  str(input_)))

            return Alignment(self._fix_output(parse_fasta(output_path)))
        finally:
            for path in [input_path, output_path]:
                if path is not None and os.path.isfile(path):
                    os.remove(path)
Beispiel #2
0
    def get_domain_ranges(self, sequence):
        """Submit a sequence to interpro and return the parsed domain ranges.

        A job is submitted and its status is polled every
        ``self.poll_interval`` seconds until it leaves the in-progress states
        or ``self.job_timout`` seconds have passed. A job reported as
        'NOT_FOUND' is submitted again.

        :param sequence: the sequence to submit
        :returns: the result of ``self._parse_interpro_ranges`` on the job's
            result
        :raises InitError: if the interpro url is not set
        :raises ServiceError: if the job timed out, failed or ended in an
            unexpected status
        """
        if self.url is None:
            raise InitError("interpro url is not set")

        in_progress = ('RUNNING', 'PENDING', 'STARTED')

        job_id = self._interpro_submit(sequence)

        # Stays None when the timeout leaves no room for even a single poll.
        status = None

        t0 = time()
        while (time() - t0) < self.job_timout:

            status = self._interpro_status(job_id)

            if status in in_progress:
                sleep(self.poll_interval)
            elif status == 'NOT_FOUND':
                job_id = self._interpro_submit(sequence)
            else:
                break

        # Any in-progress status after the loop means the time ran out,
        # not only 'RUNNING'.
        if status is None or status in in_progress:
            raise ServiceError("inteproscan job timed out")
        elif status in ['FAILURE', 'ERROR']:
            raise ServiceError(self._interpro_error(job_id))
        elif status != 'FINISHED':
            raise ServiceError("inteproscan job status = " + status)

        xml_str = self._interpro_result(job_id)

        return self._parse_interpro_ranges(xml_str)
Beispiel #3
0
    def _get_hits(self, range_, template_id):
        """Blast the range's sub-sequence and return the acceptable hits.

        Every alignment is shifted by the range's start and given the full
        query sequence before it is filtered. An alignment is dropped when:

        * a template id was requested and the hit is a different template;
        * no template id was requested and the hit's accession code is
          blacklisted;
        * dssp has no secondary structure for the hit's template;
        * its percentage identity is below ``get_min_identity`` for its
          number of aligned residues.

        :param range_: the range whose sub-sequence is blasted
        :param template_id: the only template to accept, or None for any
        :returns: list of accepted alignments
        :raises InitError: if the blast databank is not set
        """
        if self.template_blast_databank is None:
            raise InitError("blast databank is not set")

        hits_by_id = blaster.blastp(range_.get_sub_sequence(),
                                    self.template_blast_databank)
        _log.debug("{} blast hits to filter".format(len(hits_by_id)))

        any_template = template_id is None

        accepted = []
        for key in hits_by_id:
            for hit in hits_by_id[key]:
                # The blast ran on a sub-sequence, so the hit's numbering
                # must be moved to the full sequence.
                hit.query_shift_right(range_.start)
                hit.full_query_sequence = range_.sequence

                candidate_id = TemplateID(hit.get_hit_accession_code(),
                                          hit.get_hit_chain_id())

                if any_template:
                    if blacklister.is_blacklisted(hit.get_hit_accession_code()):
                        continue
                elif candidate_id != template_id:
                    continue

                if not dssp.has_secondary_structure(candidate_id):
                    continue

                identity = hit.get_percentage_identity()
                required_identity = get_min_identity(hit.count_aligned_residues())
                if identity >= required_identity:
                    accepted.append(hit)

        return accepted
Beispiel #4
0
    def blastp(self, sequence, databank):
        """Run blastp for a sequence against a databank and parse the hits.

        The sequence is written to a temporary fasta file under the id
        'target', blastp is run with xml output (``-outfmt 5``) and the
        output is handed to ``self._parse_alignments``. Both temporary files
        are removed afterwards.

        :param sequence: the query sequence
        :param databank: the value passed to blastp's ``-db`` option
        :returns: the result of ``self._parse_alignments``
        :raises InitError: if the blastp executable is not set
        :raises RuntimeError: if blastp exits with a non-zero status
        """
        if self.blastp_exe is None:
            raise InitError("blastp executable is not set")

        input_path = None
        output_path = None
        try:
            # mkstemp creates the file atomically; mktemp only returns a name
            # that another process could claim before it is used.
            input_fd, input_path = tempfile.mkstemp()
            os.close(input_fd)
            output_fd, output_path = tempfile.mkstemp()
            os.close(output_fd)

            write_fasta(input_path, {'target': sequence})

            cmd = [
                self.blastp_exe, '-query', input_path, '-db', databank, '-outfmt',
                '5', '-out', output_path
            ]

            _log.debug("{}".format(cmd))

            p = subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

            # communicate() drains both pipes while waiting. wait() can block
            # forever once the child fills a pipe buffer that nobody reads.
            _, stderr_data = p.communicate()

            if p.returncode != 0:
                # Decode so the exception carries readable text, not bytes.
                raise RuntimeError(stderr_data.decode('ascii', 'replace'))

            with open(output_path, 'r') as f:
                xml_str = f.read()
        finally:
            for path in [input_path, output_path]:
                if path is not None and os.path.isfile(path):
                    os.remove(path)

        return self._parse_alignments(xml_str, sequence, databank)
Beispiel #5
0
def create_model(target_sequence, target_species_id, require_resnum=None, chosen_template_id=None):
    """Return a model for the target sequence, building one if none is stored.

    The work is done while holding a file lock in the model directory. The
    lock's name is derived from the sequence id, the upper-cased species id,
    the required residue number and the chosen template id. If
    ``model_storage`` already lists matching models, the best of those is
    selected. Otherwise the domain alignments are computed and a model is
    built from the best one.

    :param target_sequence: the sequence to model
    :param target_species_id: the species id; upper-cased before use
    :param require_resnum: optional residue number, passed on to the model
        listing, the domain aligner and the modeler
    :param chosen_template_id: optional template id, passed on to the model
        listing and the domain aligner
    :returns: the result of ``select_best_model`` or ``modeler.build_model``,
        or None when there are no domain alignments
    :raises InitError: if the model directory is not set
    """

    target_species_id = target_species_id.upper()

    sequence_id = model_storage.get_sequence_id(target_sequence)
    lock_name = "lock_search_%s_%s_%s_%s" % (sequence_id,
                                             target_species_id,
                                             str(require_resnum),
                                             str(chosen_template_id))

    if model_storage.model_dir is None:
        raise InitError("model directory is not set")

    lock_path = os.path.join(model_storage.model_dir, lock_name)
    with FileLock(lock_path):

        model_paths = model_storage.list_models(target_sequence, target_species_id,
                                                require_resnum, chosen_template_id)
        if len(model_paths) > 0:
            return select_best_model(model_paths, target_sequence, require_resnum)

        ModelLogger.get_current().clear()

        domain_alignments = \
            domain_aligner.get_domain_alignments(target_sequence,
                                                 require_resnum,
                                                 chosen_template_id)
        if len(domain_alignments) <= 0:
            # warning() rather than the deprecated warn() alias.
            _log.warning("no domain alignments for target={} resnum={} template={}"
                         .format(target_sequence, require_resnum, chosen_template_id))
            return None

        domain_alignment = select_best_domain_alignment(domain_alignments)
        return modeler.build_model(target_sequence, target_species_id,
                                   domain_alignment, require_resnum)
Beispiel #6
0
    def blacklist(self, pdbid):
        """Append a pdbid to the blacklist file unless it is already listed.

        :param pdbid: the id to blacklist; written on a line of its own
        :raises InitError: if the blacklist file is not set
        """
        if self.file_path is None:
            raise InitError("blacklist file not set")

        # is_blacklisted is a method of this object; the bare name is not
        # bound inside this method.
        if not self.is_blacklisted(pdbid):
            with open(self.file_path, 'a') as f:
                f.write('%s\n' % pdbid)
Beispiel #7
0
    def list_all_models(self):
        """Return the paths of all model archives in the model directory.

        Archives are the ``*.tgz`` files directly inside ``self.model_dir``.
        Files with '_error' in their name are left out.

        :returns: list of paths
        :raises InitError: if the model directory is not set
        """
        if self.model_dir is None:
            raise InitError("model directory is not set")

        wildcard = os.path.join(self.model_dir, "*.tgz")

        # Test the file name only: a model directory whose own path happens
        # to contain '_error' must not hide every model.
        return [path for path in glob(wildcard)
                if '_error' not in os.path.basename(path)]
Beispiel #8
0
    def list_models(self,
                    target_sequence,
                    species_id,
                    required_resnum=None,
                    template_id=None):
        """Return the paths of the stored models for a sequence and species.

        Model archives are found by a glob on
        ``<sequence id>_<SPECIES>_*.tgz`` in the model directory. With a
        template id the pattern also requires ``_<pdbid>-<chain id>`` before
        the extension, matching the pdbid's letters in either case. Files
        with '_error' in their name are left out.

        :param target_sequence: the modeled sequence
        :param species_id: the species id; upper-cased before use
        :param required_resnum: if given, only models for which
            ``self.model_covers`` is true are returned
        :param template_id: if given, only models made with this template
        :returns: list of paths
        :raises InitError: if the model directory is not set or does not
            exist
        """
        if self.model_dir is None:
            raise InitError("model directory is not set")
        elif not os.path.isdir(self.model_dir):
            raise InitError("No such directory: {}".format(self.model_dir))

        sequence_id = self.get_sequence_id(target_sequence)

        species_id = species_id.upper()

        if template_id is None:
            wildcard = "%s_%s_*.tgz" % (sequence_id, species_id)
        else:
            # glob has no case-insensitive mode, so every letter of the
            # pdbid becomes a two-character class like [aA].
            case_insensitive_pdbid = "".join(
                '[%s%s]' % (char.lower(), char.upper()) if char.isalpha() else char
                for char in template_id.pdbid)

            wildcard = "%s_%s_*_%s-%s.tgz" % (sequence_id, species_id,
                                              case_insensitive_pdbid,
                                              template_id.chain_id)

        wildcard = os.path.join(self.model_dir, wildcard)

        # Test the file name only: a model directory whose own path happens
        # to contain '_error' must not hide every model.
        paths = [path for path in glob(wildcard)
                 if '_error' not in os.path.basename(path)]

        if required_resnum is None:
            return paths

        return [path for path in paths
                if self.model_covers(path, target_sequence, required_resnum)]
Beispiel #9
0
    def is_blacklisted(self, pdbid):
        """Tell whether a pdbid occurs in the blacklist file.

        The file is read as whitespace-separated entries. A blacklist file
        that does not exist counts as an empty blacklist.

        :param pdbid: the id to look up
        :returns: True if the id is one of the file's entries, else False
        :raises InitError: if the blacklist file is not set
        """
        blacklist_path = self.file_path
        if blacklist_path is None:
            raise InitError("blacklist file not set")

        if not os.path.isfile(blacklist_path):
            return False

        with open(blacklist_path, 'r') as handle:
            entries = handle.read().split()

        return pdbid in entries
Beispiel #10
0
    def get_model_lock(self, main_target_sequence, target_species_id,
                       main_domain_alignment, template_id):
        """Return a file lock for the model identified by the arguments.

        The lock file lives in the model directory and is named
        'lock_model_' followed by the model's name, as returned by
        ``self.get_model_name``.

        :returns: a ``FileLock`` on the lock path
        :raises InitError: if the model directory is not set
        """
        if self.model_dir is None:
            raise InitError("model directory is not set")

        model_name = self.get_model_name(main_target_sequence,
                                         target_species_id,
                                         main_domain_alignment,
                                         template_id)
        lock_file_name = 'lock_model_' + model_name

        return FileLock(os.path.join(self.model_dir, lock_file_name))
Beispiel #11
0
    def get_sequence(self, ac):
        """Look up the sequence for an accession code in the fasta files.

        The files in ``self.fasta_paths`` are searched in order. An entry
        matches when the second '|'-separated field of its id equals the
        accession code. The first match is returned.

        :param ac: the accession code to look for
        :returns: the sequence of the first matching entry
        :raises InitError: if the fasta paths are not set
        :raises ValueError: if no entry matches
        """
        paths = self.fasta_paths
        if paths is None:
            raise InitError("fasta paths not set")

        for path in paths:
            with FastaIterator(path) as entries:
                for header, sequence in entries:
                    accession = header.split('|')[1]
                    if accession == ac:
                        return sequence

        raise ValueError("sequence not found in uniprot: {}".format(ac))
Beispiel #12
0
    def build_model(self, main_target_sequence, target_species_id, main_domain_alignment, require_resnum=None):
        """Build the model for a target sequence and return its tar path.

        The request is recorded in the current ``ModelLogger``. The tar path
        is obtained from ``model_storage`` and, while the model lock is
        held, the model is only built if no file exists at that path yet.

        When building, the template is prepared in a ``ModelingContext``. If
        the alignment's template sequence equals the context's sequence for
        the template chain and the identity is at least 100%, the template is
        wrapped instead of modeled. Otherwise chain alignments are made,
        chains without an alignment are deleted from the context and the
        model run is started.

        :param main_target_sequence: the sequence to model
        :param target_species_id: the species id of the target
        :param main_domain_alignment: the alignment to model from; its
            ``template_id`` determines the template
        :param require_resnum: optional residue number, passed on to
            ``_make_alignments`` and ``_model_run``
        :returns: the path of the model's tar file
        :raises InitError: if a model must be built and the yasara dir is
            not set
        """

        ModelLogger.get_current().add("building model with sequence {}, species {}, alignment {} and resnum {}"
                                      .format(main_target_sequence, target_species_id, main_domain_alignment, require_resnum))

        tar_path = model_storage.get_tar_path(main_target_sequence,
                                              target_species_id,
                                              main_domain_alignment,
                                              main_domain_alignment.template_id)

        # The existence check and the build both happen under the model lock.
        with model_storage.get_model_lock(main_target_sequence, target_species_id,
                                          main_domain_alignment, main_domain_alignment.template_id):
            if not os.path.isfile(tar_path):

                if self.yasara_dir is None:
                    raise InitError("yasara dir is not set")

                with ModelingContext(self.yasara_dir) as context:

                    self._prepare_template(context, main_domain_alignment.template_id.pdbid)

                    # If the template is the same as the target, do no modeling:
                    if main_domain_alignment.get_template_sequence() == context.get_sequence(main_domain_alignment.template_id.chain_id) and \
                            main_domain_alignment.get_percentage_identity() >= 100.0:

                        main_domain_alignment.target_id = model_storage.get_sequence_id(main_target_sequence)

                        # Early return: leaves both the modeling context and
                        # the model lock.
                        tar_path = self._wrap_template(main_target_sequence, target_species_id,
                                                       main_domain_alignment, main_domain_alignment.template_id)
                        return tar_path


                    context.set_main_target(main_target_sequence, target_species_id,
                                         main_domain_alignment.template_id.chain_id)

                    chain_alignments = self._make_alignments(main_target_sequence, target_species_id,
                                                             main_domain_alignment, context, require_resnum)

                    # Delete chains that aren't in the alignment set:
                    for chain_id in context.get_chain_ids():
                        if chain_id not in chain_alignments:
                            context.delete_chain(chain_id)

                    _log.debug("final alignments: {}".format([(chain_id, chain_alignments[chain_id])
                                                              for chain_id in context.get_chain_ids()]))
                    _log.debug("final template {} {}".format(context.template_pdbid,
                                                             [(chain_id, context.get_sequence(chain_id))
                                                              for chain_id in context.get_chain_ids()]))

                    tar_path = self._model_run(main_domain_alignment, chain_alignments, context, main_target_sequence, require_resnum)

            # Reached when the file already existed or after a model run.
            return tar_path
Beispiel #13
0
    def _get_hits(self, range_, template_id):
        """Blast the range's sub-sequence and return kmad-realigned hits.

        Blast hits are filtered on the requested template (if any), the
        blacklist (only when no template was requested) and the presence of
        dssp secondary structure. Each remaining hit's alignment is replaced
        by a kmad alignment of the template sequence against the
        sub-sequence. A hit is accepted when its percentage identity reaches
        ``get_min_identity`` for its number of aligned residues.

        :param range_: the range whose sub-sequence is searched
        :param template_id: the only template to accept, or None for any
        :returns: list of accepted alignments; empty when a template was
            requested but no hit matched it
        :raises InitError: if the blast databank is not set
        """
        if self.template_blast_databank is None:
            raise InitError("blast databank is not set")

        blast_hits = blaster.blastp(range_.get_sub_sequence(), self.template_blast_databank)
        _log.debug("{} blast hits to filter".format(len(blast_hits)))

        count_template_hits = 0
        good_hits = []
        for hit_id in blast_hits:
            for alignment in blast_hits[hit_id]:
                hit_template_id = TemplateID(alignment.get_hit_accession_code(),
                                             alignment.get_hit_chain_id())
                if template_id is not None and hit_template_id != template_id:
                    continue

                count_template_hits += 1

                if template_id is None and blacklister.is_blacklisted(alignment.get_hit_accession_code()):
                    continue

                if not dssp.has_secondary_structure(hit_template_id):
                    continue

                # Replace the blast hit's alignment with the kmad alignment.
                template_secstr = dssp.get_secondary_structure(hit_template_id)
                template_sequence = dssp.get_sequence(hit_template_id)
                try:
                    kmad_alignment = kmad_aligner.align(template_sequence, template_secstr,
                                                        range_.get_sub_sequence())
                except Exception:
                    # Exception rather than a bare except, so that
                    # KeyboardInterrupt and SystemExit still propagate.
                    _log.warning(traceback.format_exc())

                    # If kmad fails, then skip this one :(
                    continue

                # The kmad alignment spans the whole sub-sequence and the
                # whole template sequence, so the positions are reset.
                alignment.full_query_sequence = range_.sequence
                alignment.query_start = range_.start + 1
                alignment.query_end = range_.end
                alignment.subject_start = 1
                alignment.subject_end = len(template_sequence)
                alignment.query_alignment = kmad_alignment.target_alignment
                alignment.subject_alignment = kmad_alignment.template_alignment

                if alignment.get_percentage_identity() >= get_min_identity(alignment.count_aligned_residues()):
                    good_hits.append(alignment)

        if count_template_hits == 0 and template_id is not None:
            _log.warning("domain sequence {} has no suitable hits with {}".format(range_.get_sub_sequence(), template_id))
            return []

        return good_hits
Beispiel #14
0
    def get_sequence(self, ac):
        """Download the fasta for an accession code and return its sequence.

        The fasta is requested from ``<self.url>/<ac>.fasta``. A response
        with status 500 is retried a limited number of times.

        :param ac: the accession code
        :returns: the first sequence in the downloaded fasta
        :raises InitError: if the uniprot url is not set
        :raises requests.HTTPError: via ``raise_for_status`` for an error
            response, including a 500 that persists through all attempts
        """
        if self.url is None:
            raise InitError("uniprot url is not set")

        fasta_url = self.url + '/' + ac + '.fasta'

        _log.debug(fasta_url)

        # Retry a 500 response, but not forever: an unbounded loop hangs the
        # caller and hammers the server while the service is down.
        max_attempts = 10
        for _ in range(max_attempts):
            r = requests.get(fasta_url)
            if r.status_code != 500:
                break

        r.raise_for_status()

        fa = parse_fasta_from_string(r.text)

        # list() because dict views cannot be indexed on Python 3.
        return list(fa.values())[0]
Beispiel #15
0
    def _filter_forbidden_ranges(self, ranges):

        if self.forbidden_interpro_domains is None:
            raise InitError("forbidden ranges not set")

        forbidden = []
        for range_ in ranges:
            if range_.ac in self.forbidden_interpro_domains:
                forbidden.append(range_)

        passed = []
        for range_ in ranges:
            overlapping = filter(lambda r: r.overlaps_with(range_), forbidden)
            if len(overlapping) <= 0:
                passed.append(range_)

        return passed
Beispiel #16
0
    def _merge_similar_ranges(self, ranges):
        if self.similar_ranges_min_overlap_percentage is None or \
                self.similar_ranges_max_length_difference_percentage is None:
            raise InitError("similar range percentages not set")

        ranges = sorted(ranges, cmp=lambda r1, r2: r1.is_left_from(r2))

        i = 0
        while i < len(ranges):
            overlapping_indices = []
            for j in range(i + 1, len(ranges)):
                if ranges[j].overlaps_with(ranges[i]):
                    overlapping_indices.append(j)

            # important, rightmost must go first!
            # Because we're going to remove ranges from the list.
            overlapping_indices = sorted(overlapping_indices, reverse=True)
            for j in overlapping_indices:

                percentage_overlap = ranges[i].get_percentage_overlap(
                    ranges[j])
                percentage_length_difference = (
                    (100.0 *
                     abs(ranges[i].get_length() - ranges[j].get_length())) /
                    max(ranges[i].get_length(), ranges[j].get_length()))

                if percentage_overlap > self.similar_ranges_min_overlap_percentage and \
                        percentage_length_difference < self.similar_ranges_max_length_difference_percentage:

                    # Replace the two ranges by a single merged one:
                    _log.debug(
                        "merging {} with {}, they have {} % length difference".
                        format(ranges[i], ranges[j],
                               percentage_length_difference))
                    merged = ranges[i].merge_with(ranges[j])
                    ranges = (ranges[:i] + [merged] + ranges[i + 1:j] +
                              ranges[j + 1:])
            i += 1

            # Make list shorter to save time:
            ranges = self._remove_duplicate_ranges(ranges)

        return ranges
Beispiel #17
0
    def _find_target_sequences(self, template_chain_sequence, target_species_id):
        """Find sequences of the target species that resemble a template chain.

        The template chain's sequence is blasted against the uniprot
        databank. Only hits whose id ends with '_' plus the upper-cased
        species id are considered. An alignment qualifies with more than 70%
        identity and more than 90% coverage.

        :param template_chain_sequence: the sequence to blast
        :param target_species_id: the species id that hit ids must end with
        :returns: dict mapping accession code to the sequence returned by
            ``uniprot.get_sequence``
        :raises InitError: if the uniprot databank is not set
        """
        if self.uniprot_databank is None:
            raise InitError("species databank dir not set")

        hits = blaster.blastp(template_chain_sequence, self.uniprot_databank)

        found = {}
        for hit_id in hits:
            species_suffix = '_' + target_species_id.upper()
            if hit_id.endswith(species_suffix):
                for alignment in hits[hit_id]:
                    accession = alignment.get_hit_accession_code()
                    identity = alignment.get_percentage_identity()
                    coverage = alignment.get_percentage_coverage()
                    if identity > 70.0 and coverage > 90.0:
                        found[accession] = uniprot.get_sequence(accession)

        return found
Beispiel #18
0
    def blastp(self, sequence, databank):
        """Run blastp for a sequence against a databank and parse the hits.

        The sequence is written to a temporary fasta file under the id
        'target', blastp is run from the root directory with xml output
        (``-outfmt 5``) and the output is handed to
        ``self._parse_alignments``. Both temporary files are removed
        afterwards.

        :param sequence: the query sequence
        :param databank: the value passed to blastp's ``-db`` option
        :returns: the result of ``self._parse_alignments``
        :raises InitError: if the blastp executable is not set
        :raises RecoverableError: if blastp reports that it found no alias
            or index file for the databank
        :raises RuntimeError: if blastp fails for any other reason
        """
        if self.blastp_exe is None:
            raise InitError("blastp executable is not set")

        input_path = None
        output_path = None
        try:
            # mkstemp creates the file atomically; mktemp only returns a name
            # that another process could claim before it is used.
            input_fd, input_path = tempfile.mkstemp()
            os.close(input_fd)
            output_fd, output_path = tempfile.mkstemp()
            os.close(output_fd)

            write_fasta(input_path, {'target': sequence})

            cmd = [
                self.blastp_exe, '-query', input_path, '-db', databank, '-outfmt',
                '5', '-out', output_path
            ]

            _log.debug("{}".format(cmd))

            p = subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 cwd='/')

            # communicate() drains both pipes while waiting. wait() can block
            # forever once the child fills a pipe buffer that nobody reads.
            _, stderr_data = p.communicate()

            if p.returncode != 0:
                # 'replace' keeps a non-ascii byte in stderr from masking the
                # real failure with a UnicodeDecodeError.
                err_msg = stderr_data.decode('ascii', 'replace')
                if err_msg.startswith(
                        "BLAST Database error: No alias or index file found for protein database"
                ):
                    raise RecoverableError(err_msg)

                raise RuntimeError("%s for databank %s, sequence %s" %
                                   (err_msg, databank, sequence))

            with open(output_path, 'r') as f:
                xml_str = f.read()
        finally:
            for path in [input_path, output_path]:
                if path is not None and os.path.isfile(path):
                    os.remove(path)

        return self._parse_alignments(xml_str, sequence, databank)
Beispiel #19
0
    def list_models(self,
                    target_sequence,
                    species_id,
                    required_resnum=None,
                    template_id=None):
        """Return the paths of the stored models for a sequence and species.

        Model archives are found by a glob on
        ``<sequence id>_<SPECIES>_*.tgz`` in the model directory. With a
        template id the pattern also requires ``_<pdbid>-<chain id>`` before
        the extension. Files with '_error' in their name are left out.

        :param target_sequence: the modeled sequence
        :param species_id: the species id; upper-cased before use
        :param required_resnum: if given, only models whose file name range
            (third '_'-separated field, ``<start>-<end>``) includes this
            number are returned
        :param template_id: if given, only models made with this template
        :returns: list of paths
        :raises InitError: if the model directory is not set
        """
        if self.model_dir is None:
            raise InitError("model directory is not set")

        sequence_id = self.get_sequence_id(target_sequence)

        species_id = species_id.upper()

        if template_id is None:
            wildcard = "%s_%s_*.tgz" % (sequence_id, species_id)
        else:
            wildcard = "%s_%s_*_%s-%s.tgz" % (sequence_id, species_id,
                                              template_id.pdbid,
                                              template_id.chain_id)

        wildcard = os.path.join(self.model_dir, wildcard)

        # Test the file name only: a model directory whose own path happens
        # to contain '_error' must not hide every model.
        paths = [path for path in glob(wildcard)
                 if '_error' not in os.path.basename(path)]

        if required_resnum is None:
            return paths

        matching_paths = []
        for path in paths:
            name = os.path.splitext(os.path.basename(path))[0]
            range_ = name.split('_')[2]

            start, end = range_.split('-')
            if int(start) <= required_resnum <= int(end):
                matching_paths.append(path)

        return matching_paths
Beispiel #20
0
    def _prepare_context(self, template_pdbid):
        """Create a modeling context holding the prepared template.

        The template is initialized and then oligomerized. If
        oligomerization fails, the template is initialized again. Symmetry
        residues are built on a best-effort basis. Finally solvent residues
        are deleted, template errors are fixed and the template object is
        cleaned.

        :param template_pdbid: the pdbid of the template to prepare
        :returns: the ``ModelingContext`` with the prepared template
        :raises InitError: if the yasara dir is not set
        """
        if self.yasara_dir is None:
            raise InitError("yasara dir is not set")

        context = ModelingContext(self.yasara_dir)

        self._init_template(template_pdbid, context)
        try:
            self._oligomerize_template(context)
        except Exception:
            # Oligomerization is optional: fall back to a freshly
            # initialized template. Exception rather than a bare except, so
            # that KeyboardInterrupt and SystemExit still propagate.
            self._init_template(template_pdbid, context)

        try:
            self._build_template_symmetry_residues(context)
        except Exception:
            # Deliberately best-effort: carry on without symmetry residues.
            pass

        self._delete_solvent_residues(context)
        self._fix_template_errors(context)

        context.yasara.CleanObj(context.template_obj)
        return context
Beispiel #21
0
    def _run_kmad(self, input_path, output_path, gap_open, gap_extend,
                  modifier):
        """Run the kmad executable on an input file.

        :param input_path: passed to kmad's ``-i`` option
        :param output_path: passed to kmad's ``-o`` option
        :param gap_open: passed to ``-g``, formatted with one decimal
        :param gap_extend: passed to ``-e``, formatted with one decimal
        :param modifier: passed to ``-s``, formatted with one decimal
        :raises InitError: if the kmad executable is not set
        :raises RuntimeError: with kmad's stderr output if it exits with a
            non-zero status
        """

        if self.kmad_exe is None:
            raise InitError("kmad executable is not set")

        cmd = [
            self.kmad_exe, '-i', input_path, '-o', output_path, '-g',
            '%.1f' % gap_open, '-e',
            '%.1f' % gap_extend, '-s',
            '%.1f' % modifier, '-c', '4'
        ]

        _log.debug(cmd)

        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)

        # communicate() drains both pipes while waiting. wait() can block
        # forever once the child fills a pipe buffer that nobody reads.
        _, stderr_data = p.communicate()

        if p.returncode != 0:
            # 'replace' keeps a non-ascii byte in stderr from masking the
            # real failure with a UnicodeDecodeError.
            raise RuntimeError(stderr_data.decode('ascii', 'replace'))
Beispiel #22
0
    def get_tar_path_from_name(self, name):
        """Return the path of the model archive with the given name.

        :param name: the model's name, without extension
        :returns: ``<model dir>/<name>.tgz``
        :raises InitError: if the model directory is not set
        """
        directory = self.model_dir
        if directory is None:
            raise InitError("model directory is not set")

        archive_name = name + '.tgz'
        return os.path.join(directory, archive_name)
Beispiel #23
0
    def get_domain_alignments(self,
                              target_sequence,
                              require_resnum=None,
                              template_id=None):
        """Find the best template alignment for each domain of the target.

        The interpro ranges of the target are stripped of forbidden ranges
        and, if a residue number is required, of ranges that do not include
        that residue. The whole sequence is added as a range too. The ranges
        are then sampled in rounds, largest first. For every range the hits
        from ``self._get_hits`` are examined and the best acceptable one is
        stored. After each round ``self._clean_search_space`` determines the
        ranges for the next round. Sampling ends when none are left.

        :param target_sequence: the sequence to find alignments for
        :param require_resnum: if given, only ranges and hits that cover
            this residue are considered
        :param template_id: if given, passed to ``self._get_hits`` for every
            range to restrict the hits to this template
        :returns: list of ``DomainAlignment`` objects, one per best range
        :raises InitError: if the min percentage coverage is not set
        """

        if self.min_percentage_coverage is None:
            raise InitError("min percentage coverage is not set")

        interpro_ranges = interpro.get_domain_ranges(target_sequence)
        _log.debug("{} ranges from interpro".format(len(interpro_ranges)))

        # A real list is needed: the ranges are counted and appended to.
        sample_ranges = list(self._filter_forbidden_ranges(interpro_ranges))

        if require_resnum is not None:
            sample_ranges = [r for r in sample_ranges
                             if r.includes_residue(require_resnum)]
            _log.debug("{} ranges have residue {}".format(
                len(sample_ranges), require_resnum))

        # Add the whole sequence as a range too:
        sample_ranges.append(
            SequenceRange(0, len(target_sequence), target_sequence))

        ok_ranges_alignments = {}
        best_ranges_alignments = {}
        checked_ranges = []

        while len(sample_ranges) > 0:

            merged_sample_ranges = self._merge_similar_ranges(sample_ranges)

            _log.debug("sampling {} ranges".format(len(merged_sample_ranges)))

            # Check the largest ranges first. If that yields, then the smaller ones don't matter.
            for range_ in sorted(merged_sample_ranges,
                                 key=lambda r: r.get_length(),
                                 reverse=True):

                if range_ in checked_ranges:
                    continue  # already passed this one
                checked_ranges.append(range_)

                if any(r.encloses(range_) for r in best_ranges_alignments):
                    continue  # we already have a larger enclosing range

                # These can differ per range:
                best_hit = None
                last_resort_hit = None

                # template_id must remain the caller's value here. The ids of
                # the hits get names of their own below, otherwise the first
                # accepted hit would restrict the search for all later ranges.
                hit_candidates = self._get_hits(range_, template_id)

                _log.debug('trying range: {} against {} hits'.format(
                    range_, len(hit_candidates)))

                for hit_candidate in hit_candidates:

                    hit_range = hit_candidate.get_query_range()
                    if require_resnum is not None:
                        if not hit_candidate.is_query_residue_covered(
                                require_resnum):
                            _log.debug(
                                "hit with {} on {} does not cover residue {}".
                                format(hit_candidate.get_hit_accession_code(),
                                       hit_range, require_resnum))
                            continue

                    if self._alignment_ok_for_range(range_, hit_candidate):
                        _log.debug("hit with {} {} is ok".format(
                            hit_candidate.get_hit_accession_code(), hit_range))

                        # This range made an OK alignment, so at least store it for later usage:
                        ok_template_id = TemplateID(
                            hit_candidate.get_hit_accession_code(),
                            hit_candidate.get_hit_chain_id())
                        ok_ranges_alignments[hit_range] = DomainAlignment(
                            hit_candidate.query_alignment,
                            hit_candidate.subject_alignment, hit_range,
                            ok_template_id)

                        if hit_candidate.get_percentage_coverage(
                        ) > self.min_percentage_coverage:

                            _log.debug(
                                "coverage is high enough for {} {}".format(
                                    hit_candidate.get_hit_accession_code(),
                                    hit_range))

                            if best_hit is None or self._is_better_than(
                                    hit_candidate, best_hit):

                                _log.debug("{} is better than {}".format(
                                    hit_candidate, best_hit))
                                best_hit = hit_candidate
                        else:
                            last_resort_hit = hit_candidate

                if best_hit is None:
                    best_hit = last_resort_hit

                if best_hit is not None:

                    # Remove any smaller ranges that this one encloses:
                    best_ranges_alignments = self._remove_enclosing(
                        range_, best_ranges_alignments)

                    best_template_id = TemplateID(
                        best_hit.get_hit_accession_code(),
                        best_hit.get_hit_chain_id())

                    hit_range = best_hit.get_query_range()
                    _log.debug(
                        "passing best hit with template {} with range {}".
                        format(best_template_id, hit_range))

                    best_ranges_alignments[hit_range] = DomainAlignment(
                        best_hit.query_alignment, best_hit.subject_alignment,
                        hit_range, best_template_id)
                else:
                    _log.debug("no hit for range {}".format(range_))

            # After iterating the sample ranges, prepare for the next round:
            sample_ranges = self._clean_search_space(checked_ranges,
                                                     sample_ranges,
                                                     ok_ranges_alignments)

        # list() so callers get a real list on Python 3 as well.
        return list(best_ranges_alignments.values())