def _parse(self):
        '''
        Read the DSSP file into {DSSP} records stored in _self._dsspdata_.

        Lines are only processed once the residue table header has been found.
        The instance is marked as gapped (_self._gapped_) when two consecutive
        residue records do not carry consecutive residue numbers, or when a
        break record ('!' at position 13 of the line) is found.
        '''
        file_fd  = File(self._dsspfile)
        read     = False
        previous = None  # residue number of the last residue record parsed
        try:
            for line in file_fd.read():
                if line.startswith("  #  RESIDUE AA STRUCTURE BP1 BP2  ACC"):
                    read = True
                    continue
                if not read:
                    continue

                if line[13:14] == '!':
                    # Break record: report it and flag the chain as gapped.
                    msg = "truncated chain!{0}\n".format(self._dsspfile)
                    sys.stderr.write(msg)
                    SBIg.warn(self, msg)
                    self._gapped = True
                    continue

                res_num = int(line[6:10].strip())
                ss      = line[16:17] if line[16:17] != ' ' else '-'
                buried  = int(line[35:38].strip())
                aa      = line[13:15].strip()

                self._dsspdata.append(DSSP(secondary_structure = ss,
                                           accessibility       = buried,
                                           amino               = aa))
                self._dsspdata[-1].add_hydrogen_links(line[39:50],
                                                      line[50:61],
                                                      line[61:72],
                                                      line[72:84])

                # The previous number must be tracked from the very first
                # residue; otherwise the second residue is compared against
                # an initial sentinel and every chain is reported as gapped.
                if previous is not None and res_num != previous + 1:
                    self._gapped = True
                previous = res_num
        finally:
            # Release the file even if a malformed line aborts the parsing.
            file_fd.close()
    def subset(self,
               sequence_ids,
               new_fasta_file,
               all_but=False,
               prefix_size=None,
               index=False,
               force=None):
        '''
        Builds a new {Fasta} file containing only a selection of the
        sequences of this one.

        @param:    sequence_ids
        @pdef:     identifier(s) of the sequences to select
        @ptype:    {String}, {List} or {Set}

        @param:    new_fasta_file
        @pdef:     name of the fasta file to create
        @ptype:    {String}

        @param:    all_but
        @pdef:     Flag. When set, the selection is inverted: every sequence
                   is retrieved except the given ids.
        @pdefault: _False_
        @ptype:    {Boolean}

        @param:    prefix_size
        @pdef:     maximum number of characters for the prefix. If _None_,
                   all the characters are included.
        @pdefault: _None_
        @ptype:    {Integer}

        @param:    index
        @pdef:     also create the index file for the new fasta; it is only
                   done when this fasta has an index itself
        @pdefault: _False_
        @ptype:    {Boolean}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @raises: {AttributeError} if sequence_ids is not a valid type.
        @return: {Fasta}
        '''
        selected = self.retrieve(sequence_ids, all_but, prefix_size)
        new_fasta = Fasta.build_multifasta(new_fasta_file, selected, force)

        # Nothing else to do unless an index exists and was requested.
        if not self.has_index or not index:
            return new_fasta

        source_index = File(self.index_file)
        target_index = File(new_fasta.file.full + '.idx', 'w')
        kept_ids = set(new_fasta.sequence_identifiers)
        for entry in source_index.read():
            # First word of each index line is the (possibly '>'-prefixed) id
            identifier = entry.split()[0].strip('>')
            if identifier in kept_ids:
                target_index.write(entry)
        source_index.close()
        target_index.close()

        new_fasta.index_file = target_index.full
        return new_fasta
Beispiel #3
0
    def read_compacted_blast(compacted_blast_file):
        '''
        Read data from a printed compacted blast into {BlastResult}.
        Not all options will be available in that new object.

        Lines starting with '#' are header lines; the last word of each
        recognised header line is kept as its value. Every other non-blank
        line is read as one hit. Blank lines are skipped.

        @param:    compacted_blast_file
        @pdef:     file of the compacted blast print
        @ptype:    {String}

        @return: {BlastResult}. _None_ if the file contains no hit lines.
        '''
        from BlastHit import BlastHit

        # Header line prefix -> name under which its value is stored.
        header_fields = (
            ('#Query:',                 'query_name'),
            ('#Query Sequence:',        'query_sequence'),
            ('#Blast Version:',         'version'),
            ('#Search on matrix:',      'matrix'),
            ('#Gap open penalty:',      'gap_open'),
            ('#Gap extension penalty:', 'gap_extend'),
            ('#Database searched:',     'database'),
            ('#Self Hit is omitted:',   'self_hit'),
        )
        meta = dict((name, None) for _, name in header_fields)

        br = None

        cbf = File(compacted_blast_file)
        try:
            for line in cbf.read():
                if not line.strip():
                    # A blank line holds neither a header nor a hit; reading
                    # it as a hit would fail when indexing its columns.
                    continue

                if line.startswith('#'):
                    for prefix, name in header_fields:
                        if line.startswith(prefix):
                            meta[name] = line.strip().split()[-1]
                    continue

                if br is None:
                    # The result is created lazily on the first hit line,
                    # once the header lines before it have been read.
                    if meta['version'] is None:
                        bh = None
                    else:
                        bh = BlastHeader(meta['version'], meta['matrix'],
                                         meta['gap_open'], meta['gap_extend'],
                                         meta['database'], meta['self_hit'])
                    br = BlastResult(meta['query_name'],
                                     meta['query_sequence'], bh)

                d = line.strip().split()
                hit = BlastHit(
                    [d[2], d[3]], [d[8], d[9]],
                    [int(x) for x in d[10].split(',')[0].split(':')], 1,
                    [d[4], d[5], d[6], d[7]])
                br.add_hit(hit)
        finally:
            # Release the file even if a malformed hit line raises.
            cbf.close()

        return br
Beispiel #4
0
    def release(self):
        '''
        Retrieves release data for the database.
        It does not describe the official DB release, but the moment in which
        we downloaded it.

        The data is loaded as JSON from the control file in the local
        directory when that file exists; otherwise the default release data
        of the object is returned.

        @returns: {Dictionary}
        '''
        control_path = os.path.join(self.local, self._CONTROL_FILE)
        if not os.path.isfile(control_path):
            return self._RELEASE

        control = File(control_path)
        content = json.loads(control.read())
        control.close()
        return content
Beispiel #5
0
    def items(self):
        '''
        Loops through the items of the database.

        Each line of each item file of the local database is stripped and
        handed to the _grab_ method of the database object class.

        A missing local database is reported through _SBIg.throw_ with
        {IOError}.

        @yields: Object depending on the database.
        '''
        if not self.has_local:
            SBIg.throw(self, 'A local database needs to be build first',
                       IOError)

        for item_file in self._ITEM_FILES:
            f = File(os.path.join(self.local, item_file))
            try:
                for line in f.read():
                    yield self._DBOBJECT.grab(line.strip())
            finally:
                # Also runs when the consumer abandons the generator early or
                # when grabbing an item fails, so the file is never leaked.
                f.close()
Beispiel #6
0
    def correct_hit_count(self,
                          count_hit_file=None,
                          count_query_file=None,
                          return_correction_dict=False):
        '''
        Corrects the starting point of the hits and the query, if needed.
        Why?
        When blasting vs. PDB (for example), sometimes the hit positions given
        by blast are wrong, as blast always considers the first position of
        the hit sequence as 1 and PDB does not.
        Even more, the position reference doesn't even need to be a number.
        As the specific location in the PDB is important, we need to adapt our
        blasts so that we can read that data.
        Keep in mind that hits and query must be corrected together in this
        step, as this function cannot be called twice for a same instance.

        If the BlastResult has no hits, a warning is issued and nothing is
        done.

        @param:    count_hit_file
        @pdef:     file containing the index data for the hit database.
                   Each sequence in this file will have a format such as:
                   >3K2K_A -7 ;-6 ;-5 ;-4 ;-3 ;-2 ;-1 ;0 ;1 ;2 ;3 ;4 ;5 ;6 ;7 ...
                   (identifier and index list are split on a tabulator)
        @ptype:    {String}

        @param:    count_query_file
        @pdef:     sometimes we might also need to correct the query (if PDB
                   vs. PDB). Same format as count_hit_file. They might be the
                   same file.
        @ptype:    {String}

        @param:    return_correction_dict
        @pdef:     instead of actually correcting the hits, it only returns
                   the dictionary for further use.
        @pdefault: _False_
        @ptype:    {Boolean}

        @raises: {BlastError} if it has been called before for this instance.
        @raises: {KeyError} if the index of the query or of a hit is not
                 found in the index file(s).
        @return: {Dictionary} only if return_correction_dict is _True_

        '''
        if not self.has_hits:
            SBIg.warn(
                self,
                "BlastResult of {0} has no hits to correct".format(self.query))
            return

        if self.are_hits_corrected:
            be = BlastExe.BlastError()
            raise be.corrected_hits()

        SBIg.alert('debug', self,
                   'Correcting indexes for {0}'.format(self.query))
        cfile = File(count_hit_file)
        cq = False  # becomes True when the query has to be corrected too

        codes_of_interest = set([hit.sequenceID for hit in self.raw_hits])
        if count_query_file == count_hit_file:
            # Same file for hits and query: pick up both in a single pass.
            codes_of_interest.add(self.query)
            count_query_file = None
            cq = True

        start_index_dic = {}
        try:
            for line in cfile.read():
                if len(line.strip()) > 0:
                    k = line.split('\t')
                    code = k[0].lstrip('>')
                    if code in codes_of_interest:
                        start_index_dic[code] = k[1].strip().split(';')
        finally:
            cfile.close()

        if count_query_file is not None:
            cfile = File(count_query_file)
            try:
                for line in cfile.read():
                    if len(line.strip()) > 0:
                        k = line.split('\t')
                        code = k[0].lstrip('>')
                        if code == self.query:
                            start_index_dic[code] = k[1].strip().split(';')
            finally:
                # Close the file object itself, not whatever read() returns.
                cfile.close()
            cq = True

        if cq:
            SBIg.alert('debug', self, '\tFixing Query {0}'.format(self.query))
            self._query_index = start_index_dic[self.query]

        if return_correction_dict:
            return start_index_dic

        for hit in self._hits:
            # This tests between the options PDB/PDB_ID or PDB_ID in case
            # the TAB file has different codification
            h = hit.sequenceID
            hit_ID = h if h in start_index_dic else h.split("/")[-1]
            SBIg.alert('debug', self, '\tFixing {0}'.format(hit_ID))
            hit.correct_hit_count(new_index=start_index_dic[hit_ID])
            if cq:
                SBIg.alert('debug', self,
                           '\tFixing Query {0}'.format(self.query))
                hit.correct_query_count(new_index=start_index_dic[self.query])

        self._correctedHits = True
Beispiel #7
0
class CDhitList(StorableObject):
    '''
    List of cd-hit clusters.

    Keeps the clusters in insertion order together with a map from each
    sequence name to the position of the cluster that contains it.

    '''
    def __init__(self, cdhit_file=None):
        '''
        @param:    cdhit_file
        @pdef:     name of the cd-hit output file
        @pdefault: _None_. Create an empty list
        @ptype:    {String}

        '''
        self._clusters = []
        # sequence name -> position of its cluster in self._clusters
        self._allseqids = {}
        if cdhit_file is not None:
            self._file = File(file_name=cdhit_file)
        else:
            self._file = None

        if self._file is not None:
            self._parse_file()

    ##############
    # ATTRIBUTES #
    ##############
    @property
    def clusters(self):
        '''
        List of cd-hit clusters.

        @return: {List} of {CDhit}
        '''
        return self._clusters

    ###########
    # METHODS #
    ###########
    def get_cluster4sequence(self, sequence):
        '''
        Retrieve a cluster for a given sequence. _None_ if the sequence is not
        found.

        @param:    sequence
        @pdef:     name of the query sequence
        @ptype:    {String}

        @return: {CDhit}
        '''
        if sequence in self._allseqids:
            return self._clusters[self._allseqids[sequence]]
        return None

    def is_in_cluster(self, sequence):
        '''
        Evaluate if the sequence is in a cluster.

        @param:    sequence
        @pdef:     name of the query sequence
        @ptype:    {String}

        @return: {String} as 'N' if no, 'H' if yes and 'M' if cluster master
        '''
        c = self.get_cluster4sequence(sequence)
        if c is None:
            return 'N'
        return 'M' if c.is_master(sequence) else 'H'

    def add_cluster(self, cluster):
        '''
        Add a cd-hit cluster to the object.

        @param:    cluster
        @pdef:     new cd-hit cluster to add
        @ptype:    {CDhit}
        '''
        self._clusters.append(cluster)

    def add_sequence2cluster(self, sequence, cluster_id=None):
        '''
        Add a new sequence to a given cluster.
        Nothing is added if no cluster has the requested identifier.

        @param:    sequence
        @pdef:     sequence to add; it is registered under its _name_
        @ptype:    {CDhitHomolog}

        @param:    cluster_id
        @pdef:     identifier of the cluster
        @pdefault: _None_. Refers to the last added cluster.
        @ptype:    {String}
        '''
        if cluster_id is None:
            self.clusters[-1].add_sequence(sequence)
            self._allseqids[sequence.name] = len(self.clusters) - 1
        else:
            for position, cluster in enumerate(self._clusters):
                if cluster.identifier == cluster_id:
                    cluster.add_sequence(sequence)
                    self._allseqids[sequence.name] = position
                    break

    def dictionary_role_summary(self):
        '''
        Creates a dictionary separating master sequences and homolog sequences.

        @return: {Dictionary} with the keys 'master' and 'homolog'
        '''
        data = {'master': [], 'homolog': []}
        for c in self.clusters:
            data['master'].append(c.master.name)
            data['homolog'].extend(c.sequences)
        return data

    def merge_clusters(self, cluster_file):
        '''
        When using an intermediate state to cluster by homology,
        the result of the second clustering is a clustering of clusters.
        We need to transform this into the original sequences.

        The clusters of this object are replaced by the merged ones.

        @param:    cluster_file
        @pdef:     name of the second-step cluster output
        @ptype:    {String}
        '''
        clustlist = CDhitList(cluster_file)
        newlist = CDhitList()
        # Sequence names in the second step refer to first-step clusters.
        cluster_re = re.compile(r'Cluster\s+(\d+)')
        for cl in clustlist.clusters:
            c = CDhit(cluster_id=cl.identifier)
            newlist.add_cluster(c)

            # The master of the second-step cluster brings its whole
            # first-step cluster unchanged.
            cnum = int(cluster_re.search(cl.master.name).group(1))
            oldclust = self.clusters[cnum]
            newlist.add_sequence2cluster(sequence=oldclust.master)
            for name in oldclust.sequences:
                newlist.add_sequence2cluster(sequence=oldclust.sequences[name])

            # Every other member brings its first-step cluster with the
            # homologies adjusted by the second-step homology.
            for member in cl.sequences:
                idclust = cl.sequences[member]
                cnum = int(cluster_re.search(idclust.name).group(1))
                oldclust = self.clusters[cnum]
                master = oldclust.master
                master.homology = idclust.homology
                newlist.add_sequence2cluster(sequence=master)
                for name in oldclust.sequences:
                    h = oldclust.sequences[name]
                    h.homology = int(h.homology * float(idclust.homology) / 10)
                    newlist.add_sequence2cluster(sequence=h)

        self._clusters = newlist._clusters
        self._allseqids = newlist._allseqids

    ###################
    # PRIVATE METHODS #
    ###################
    def _parse_file(self):
        '''
        Read the cd-hit output file into a {CDhitList}.

        Lines starting with '>' open a new cluster; any other non-blank line
        describes a sequence of the last opened cluster.

        '''
        homolog_re = re.compile(r'(\d+)aa,\s+\>([\s\w]+)\.{3}')
        try:
            for line in self._file.read():
                if not line.strip():
                    # Blank lines describe nothing and cannot be matched.
                    continue
                if line.startswith('>'):
                    c = CDhit(cluster_id=line.split()[-1].strip())
                    self.add_cluster(c)
                else:
                    data = homolog_re.search(line)
                    d = line.split()
                    h = CDhitHomolog(name=data.group(2),
                                     length=data.group(1),
                                     homology=d[-1])
                    self.add_sequence2cluster(sequence=h)
        finally:
            self._file.close()

    def __len__(self):
        return len(self._clusters)

    def __repr__(self):
        return '\n'.join('{0}'.format(c) for c in self.clusters)
Beispiel #8
0
    def parse(query_sequence, blast_output_file, self_hit, hitid_format):
        '''
        Processes a blast xml formatted output into a {BlastResult} object.

        @param:    query_sequence
        @pdef:     sequence of the query protein/nucleotide.
        @ptype:    {String}

        @param:    blast_output_file
        @pdef:     output file from BLAST.
        @ptype:    {String}

        @param:    self_hit
        @pdef:     when _True_ if the query is found in the database, it is
                   retrieved.
        @ptype:    {Boolean}

        @param:    hitid_format
        @pdef:     format of the name of the hit, as understood by
                   _BlastParser.hit_name_.
        @poptions: 'single' -> first word of the name,
                   'double' -> first two words of the hit name,
                   'all'    -> all the text in the hit name
        @ptype:    {String}

        @raises: {BlastError} if any added hit has badly fragmented segments
                 (its _are_segments_ok_ is _False_).
        @returns: {BlastResult}
        '''
        f = File(blast_output_file)
        try:
            s = BeautifulSoup(f.read())
        finally:
            # The parsed tree is all that is needed from here on.
            f.close()

        h = BlastHeader(version    = str(s.find('blastoutput_version').string),
                        matrix     = str(s.find('parameters_matrix').string),
                        gap_open   = int(s.find('parameters_gap-open').string),
                        gap_extend = int(s.find('parameters_gap-extend').string),
                        database   = str(s.find('blastoutput_db').string),
                        self_hit   = self_hit)
        b = BlastResult(query_name     = str(s.find('blastoutput_query-def').string),
                        query_sequence = query_sequence,
                        header         = h)

        SBIg.alert('debug', BlastParser(), b.str_blast_header())

        error_bool = False
        error_str  = []
        for iteration in s.find_all('iteration'):
            iternum = int(iteration.find('iteration_iter-num').string)
            for hit in iteration.find_all('hit'):
                hit_name  = BlastParser.hit_name(str(hit.find('hit_def').string), hitid_format)
                hit_lenth = int(hit.find("hit_len").string)
                for subhit in hit.find_all("hsp"):
                    data = BlastParser.parse_subhit(subhit)
                    r = BlastHit(hit            = [hit_name, hit_lenth],
                                 sequences      = [data['qs'], data['hs'], data['sc']],
                                 sequence_inits = [data['qp'], data['hp']],
                                 iteration      = iternum,
                                 stats          = [data['hi'], data['h+'],
                                                   data['hg'], data['ev']])
                    if not BlastParser.same_query_hit_names(b.query, hit_name, self_hit):
                        dbug_info = 'Added hit {0} in iteration {1}'
                        SBIg.alert('debug', BlastParser(), dbug_info.format(hit_name, r.iteration))
                        b.add_hit(r)
                        if not r.are_segments_ok:
                            # Keep parsing; all the problems are reported
                            # together once the file has been processed.
                            error_bool = True
                            error_str.append("Check the alignment's fragmentation")
                            error_str.append("for the query {0} with {1}\n".format(b.query, hit_name))
                            error_str.append("{0}\n".format(r))
        b.set_last_iteration()
        if error_bool:
            SBIg.warn(BlastParser(), error_str)
            be = BlastError()
            raise be.parse_error()
        return b