Example 1
    def __init__(self, fasta_file, auto_load=10):
        '''
        @param:    fasta_file
        @pdef:     name of the FASTA file.
        @ptype:    {String} or {File}

        @param:    auto_load
        @pdef:     maximum number of sequences to autoload.
        @pdefault: 10
        @ptype:    {Integer}
        '''
        if isinstance(fasta_file, basestring):
            self._file = File(file_name=fasta_file, action='r')
        elif isinstance(fasta_file, File):
            self._file = File(file_name=fasta_file.full, action='r')
        else:
            raise AttributeError('Check the input of the Fasta object')

        self._sequences = []
        self._sequenceID = {}

        self._total_sequences = 0
        self._loaded = False
        self._auto_load = auto_load
        self._check_multifasta()

        self._index_file = None
        self._check_index()
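
A minimal usage sketch (the file name is a placeholder; it assumes the Fasta class above is importable):

    # Hypothetical usage of the constructor above.
    fasta = Fasta('mysequences.fa', auto_load=50)
    # Per the docstring, auto_load caps how many sequences are loaded up front.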
Example 2
    def subset(self,
               sequence_ids,
               new_fasta_file,
               all_but=False,
               prefix_size=None,
               index=False,
               force=None):
        '''
        Creates a new {Fasta} with the requested subset of sequences.

        @param:    sequence_ids
        @pdef:     sequence identifier(s)
        @ptype:    {String}, {List} or {Set}

        @param:    new_fasta_file
        @pdef:     name of the new fasta file
        @ptype:    {String}

        @param:    all_but
        @pdef:     Flag. Instead of retrieving the given ids, we retrieve all
                   except the given ids.
        @pdefault: _False_
        @ptype:    {Boolean}

        @param:    prefix_size
        @pdef:     maximum characters for the prefix. If _None_, all the
                   characters are included.
        @pdefault: _None_
        @ptype:    {Integer}

        @param:    index
        @pdef:     also create the index file for the subset, if the source
                   has one
        @pdefault: _False_
        @ptype:    {Boolean}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @raises: {AttributeError} if sequence_ids is not a valid type.
        @return: {Fasta}
        '''
        sequences = self.retrieve(sequence_ids, all_but, prefix_size)
        fasta_file = Fasta.build_multifasta(new_fasta_file, sequences, force)
        if self.has_index and index:
            idxfile = File(self.index_file)
            newidx = File(fasta_file.file.full + '.idx', 'w')
            seqids = set(fasta_file.sequence_identifiers)
            for idx in idxfile.read():
                if idx.split()[0].strip('>') in seqids:
                    newidx.write(idx)
            idxfile.close()
            newidx.close()
            fasta_file.index_file = newidx.full
        return fasta_file
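
A usage sketch for subset (sequence ids and file names are made up):

    # Keep only two sequences; also subset the index file if one exists.
    sub = fasta.subset(['seq1', 'seq2'], 'subset.fa', index=True)
    # Keep everything EXCEPT those two ids.
    rest = fasta.subset(['seq1', 'seq2'], 'rest.fa', all_but=True)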
Example 3
    def _process(self, update=False):
        '''
        Transform the source files into the final local db files.

        @param:    update
        @pdef:     toggles between create and update processing
        @pdefault: _False_
        @ptype:    {Boolean}
        '''
        if update:
            old = self._RELEASE['total_items'].copy()
        j = 0
        for i in range(len(self._SOURCES)):
            dfilen = os.path.join(self.local, self._SOURCES[i])
            ofilen = os.path.join(self.local, self._MANDATORY_FILES[j])
            ffilen = os.path.join(self.local, self._MANDATORY_FILES[j + 1])
            if not os.path.isfile(dfilen):
                continue
            SBIg.alert('verbose', self, 'Parsing:       {0}'.format(dfilen))
            SBIg.alert('verbose', self, 'DB file to:    {0}'.format(ofilen))
            SBIg.alert('verbose', self, 'Fasta file to: {0}'.format(ffilen))
            dfile = File(dfilen)
            ofile = File(ofilen, 'w', update)
            ffile = File(ffilen, 'w', update)
            protein = None
            for protein in Connect._parse_uniprot(dfile):
                pname = protein.entry_name
                pvers = protein.version
                SBIg.alert('verbose', self, 'Protein: {0}'.format(pname))
                if not update:
                    self._RELEASE['total_items'][pname] = pvers
                else:
                    if pname not in self._RELEASE['total_items']:
                        self._RELEASE['new_items'][pname] = pvers
                    else:
                        del (old[pname])
                        if self._RELEASE['total_items'][pname] != pvers:
                            self._RELEASE['update_items'][pname] = pvers

                ffile.write(protein.sequence.format('FASTA') + '\n')
                ofile.write(protein.json() + '\n')
            j += 2
            dfile.close()
            ofile.close()
            ffile.close()

        if update:
            self._RELEASE['total_items'].update(self._RELEASE['new_items'])
            self._RELEASE['total_items'].update(self._RELEASE['update_items'])
            self._RELEASE['deleted_items'] = old
            for k in self._RELEASE['deleted_items']:
                del (self._RELEASE['total_items'][k])
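
The update branch above is a plain dictionary diff between the previous release and the entries seen while re-parsing; the same idea in isolation, on toy data:

    # Standalone sketch of the release diff performed above.
    old  = {'P1': 1, 'P2': 1, 'P3': 2}   # previous release: entry name -> version
    seen = {'P1': 1, 'P2': 2, 'P4': 1}   # entries found during the update pass
    new_items     = {k: v for k, v in seen.items() if k not in old}
    update_items  = {k: v for k, v in seen.items() if k in old and old[k] != v}
    deleted_items = {k: v for k, v in old.items() if k not in seen}
    # new_items == {'P4': 1}; update_items == {'P2': 2}; deleted_items == {'P3': 2}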
Example 4
    def make_PDBseq(self, log_file, resolution_threshold=None):
        if not self.has_local:
            raise NameError(
                'A local PDB database must be defined to create a PDBseq database.'
            )
        outdir = self.PDBseq if self.PDBseq is not None else os.curdir

        Path.mkdir(self.PDBseq)
        fasta_file = File(file_name=os.path.join(outdir, 'PDBseq.fa'),
                          action='w',
                          overwrite=True)
        fasta_fd = fasta_file.descriptor
        idx_file = File(file_name=os.path.join(outdir, 'PDBseq.fa.idx'),
                        action='w',
                        overwrite=True)
        idx_fd = idx_file.descriptor
        if resolution_threshold is not None:
            # Re-enabled from the commented-out original so that filtered_fd and
            # resolutions exist when a threshold is given (both are used below).
            filtered_file_name = self.get_PDBseq_filtered(resolution_threshold)
            filtered_file = File(file_name=filtered_file_name, action='w',
                                 overwrite=True)
            filtered_fd = filtered_file.descriptor
            resolutions = self.get_resolutions(resolution_threshold=resolution_threshold)
        log_file = File(file_name=log_file, action='w', overwrite=True)
        log_idx = log_file.descriptor

        for pdb_file in self.localPDBs:
            log_idx.write("Reading File: {0}\n".format(pdb_file))
            newPDB = PDB(pdb_file=pdb_file, dehydrate=True)
            fasta_idx = newPDB.FASTA_IDX(nucleotide=False)
            if len(fasta_idx['FASTA']) != len(fasta_idx['IDX']):
                log_idx.write(
                    'ERROR!!!!! Number of fastas and indexes are different for pdb {0}!!\n'
                    .format(newPDB.id))
            if len(fasta_idx['FASTA']) > 0:
                log_idx.write('\tPrinting FASTA and IDX...\n')
            else:
                log_idx.write('\tProbably just a nucleotide PDB...\n')
            for c in range(len(fasta_idx['FASTA'])):
                sequence = fasta_idx['FASTA'][c].split('\n')[1]
                sequence = sequence.replace('X', '').replace('x', '')
                if len(sequence) > 0:
                    fasta_fd.write(fasta_idx['FASTA'][c] + "\n")
                    if resolution_threshold is not None and newPDB.id in resolutions and not newPDB.is_all_ca:
                        filtered_fd.write(fasta_idx['FASTA'][c] + "\n")
                    idx_fd.write(fasta_idx['IDX'][c] + "\n")
            del (newPDB)

        # Close files and finish.
        fasta_file.close()
        idx_file.close()
        if resolution_threshold is not None:
            filtered_fd.close()
Example 5
    def build(file_name, sequence_id, sequence, force=None):
        '''
        Creates a Fasta object and a FASTA file from a sequence.

        @param:    file_name
        @pdef:     name of the fasta file (with path, if necessary)
        @ptype:    {String}

        @param:    sequence_id
        @pdef:     name of the sequence
        @ptype:    {String}

        @param:    sequence
        @pdef:     sequence
        @ptype:    {String} or {List}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @return: {Fasta}
        '''
        newFasta = File(file_name, 'w', overwrite=force)
        newSeq = Sequence(sequence_id=sequence_id, sequence=sequence)
        newFasta.write(newSeq.format('FASTA'))
        newFasta.close()
        return Fasta(fasta_file=newFasta.full, auto_load=0)
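
A usage sketch (names are made up; it assumes build is exposed as a static method of Fasta, as the call in Example 2 suggests):

    # Write a one-sequence FASTA file and get it back as a Fasta object.
    fa = Fasta.build('single.fa', 'seq1', 'MARNDCEQGHILKMFPSTWYV', force=True)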
Example 6
    def build_multifasta(file_name, sequence_list, force=None):
        '''
        Creates a Fasta object and a FASTA file. For multiple sequences.

        @param:    file_name
        @pdef:     name of the fasta file (with path, if necessary)
        @ptype:    {String}

        @param:    sequence_list
        @pdef:     list of sequences to create the FASTA from.
        @ptype:    {List} or {Set} of {Sequence}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @return: {Fasta}
        '''
        newFasta = File(file_name, 'w', overwrite=force)
        for sequence in sequence_list:
            newFasta.write(sequence.format('FASTA') + '\n')
        newFasta.close()
        fasta_file = Fasta(fasta_file=newFasta.full, auto_load=0)
        return fasta_file
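
A usage sketch (identifiers and sequences are made up):

    # Build a multi-FASTA from a list of Sequence objects.
    seqs = [Sequence(sequence_id='s1', sequence='MARND'),
            Sequence(sequence_id='s2', sequence='CEQGH')]
    fa = Fasta.build_multifasta('multi.fa', seqs, force=True)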
Example 7
    def _parse(self):
        file_fd    = File(self._dsspfile)
        read       = False
        continuity = -1000
        readline   = 0
        for line in file_fd.read():
            if line.startswith("  #  RESIDUE AA STRUCTURE BP1 BP2  ACC"):
                read = True
                continue
            if read:
                if line[13:14] != '!':
                    res_num = int(line[6:10].strip())
                    ss      = line[16:17] if line[16:17] != ' ' else '-'
                    buried  = int(line[35:38].strip())
                    aa      = line[13:15].strip()

                    self._dsspdata.append(DSSP(secondary_structure = ss,
                                               accessibility       = buried,
                                               amino               = aa))
                    self._dsspdata[-1].add_hydrogen_links(line[39:50],
                                                          line[50:61],
                                                          line[61:72],
                                                          line[72:84])
                    if readline > 0:
                        if res_num != continuity + 1:
                            self._gapped = True
                        continuity = res_num
                    readline += 1
                else:
                    msg = "truncated chain!{0}\n".format(self._dsspfile)
                    sys.stderr.write(msg)
                    SBIg.warn(self, msg)
                    self._gapped = True
        file_fd.close()
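
The slice offsets above follow the fixed-width layout of DSSP data lines (matching the header the parser waits for); for reference:

    # line[6:10]  -> residue number (PDB numbering)
    # line[13:15] -> amino acid one-letter code ('!' marks a chain break)
    # line[16:17] -> secondary structure code ('-' substituted when blank)
    # line[35:38] -> solvent accessibility (ACC column)
    # line[39:84] -> the four hydrogen-bond partner/energy fields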
Example 8
    def _save_release(self):
        '''
        Store the release data into a file.
        '''
        f = File(os.path.join(self.local, self._CONTROL_FILE), 'w', True)
        f.write(json.dumps(self._RELEASE))
        f.close()
Example 9
    def _process(self):
        enzymes = self._parse_enzclass() + self._parse_enzymedat()
        enzymes.sort()

        enzFile = File(self._enzfile, 'w', True)
        for e in enzymes:
            enzFile.write(repr(e) + "\n")
        enzFile.close()
Example 10
    def get_FASTA_IDX_by_names_to_file(self, names, outfile):

        fastafile = Fasta(self.PDBseq)
        selectedfasta = fastafile.retrieve(copy.deepcopy(names))
        output_fasta = File(outfile, 'w')
        for sequence in selectedfasta:
            output_fasta.write(sequence.format('FASTA') + "\n")
        output_fasta.close()
        idxfile = self.PDBseq + '.idx'
        output_idx = File(outfile + '.idx', 'w')
        input_idx = File(idxfile, 'r')
        for line in input_idx.descriptor:
            info = line.split()
            pdbname = info[0][1:]
            if pdbname in names:
                output_idx.write(line)
        input_idx.close()
        output_idx.close()
Example 11
    def format2file(self, filename, extension = 'pdb', center = False):
        if extension not in ('pdb', 'js'):
            raise AttributeError('Not accepted extension')
        structure = File('.'.join([filename, extension]), 'w')
        if extension == 'pdb':
            structure.write(self.pdb_format(center = center))
        elif extension == 'js':
            structure.write(self.js_format(center = center))
        structure.close()
Example 12
    def _process(self):

        targets = self._process_targets()
        drugs = self._process_drugs(targets)

        drugFile = File(self._drugfile, 'w', True)
        for d in drugs:
            drugFile.write(repr(d) + "\n")
        drugFile.close()
Example 13
    def _process(self):
        go_dic = {}
        parseFile = File(os.path.join(self.local, self._gfile), 'r')
        go = None
        for line in parseFile.descriptor:
            line = re.sub('\'', '\\\'', line)
            if line.startswith('[Term]'):
                if go is not None:
                    go_dic[go.id] = go
            if line.startswith('id:'):
                go = GOterm(id = line.split()[1].strip())
                continue
            if line.startswith('name:'):
                go.name = " ".join(line.split()[1:]).strip()
                continue
            if line.startswith('namespace:'):
                go.namespace = line.split()[1].strip()
                continue
            if line.startswith('alt_id:'):
                go.alt_id.append(line.split()[1].strip())
                continue
            if line.startswith('is_obsolete:'):
                go.obsolete = True
                continue
            if line.startswith('is_a:'):
                go.parents.add(line.split()[1].strip())
                continue
            if line.startswith('relationship:'):
                go.relations.append((line.split()[1].strip(),line.split()[2].strip()))
                continue
            if line.startswith('[Typedef]'):
                go_dic[go.id] = go
                break
        parseFile.close()

        for go in go_dic:
            go_dic[go].parents = self._search_parents(go_dic, go)

        goFile = File(self._gofile, 'w', True)
        for go in go_dic:
            go_dic[go].parents.add(go)
            goFile.write(str(go_dic[go]) + "\n")
        goFile.close()
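
For reference, the parser above expects [Term] stanzas in the OBO flat-file format, shaped roughly like this (illustrative):

    # [Term]
    # id: GO:0008150
    # name: biological_process
    # namespace: biological_process
    # alt_id: ...                       (optional, may repeat)
    # is_a: GO:xxxxxxx ! parent term    (may repeat)
    # relationship: part_of GO:xxxxxxx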
Example 14
    def read_compacted_blast(compacted_blast_file):
        '''
        Read data from a printed compacted blast into {BlastResult}.
        Not all options will be available in that new object.

        @param:    compacted_blast_file
        @pdef:     file of the compacted blast print
        @ptype:    {String}

        @return: {BlastResult}
        '''
        from BlastHit import BlastHit
        query_name, query_sequence = None, None
        version, matrix, database = None, None, None
        gap_open, gap_extend, self_hit = None, None, None

        br = None

        cbf = File(compacted_blast_file)
        for line in cbf.read():
            if line.startswith('#'):
                if line.startswith('#Query:'):
                    query_name = line.strip().split()[-1]
                if line.startswith('#Query Sequence:'):
                    query_sequence = line.strip().split()[-1]
                if line.startswith('#Blast Version:'):
                    version = line.strip().split()[-1]
                if line.startswith('#Search on matrix:'):
                    matrix = line.strip().split()[-1]
                if line.startswith('#Gap open penalty:'):
                    gap_open = line.strip().split()[-1]
                if line.startswith('#Gap extension penalty:'):
                    gap_extend = line.strip().split()[-1]
                if line.startswith('#Database searched:'):
                    database = line.strip().split()[-1]
                if line.startswith('#Self Hit is omitted:'):
                    self_hit = line.strip().split()[-1]
            else:
                if br is None:
                    if version is None:
                        bh = None
                    else:
                        bh = BlastHeader(version, matrix, gap_open, gap_extend,
                                         database, self_hit)
                    br = BlastResult(query_name, query_sequence, bh)
                d = line.strip().split()
                hit = BlastHit(
                    [d[2], d[3]], [d[8], d[9]],
                    [int(x) for x in d[10].split(',')[0].split(':')], 1,
                    [d[4], d[5], d[6], d[7]])
                br.add_hit(hit)
        cbf.close()

        return br
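
A usage sketch (the file name is hypothetical; it assumes read_compacted_blast is exposed as a static method of BlastResult, as its self-less signature suggests):

    # Rebuild a BlastResult from a previously printed compacted blast file.
    br = BlastResult.read_compacted_blast('query.compacted.blast')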
Example 15
    def pdb_file(self, value):
        """
        Sets a PDB file if none has been given
        @raise UsedAttributeError
        """
        if self._pdb_file is not None:
            raise AttributeError(
                "The PDB object is loaded from file {0}. To load the new file {1} create a new PDB object".format(self._pdb_file.full, value))

        if isinstance(value, File):
            self._pdb_file = value
        else:
            self._pdb_file = File(file_name=value, action='r')
Example 16
    def _process(self):
        inh = {}
        nodefile = File(file_name = self._nodes, action = 'r')
        for line in nodefile.descriptor:
            line = re.sub('\'', '\\\'', line)
            line_data = line.split('|')
            inh[line_data[0].strip()] = TaxID(line_data[0].strip())
            inh[line_data[0].strip()].parent = line_data[1].strip()
            inh[line_data[0].strip()].rank   = line_data[2].strip()
        nodefile.close()

        namefile = File(file_name = self._names, action = 'r')
        for line in namefile.descriptor:
            line = re.sub('\'', '\\\'', line)
            line_data = line.split('|')
            if line_data[3].strip() == 'scientific name':
                inh[line_data[0].strip()].name = line_data[1].strip()
        namefile.close()

        delefile = File(file_name = self._delet, action = 'r')
        for line in delefile.descriptor:
            data = line.split('|')
            inh[data[0].strip()]     = TaxID(data[0].strip())
            inh[data[0].strip()].old = True
        delefile.close()

        mrgefile = File(file_name = self._merged, action = 'r')
        for line in mrgefile.descriptor:
            data = line.split('|')
            inh[data[0].strip()]     = TaxID(data[0].strip())
            inh[data[0].strip()].old = True
            inh[data[0].strip()].new = data[1].strip()
        mrgefile.close()

        taxFile = File(self._taxid, 'w', True)
        for taxid in inh:
            taxFile.write(str(inh[taxid]) + "\n")
        taxFile.close()
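
The four inputs above are NCBI taxdump-style pipe-delimited dumps; illustrative records:

    # nodes.dmp   : 9606 | 9605 | species | ...     (tax_id | parent | rank)
    # names.dmp   : 9606 | Homo sapiens |  | scientific name |
    # delnodes.dmp: one deleted tax_id per line
    # merged.dmp  : old_tax_id | new_tax_id |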
Example 17
    def release(self):
        '''
        Retrieves release data for the database.
        This reflects when we downloaded it, not the DB's own release.

        @returns: {Dictionary}
        '''
        if os.path.isfile(os.path.join(self.local, self._CONTROL_FILE)):
            f = File(os.path.join(self.local, self._CONTROL_FILE))
            data = json.loads(f.read())
            f.close()
        else:
            data = self._RELEASE
        return data
Example 18
    def write(self, output_file=None, format='PDB', force=None, clean=False):
        """
        Writes the object in a specific format

        @type  output_file: String
        @param output_file: File to write

        @type  format: String
        @param format: Format of the file to print
        """
        outfile = File(
            file_name=output_file, action='w', overwrite=SBIg.decide_overwrite(force))
        if format == 'PDB':
            self._write_PDB_file(pdb_file=outfile, clean=clean)
Example 19
    def print_compacted_blast(self, out_file=None):
        '''
        Print the compacted format of the blast hit.

        @param:    out_file
        @pdef:     file to print the blast data into.
        @pdefault: _None_
        @ptype:    {String}
        '''
        if out_file is not None:
            output = File(out_file, 'w')
            output.write("%s\n" % self.str_compacted_blast())
            output.close()
        else:
            print self.str_compacted_blast()
Example 20
    def get_PDBeChem(self, chemID):
        if self.has_local:
            for chem_file in self.localPDBeChems:
                newfile = File(file_name = chem_file, action = 'r')
                if newfile.prefix.upper() == chemID.upper():
                    return chem_file

        # If we do not find it in local (or we do not have a local) we search it on the FTP
        chem_file = chemID.upper() + '.cif'
        source = PDBeChemftp['single'] + chem_file
        try:
            urllib.urlretrieve(source, chem_file)
        except:
            return False
        return os.path.abspath(chem_file)
Example 21
    def get_PDBeChems(self, chemIDset):
        if isinstance(chemIDset, str):
            warnings.warn('For single PDBeChem searches, the get_PDBeChem function is recommended.')
            yield self.get_PDBeChem(chemIDset)
            return
        else:
            chemIDset = set([x.upper() for x in chemIDset])

        if self.has_local:
            for chem_file in self.localPDBeChems:
                newfile = File(file_name = chem_file, action = 'r')
                if newfile.prefix.lstrip('pdb').upper() in chemIDset:
                    yield chem_file
        else:
            for chemID in chemIDset:
                yield self.get_PDBeChem(chemID)
Example 22
    def items(self):
        '''
        Loops through the items of the database

        @yields: Object depending on the database.
        '''
        if not self.has_local:
            SBIg.throw(self, 'A local database needs to be built first',
                       IOError)

        for ifile in self._ITEM_FILES:
            ifile = os.path.join(self.local, ifile)
            f = File(ifile)
            for line in f.read():
                yield self._DBOBJECT.grab(line.strip())
            f.close()
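
A usage sketch (db stands for any concrete database object exposing this interface):

    # Iterate every stored object of a locally built database.
    for entry in db.items():
        print(entry)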
Example 23
    def get_PDBs(self, pdbIDset):
        if isinstance(pdbIDset, str):
            warnings.warn(
                'For single PDB searches, the get_PDB function is recommended.')
            yield self.get_PDB(pdbIDset)
            return
        else:
            pdbIDset = set([x.upper() for x in pdbIDset])

        if self.has_local:
            for pdb_file in self.localPDBs:
                newfile = File(file_name=pdb_file, action='r')
                if newfile.prefix.lstrip('pdb').upper() in pdbIDset:
                    yield pdb_file
        else:
            for pdbID in pdbIDset:
                yield self.get_PDB(pdbID)
Example 24
    def __init__(self, cif_file):
        self._file    = File(file_name = cif_file, action = 'r')
        self.__name__ = 'databases.PDBeChem'    # This must be included in every class for the SBIglobals.alert()

        self._id        = None
        self._name      = None
        self._type      = None
        self._formula   = None
        self._parent    = None
        self._weight    = None
        self._fcharge   = None
        self._code1l    = None
        self._flformula = {}

        self._parse()
        self._decompose_formula()
Example 25
    def get_PDB(self, pdbID):
        if self.has_local:
            rootdir = os.path.join(self.local, pdbID.lower()[1:3])
            for pdb_file in Path.list_files(root=rootdir, pattern='*.ent.gz'):
                newfile = File(file_name=pdb_file, action='r')
                if newfile.prefix.lstrip('pdb').upper() == pdbID.upper():
                    return pdb_file

        # If we do not find it locally (or we have no local copy), we search the FTP
        pdb_file = 'pdb' + pdbID.lower() + '.ent.gz'
        source = 'ftp://' + PDBftp['address'] + os.path.join(
            PDBftp['structures'], pdbID[1:3].lower(), pdb_file)
        try:
            urllib.urlretrieve(source, pdb_file)
        except:
            return False
        return os.path.abspath(pdb_file)
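
A usage sketch (pdblink stands for the local-PDB link object this method belongs to; 1a3n is just an example id):

    # Returns a local path when the entry is mirrored, downloads it otherwise,
    # and returns False when the download fails.
    path = pdblink.get_PDB('1a3n')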
Example 26
    def __init__(self, cdhit_file=None):
        '''
        @param:    cdhit_file
        @pdef:     name of the cd-hit output file
        @pdefault: _None_. Create an empty list
        @ptype:    {String}

        '''
        self._clusters = []
        self._allseqids = {}
        if cdhit_file is not None:
            self._file = File(file_name=cdhit_file)
        else:
            self._file = None

        if self._file is not None:
            self._parse_file()
Example 27
    def print_representation(self, line_split=160, out_file=None):
        '''
        Print the alignment representation of the blast hit.

        @param:    line_split
        @pdef:     number of characters per line
        @pdefault: 160
        @ptype:    {Integer}

        @param:    out_file
        @pdef:     file to print the blast data into.
        @pdefault: _None_
        @ptype:    {String}
        '''
        if out_file is not None:
            output = File(out_file, 'w')
            output.write("%s\n" % self.str_representation(line_split))
            output.close()
        else:
            print self.str_representation(line_split)
Example 28
    def get_resolutions(self):
        # a resolution of -1 is used for methods that do not define resolution
        resolutions = {}

        ftp = ftplib.FTP(PDBftp['address'])
        ftp.login()
        ftp.cwd(PDBftp['derived'])
        resoluIDX = []
        ftp.retrlines('RETR ' + PDBftp['resolution'], resoluIDX.append)
        ftp.quit()

        SBIglobals.alert('debug', self,
                         'Retrieving resolution data from PDB FTP...')

        active = False
        for line in resoluIDX:
            if line.startswith('-'):
                active = True
                continue
            if active and len(line.strip()) > 0:
                data = [x.strip() for x in line.split(';')]
                if len(data[1]) > 0:
                    SBIglobals.alert(
                        'debug', self,
                        '\tResolution for {0[0]} is {0[1]}...'.format(data))
                    # if resolution_threshold is None:
                    resolutions[data[0]] = data[1]

        # rsync is cumulative; we may keep structures that are no longer in resolu.idx, so check
        for pdb_file in self.localPDBs:
            newfile = File(file_name=pdb_file, action='r')
            pdbid = newfile.prefix.lstrip('pdb').upper()
            if pdbid not in resolutions:
                pdbobj = PDB(pdb_file=pdb_file, header=True, onlyheader=True)
                SBIglobals.alert(
                    'debug', self,
                    '\tGrabbing Resolution for {0} is {1}...'.format(
                        pdbid, pdbobj.header.resolution))
                resolutions[pdbid] = pdbobj.header.resolution

        return resolutions
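
A usage sketch (pdblink is hypothetical, as above):

    # Map PDB ids to their resolution; -1 marks methods without one.
    resolutions = pdblink.get_resolutions()
    print(resolutions.get('1A3N'))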
Example 29
    def _process(self):
        tmoFile = File(self._pdbtmfile, 'w', True)
        for xmlfile in Path.list_files(
                os.path.join(self._local, 'pdbtm/database/'), '*.xml'):
            xmldata = TM(
                pdb=os.path.splitext(os.path.split(xmlfile)[1])[0].upper())
            skip_chains = set()
            read = False
            fdxml = open(xmlfile)
            for line in fdxml:
                if line.startswith('    <TMRES>'): xmldata.tmres = line
                elif line.startswith('    <TMTYPE'): xmldata.tmtype = line
                elif line.startswith('    <PDBKWRES'): xmldata.kwres = line
                elif line.startswith('  <SIDEDEFINITION'):
                    m = re.search('Side1="(\S+)"', line)
                    xmldata.side = m.group(1)
                elif line.startswith('      <APPLY_TO_CHAIN'):
                    m = re.search('NEW_CHAINID=\"(\S{1})\"', line)
                    if m: skip_chains.add(m.group(1))
                elif line.startswith('  <CHAIN '):
                    m = re.search(
                        'CHAINID=\"(\S{1})\" NUM_TM=\"(\d{1})\" TYPE=\"(\S+)\"',
                        line)
                    if m:
                        chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                        if chain not in skip_chains:
                            cdata = tuple([chain, num, tmtype])
                            xmldata.set_chain(cdata)
                            read = True
                elif line.startswith('    <REGION ') and read:
                    m = re.search(
                        'pdb_beg=\"(\-*\d+\w*)\"[\s\S]+pdb_end=\"(\-*\d+\w*)\"\s+type=\"(\w{1})\"',
                        line)
                    ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                    xmldata.set_chain(cdata, tuple([ini, end, tmtype]))
                elif line.startswith('  </CHAIN>'):
                    read = False
            fdxml.close()
            if len(xmldata.chains) > 0:
                tmoFile.write(str(xmldata) + "\n")
        tmoFile.close()
Example 30
    def reduce(self, new_fasta_file, list_file, force=None):
        '''
        Reduces the {Fasta} by removing identical sequences.

        @param:    new_fasta_file
        @pdef:     name of the new fasta file
        @ptype:    {String}

        @param:    list_file
        @pdef:     name of the repetition list file
        @ptype:    {String}

        @param:    force
        @pdef:     overwrite previous files with the same name
        @pdefault: _SBIglobals.overwrite_
        @ptype:    {Boolean}

        @return: {Fasta} and {File} with the list of identical sequences.
        '''
        seq_md5 = {}
        sequences = []
        for seq in self.live_show():
            md5 = seq.md5
            if not md5 in seq_md5:
                sequences.append(seq)
                seq_md5.setdefault(md5, [])
            else:
                SBIg.alert(
                    'debug', self,
                    '{0} repeats of {1}'.format(seq.id, seq_md5[md5][0]))
            seq_md5[md5].append(seq.id)
        fasta = Fasta.build_multifasta(new_fasta_file, sequences, force)
        listfile = File(list_file, 'w')
        for md5 in seq_md5:
            listfile.write('\t'.join(seq_md5[md5]) + '\n')
        listfile.close()

        return fasta, listfile
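
A usage sketch (file names are made up):

    # Collapse identical sequences; the list file groups the ids that shared an MD5.
    nr_fasta, repeats = fasta.reduce('nr.fa', 'repeats.lst', force=True)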