Exemple #1
0
 def saveNexus0(self, oFileHandle, datatype='protein', gap='-', interleave=False, width=60):
     """Write the alignment as a NEXUS file with a single ``data`` block.

     @param oFileHandle: Output filename or file.
     @param datatype: One of dna, rna, protein, standard or restriction
         (compared case-insensitively).
     @param gap: Gap character declared on the FORMAT line.
     @param interleave: If true, write the matrix in blocks of C{width}
         columns; otherwise one row per sequence.
     @param width: Number of columns per block when interleaving.
     """
     assert datatype.lower() in ['dna', 'rna', 'protein', 'standard', 'restriction']
     yesno = {True: 'yes', False: 'no'}
     nChar = self.__len__()
     # max() of an empty list raises ValueError; fall back to no padding so
     # an alignment without sequences can still be written.
     nameWidth = max([len(name) for name in self.seqDict] or [0])
     rowFormat = '%%-%is %%s' % nameWidth

     oFile = smartopen(oFileHandle, 'w')
     try:
         print >> oFile, '#NEXUS'
         print >> oFile, 'BEGIN data;'
         print >> oFile, 'DIMENSIONS ntax=%i nchar=%i;' % (self.numberOfSeqs(), nChar)
         print >> oFile, 'FORMAT datatype=%s interleave=%s gap=%s;' \
             % (datatype, yesno[interleave], gap)
         print >> oFile

         print >> oFile, 'MATRIX'
         if interleave:
             for i in xrange(0, nChar, width):
                 for name in self.order:
                     print >> oFile, rowFormat % (name, self.seqDict[name][i:i+width])
                 print >> oFile
         else:
             for name in self.order:
                 print >> oFile, rowFormat % (name, self.seqDict[name])
             print >> oFile
         print >> oFile, ';'

         print >> oFile, 'END;'
     finally:
         # Release the handle even when a write fails part-way through.
         oFile.close()
Exemple #2
0
 def loadPhylip(iFileHandle, multipleDatasets=False):
     """
     Load phylip alignment data.

     The first line holds the number of sequences and the alignment
     length.  The first block has one "name sequence" line per sequence;
     later blocks hold sequence data only, in the same order.  One line
     is skipped after every block.

     @param iFileHandle: Input filename or file.
     @param multipleDatasets: Do not close file (default False)
     @return: Alignment holding the sequences that were read.
     """
     iFile = smartopen(iFileHandle)
     try:
         nSeq,L = [int(x) for x in iFile.readline().strip().split()]

         alignment = Alignment()
         for i in xrange(nSeq):
             tokens = iFile.readline().strip().split()
             alignment.append(tokens[0], ''.join(tokens[1:]))
         iFile.readline() # separator line after the first block

         # len(alignment) after the first block is used as the block width.
         width = len(alignment)
         if width:
             nBlocks = int(math.ceil(float(L)/width))
         else:
             # Nothing was read, so there are no further blocks; this
             # also avoids dividing by zero.
             nBlocks = 0

         for j in xrange(nBlocks-1):
             for i in xrange(nSeq):
                 line = iFile.readline().strip()
                 alignment.append(alignment.getName(i), line, cleanup=True)
             # readline() returns '' at end of file rather than raising,
             # so no exception handler is needed here.
             iFile.readline()
     finally:
         # Close on failure as well as on success, unless the caller
         # wants to keep reading further datasets from the same file.
         if not multipleDatasets: iFile.close()

     return alignment
Exemple #3
0
def load(iFilename, offset=0):
    """Load a BED file.

    Blank lines and lines starting with '#' are skipped; every other line
    is split on tabs and turned into a Feature.

    @param iFilename: Input filename or file.
    @param offset: Offset subtracted from positions (Default: 0).
    @return: List of features.
    """
    features = []
    for rawLine in smartopen(iFilename):
        record = rawLine.strip()
        if not record or record[0] == "#":
            continue

        feature = Feature(record.split("\t"))

        # Best effort: shifting stops silently at the first coordinate
        # that cannot be adjusted.
        try:
            feature.chromStart -= offset
            feature.chromEnd -= offset
            feature.thickStart -= offset
            feature.thickEnd -= offset
        except:
            pass

        features.append(feature)
    return features
Exemple #4
0
 def saveMolphy(self, oFileHandle, width=60):
     """Write the alignment with one name line per sequence followed by
     the sequence wrapped at C{width} characters.

     The first line holds the number of sequences and the alignment
     length.

     @param oFileHandle: Output filename or file.
     @param width: Maximum number of characters per sequence line.
     """
     oFile = smartopen(oFileHandle, 'w')
     try:
         L = self.__len__()
         print >> oFile, '%i %i' % (self.numberOfSeqs(), L)

         for name in self.order:
             print >> oFile, name
             seq = self.seqDict[name]
             for i in xrange(0, L, width):
                 print >> oFile, seq[i:i+width]
     finally:
         # Release the handle even when a write fails part-way through.
         oFile.close()
Exemple #5
0
def load2(iFileHandle):
    """Load genes and their exons.

    Lines whose first character is '#' and whitespace-only lines are
    ignored.  A line whose first character is '<' or '>' starts a new
    gene; any other line is parsed as an exon and added to the most
    recently started gene.

    @param iFileHandle: Input file or filename.
    @return: List of genes in file order.
    """
    genes = []
    for line in smartopen(iFileHandle):
        tokens = line.strip().split()
        # Lines read from a file keep their newline, so the raw line is
        # never empty; test the tokens to recognise blank lines.
        if not tokens or line[0]=='#':
            continue
        elif line[0] in ['<', '>']:
            genes.append(Gene.fromTokens(tokens))
        else:
            genes[-1].add(Exon.fromTokens(tokens))
    # The list was previously built and then discarded.
    return genes
 def __init__(self, fileHandle, mode='w', width=60, blockSize=None, **kw):
     """
     @param iFileHandle: Output file or name
     @keyword mode: File mode - write(w) or append(a)
     """
     assert mode in ('w', 'a')
     self.iFile = smartopen(fileHandle, mode)
     self.iFilename = self.iFile.name
     
     if kw: 
         print 'Uncaptured keywords'
         print kw
Exemple #7
0
def load(iFileHandle):
    """Load WIG file.
    
    The first line is returned unchanged as the header.  Every following
    non-blank line must hold a single number.
    
    @param iFileHandle: Input file or filename
    @return: (header, data)
    @raise ValueError: If a non-blank data line is not a number
    """
    iFile = smartopen(iFileHandle)
    header = iFile.readline()
    data = []
    for line in iFile:
        line = line.strip()
        # float('') raises ValueError, so skip blank lines instead of
        # failing on them.
        if line:
            data.append(float(line))
    return header,data
Exemple #8
0
def load_iter(iFileHandle, skip=5, splitOn=None):
    """Iterate over the records of a TRANSFAC match output file.
    
    Arguments:
    iFileHandle -- Input file or filename.
    skip -- Number of leading lines to discard (default 5).
    splitOn -- Separator passed to str.split (default None: whitespace).
    
    Yields one Match per remaining line.
    
    """
    handle = smartopen(iFileHandle)
    
    # Discard the leading lines
    for _ in xrange(skip):
        handle.next()
    
    for rawLine in handle:
        fields = rawLine.strip().split(splitOn)
        yield Match(fields)
Exemple #9
0
def load_iter(iFileHandle):
    """Iterate over genes, yielding each one once all its exons are read.

    Lines whose first character is '#' and whitespace-only lines are
    ignored.  A line whose first character is '<' or '>' starts a new
    gene; any other line is parsed as an exon of the current gene.
    Input without any gene line yields nothing.

    @param iFileHandle: Input file or filename.
    @return: Generator of genes in file order.
    """
    gene = None
    for line in smartopen(iFileHandle):
        tokens = line.strip().split()
        # Lines read from a file keep their newline, so the raw line is
        # never empty; test the tokens to recognise blank lines.
        if not tokens or line[0]=='#':
            continue
        elif line[0] in ['<', '>']:
            # A new gene begins: hand over the finished one, if any
            if gene is not None:
                yield gene
            gene = Gene.fromTokens(tokens)
        else:
            gene.add(Exon.fromTokens(tokens))
    # Only yield a trailing gene if one was started; the unconditional
    # yield failed with UnboundLocalError on input without gene lines.
    if gene is not None:
        yield gene
Exemple #10
0
 def savePhylip(self, oFileHandle, width=10, **kw):
     """Write the alignment in interleaved blocks of 50 columns, each
     row split into groups of 10.

     The header line holds the number of sequences and the alignment
     length.  Names, cut to C{width} characters, appear in the first
     block only; later blocks carry a blank label of the same width.

     @param oFileHandle: Output filename or file.
     @param width: Width of the name column.
     """
     oFile = smartopen(oFileHandle, 'w')
     try:
         L = self.__len__()
         print >> oFile, '%7i%7i' % (self.numberOfSeqs(), L)

         labelFormat = '%%-%is' % width
         for i in xrange(0,L,50):
             for name in self.order:
                 if i==0:
                     label = name[0:width]
                 else:
                     label = ''

                 # A trailing comma keeps print on the same output line,
                 # separating successive items with a single space.
                 print >> oFile, labelFormat % label,
                 for j in xrange(0,50,10):
                     print >> oFile, self.seqDict[name][i+j:i+j+10],
                 print >> oFile
             print >> oFile
     finally:
         # Release the handle even when a write fails part-way through.
         oFile.close()
Exemple #11
0
 def __init__(self, iFileHandle, force=False, section=None, **kw):
     """Constructor
     
     Opens the CAP3 file, builds its index and applies the section choice.
     
     @param iFileHandle: CAP3 file name or object
     @param force: Passed on to the index file's build()
     @param section: 1 sets stopAtMiddle; 2 calls self.seek(0,2);
         the value itself is kept in _section
     """
     handle = smartopen(iFileHandle)
     self.iFile = handle
     self.iFilename = handle.name
     
     index = CapIndexFile(self.iFilename)
     self.indexFile = index
     index.build(force=force)
     
     self.stopAtMiddle = bool(section==1)
     if not self.stopAtMiddle and section==2:
         self.seek(0,2)
     
     self._iter = None
     self._initIter = True
     self._section = section
Exemple #12
0
def load_iter(iFileHandle, format='psl', **kw):
    """Return an iterator to the BLAT file.
    
    @param iFileHandle: Input filename or file.
    @param format: BLAT output format (optional; default: 'psl')
    @keyword skip: Number of leading lines to discard (default: 5).
        All other keywords are passed on to Chain.
    @raise ValueError: If format is not 'psl'.  As this is a generator,
        the error surfaces when iteration starts.
    """
    if format not in ['psl']:
        # String exceptions are rejected by Python 2.6 and later, which
        # turned the old raise into an unrelated TypeError.
        raise ValueError('Only psl is currently supported.')
    
    iFile = smartopen(iFileHandle)
    
    skip = kw.pop('skip', 5)
    for i in xrange(skip):
        iFile.readline()
    
    for line in iFile:
        line = line.strip()
        # Blank lines hold no record, so skip them rather than passing
        # a single empty token to Chain.
        if line:
            tokens = line.split('\t')
            yield Chain(tokens, **kw)
Exemple #13
0
 def saveNexus(self, oFileHandle, datatype='protein', gap='-', interleave=False, width=60):
     """Write the alignment as NEXUS with separate Taxa and Characters
     blocks.

     @param oFileHandle: Output filename or file.
     @param datatype: One of dna, rna, protein, standard or restriction
         (compared case-insensitively).
     @param gap: Gap character declared in the FORMAT command.
     @param interleave: If true, write the matrix in blocks of C{width}
         columns; otherwise one row per sequence.
     @param width: Number of columns per block when interleaving.
     """
     assert datatype.lower() in ['dna', 'rna', 'protein', 'standard', 'restriction']
     yesno = {True: 'yes', False: 'no'}
     nChar = self.__len__()
     # max() of an empty list raises ValueError; fall back to no padding so
     # an alignment without sequences can still be written.
     nameWidth = max([len(name) for name in self.seqDict] or [0])
     rowFormat = "%%-%is %%s" % nameWidth

     oFile = smartopen(oFileHandle, 'w')
     try:
         print >> oFile, '#nexus'
         print >> oFile
         print >> oFile, 'BEGIN Taxa;'
         print >> oFile, 'DIMENSIONS ntax=%i;' % self.numberOfSeqs()
         print >> oFile, 'TAXLABELS'
         for i,name in enumerate(self.order):
             print >> oFile, "[%i] '%s'" % (i+1,name)
         print >> oFile, ';'
         print >> oFile, 'END; [Taxa]'
         print >> oFile

         print >> oFile, 'BEGIN Characters;'
         print >> oFile, 'DIMENSIONS nchar=%i;' % nChar
         print >> oFile, 'FORMAT'
         print >> oFile, '        datatype=%s' % datatype
         print >> oFile, '        missing=?'
         print >> oFile, '        gap=%s' % gap
         # NOTE(review): the symbols list is fixed whatever datatype is
         # given - confirm this is intended for non-protein output.
         print >> oFile, '        symbols="a r n d c q e g h i l k m f p s t w y v z"'
         print >> oFile, '        labels=left'
         print >> oFile, '        transpose=no'
         print >> oFile, '        interleave=%s' % yesno[interleave]
         print >> oFile, ';'

         print >> oFile, 'MATRIX'
         if interleave:
             for i in xrange(0, nChar, width):
                 for name in self.order:
                     print >> oFile, rowFormat % (name, self.seqDict[name][i:i+width])
                 print >> oFile
         else:
             for name in self.order:
                 print >> oFile, rowFormat % (name, self.seqDict[name])
             print >> oFile
         print >> oFile, ';'

         print >> oFile, 'END;'
     finally:
         # Release the handle even when a write fails part-way through.
         oFile.close()
Exemple #14
0
 def loadClustal(iFileHandle, headerCheck=True):
     """
     Load clustal alignment data.

     After the header line, every line that does not start with a space
     and splits into exactly two tokens is appended to the alignment as
     (name, sequence).  All other lines are ignored.

     @param iFileHandle: Input filename or file.
     @param headerCheck: Test that CLUSTAL appears on first line (default True)
     @return: Alignment holding the sequences that were read.
     @raise ValueError: If headerCheck is set and the first line does not
         start with CLUSTAL or MUSCLE.
     """
     iFile = smartopen(iFileHandle)
     try:
         headerTokens = iFile.readline().split()
         # An empty first line has no tokens; treat it as a failed check
         # rather than letting the index lookup raise IndexError.
         if headerCheck and (not headerTokens
                 or headerTokens[0] not in ['CLUSTAL', 'MUSCLE']):
             # String exceptions are rejected by Python 2.6 and later.
             raise ValueError('Not a CLUSTAL file')

         alignment = Alignment()
         for line in iFile:
             if line[0]!=' ':
                 tokens = line.strip().split()
                 if len(tokens)==2:
                     alignment.append(tokens[0], tokens[1])
     finally:
         # Close on failure as well as on success.
         iFile.close()
     return alignment
Exemple #15
0
 def saveClustal(self, oFileHandle, nameWidth=None, width=60, interleaved=True, **kw):
     """Write the alignment in CLUSTAL layout.

     Each block of rows is followed by a line of spaces (where a
     conservation line would go) and an empty line.

     @param oFileHandle: Output filename or file.
     @param nameWidth: Width of the name column; names are cut to this
         length.  Defaults to the longest name in the alignment.
     @param width: Number of columns per block when interleaved.
     @param interleaved: If true, write blocks of C{width} columns;
         otherwise one row per sequence.
     """
     if not nameWidth:
         # max() of an empty list raises ValueError; fall back to zero
         # so an alignment without sequences can still be written.
         nameWidth = max([len(name) for name in self.order] or [0])
     rowFormat = '%%-%is %%s' % nameWidth
     L = self.__len__()

     oFile = smartopen(oFileHandle, 'w')
     try:
         print >> oFile, 'CLUSTAL W (1.83) multiple sequence alignment\n\n'

         if interleaved:
             for i in xrange(0,L,width):
                 for name in self.order:
                     print >> oFile, rowFormat % (name[0:nameWidth], self.seqDict[name][i:i+width])
                 print >> oFile, rowFormat % (' '*nameWidth, ' '*width)
                 print >> oFile
         else:
             lastName = None
             for name in self.order:
                 print >> oFile, rowFormat % (name[0:nameWidth], self.seqDict[name])
                 lastName = name
             # The spacer line is as long as the last sequence written.
             # Track that sequence explicitly: the loop variable does not
             # exist when there are no sequences.
             if lastName is not None:
                 print >> oFile, rowFormat % (' '*nameWidth, ' '*len(self.seqDict[lastName]))
             print >> oFile
     finally:
         # Release the handle even when a write fails part-way through.
         oFile.close()
Exemple #16
0
 def __init__(self, iFileHandle, clobber=False, 
     interface=Interface.CONTAINER, method=IndexMethod.SQLITE, **kw):
     """
     @param iFileHandle: Fasta file name or object
     """
     self.iFile = smartopen(iFileHandle)
     self.iFilename = self.iFile.name
     if method==IndexMethod.PICKLE:
         self.indexFile = FastaIndexPickleFile(self.iFilename)
     elif method==IndexMethod.TEXT:
         self.indexFile = FastaIndexTextFile(self.iFilename)
     else: # sqlite3 method is the default
         self.indexFile = FastaIndexFile(self.iFilename)
     
     self.indexFile.build(clobber=clobber)
     self.interface = interface
     self._iter = None
     self._initIter = True
     
     if kw: 
         print 'Uncaptured keywords'
         print kw
Exemple #17
0
    def loadStockholm(iFileHandle, **kw):
        """Load Stockholm alignment data.

        Everything up to and including the '# STOCKHOLM 1.0' line is
        discarded.  After that, '#=GS' lines are stored in the
        alignment's C{headers} dictionary, other '#' lines and blank
        lines are ignored, '//' ends the data, and every remaining line
        is read as "name sequence".

        @param iFileHandle: Input filename or file.
        @returns: an Alignment with an added C{headers} dictionary
        """
        handle = smartopen(iFileHandle)
        result = Alignment()
        result.headers = {}

        # Consume lines until the Stockholm marker has been read
        for rawLine in handle:
            if rawLine.strip() == '# STOCKHOLM 1.0':
                break

        for rawLine in handle:
            record = rawLine.strip()
            if not record:
                continue
            if record == '//':  # End of file
                break
            if record[0] == '#':
                if record[0:4] == '#=GS':
                    # Keep the line without its leading '#'.
                    # NOTE(review): the key is the first token of that
                    # text, i.e. '=GS' for every such line - confirm
                    # this is the intended key.
                    header = record[1:]
                    result.headers[header.split()[0]] = header
                continue

            # Sequence data
            name, seq = record.split()
            result.append(name, seq)
        return result
Exemple #18
0
def load_preprocessed(iFileHandle):
    """Load genscan predictions from a file that has been reduced to the
    gene prediction lines.
    
    Blank lines and lines containing 'Slice no. ' are skipped.
    
    Arguments:
    iFileHandle -- Input file or filename.
    
    Return values:
    data -- Annotation data (a list of lists, one list per gene, ordered
            by gene number)
    
    """
    skipMarker = 'Slice no. '
    byGene = {}
    
    for rawLine in smartopen(iFileHandle):
        text = rawLine.strip()
        if not text or skipMarker in text:
            continue
        
        prediction = Predicted(text.split())
        # gene_exon looks like "<gene>.<exon>"; group on the gene part
        geneNumber = int(prediction.gene_exon.split('.')[0])
        byGene.setdefault(geneNumber, []).append(prediction)
    
    return [byGene[geneNumber] for geneNumber in sorted(byGene)]
Exemple #19
0
 def loadStockholm(iFileHandle, **kw):
     """Load Stockholm alignment data.
     
     Everything up to and including the '# STOCKHOLM 1.0' line is
     discarded.  After that, '#=GS' lines are stored in the alignment's
     C{headers} dictionary, other '#' lines and blank lines are ignored,
     '//' ends the data, and every remaining line is read as
     "name sequence".
     
     @param iFileHandle: Input filename or file.
     @returns: an Alignment with an added C{headers} dictionary
     """
     handle = smartopen(iFileHandle)
     result = Alignment()
     result.headers = {}
     
     # Consume lines until the Stockholm marker has been read
     for rawLine in handle:
         if rawLine.strip()=='# STOCKHOLM 1.0':
             break
     
     for rawLine in handle:
         record = rawLine.strip()
         if not record:
             continue
         if record=='//': # End of file
             break
         if record[0]=='#':
             if record[0:4]=='#=GS':
                 # Keep the line without its leading '#'.
                 # NOTE(review): the key is the first token of that text,
                 # i.e. '=GS' for every such line - confirm this is the
                 # intended key.
                 header = record[1:]
                 result.headers[header.split()[0]] = header
             continue
         
         # Sequence data
         name,seq = record.split()
         result.append(name, seq)
     return result
Exemple #20
0
 def __init__(self, iFileHandle1, iFileHandle2):
     """Open both inputs and keep the resulting file objects.

     @param iFileHandle1: First input file or filename
     @param iFileHandle2: Second input file or filename
     """
     first = smartopen(iFileHandle1)
     self.iFile1 = first
     second = smartopen(iFileHandle2)
     self.iFile2 = second
Exemple #21
0
def load_full(iFileHandle):
    """Load genscan predictions.
    
    Arguments:
    iFileHandle -- Input file or filename.
    
    Return values:
    data -- Annotation data (a list of lists, one list per gene, ordered
            by gene number)
    proteins -- Result of fasta.load_mfa on the rest of the file once the
                peptide section has been reached; otherwise an empty list
    meta -- Non-blank lines collected from the 'GENSCAN 1.0' line up to
            the next section marker
    
    If a line follows the 'NO EXONS/GENES PREDICTED IN SEQUENCE' message,
    ([], [], '') is returned.
    
    """
    startPredState = '----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------'
    endPredState = 'Predicted peptide sequence(s):'
    skipState = 'Slice no. '
    metaState = 'GENSCAN 1.0'
    failMessage = 'NO EXONS/GENES PREDICTED IN SEQUENCE'

    handle = smartopen(iFileHandle)
    byGene = {}
    proteins = []
    meta = []

    state = None
    for rawLine in handle:
        text = rawLine.strip()

        # The line announcing the program version is itself part of the
        # meta data, so this test does not end processing of the line.
        if metaState in text:
            state = 'meta'

        # Section markers only switch state
        if text == startPredState:
            state = 'pred'
            continue
        if text == failMessage:
            state = 'fail'
            continue
        if text == endPredState:
            state = 'prot'
            continue
        if skipState in text:
            state = 'skip'
            continue

        # Ordinary line: act according to the current section
        if state == 'prot':
            break
        if state == 'fail':
            return [], [], ''
        if not text:
            continue
        if state == 'meta':
            meta.append(text)
        elif state == 'pred':
            prediction = Predicted(text.split())
            geneNumber = int(prediction.gene_exon.split('.')[0])
            byGene.setdefault(geneNumber, []).append(prediction)

    if state == 'prot':
        proteins = fasta.load_mfa(handle)

    data = [byGene[geneNumber] for geneNumber in sorted(byGene)]

    return data, proteins, meta
Exemple #22
0
    def load(iFileHandle):
        """Parse a file of '#:lav' sections into an Alignment.

        The file is read line by line with a small state machine.  The
        first character of a line selects the stanza that follows
        ('d', 's', 'h', 'a', 'x' or 'm'); '}' closes it.  Lines inside a
        stanza are handled according to the current state.

        @param iFileHandle: Input file or filename.
        @return: Alignment with C{matrix} joined into one string and one
            entry in C{chains} per alignment stanza.
        @raise Exception: If a data line is met before any section or
            stanza marker.
        """
        iFile = smartopen(iFileHandle)
        # assumes Alignment() starts with list-valued matrix and chains
        # attributes (both are appended to below) - TODO confirm
        aln = Alignment()
        state = None
        for line in iFile:
            # Only trailing whitespace is removed: the tests on line[0]
            # below look at the original first character.
            line = line.rstrip()
            if not line:
                continue
            elif line == '#:lav':  # Section break
                state = 'BLOCK'
                block = Block()
                continue
            elif line == '#:eof':  # End of file
                state = 'EOF'
                break
            elif line[0] == 'd':  # Substitution matrix stanza
                state = 'MATRIX'
                continue
            elif line[0] == 's':  # Sequence files stanza
                state = 'FILES'
                continue
            elif line[0] == 'h':  # Fasta headers stanza
                state = 'HEADERS'
                continue
            elif line[0] == 'a':  # Alignment stanza
                state = 'ALIGN'
                # Each chain starts from a copy of the current block
                chain = Chain.fromBlock(copy.copy(block))
                continue
            elif line[0] in ['x', 'm']:
                # Stanzas whose content is ignored
                state = 'BORING'
                continue
            elif state == 'MATRIX' and line[0] == '}':
                state = 'MATRIX_END'
                # The collected matrix lines become a single string
                aln.matrix = '\n'.join(aln.matrix)
                continue
            elif state == 'ALIGN' and line[0] == '}':
                state = 'ALIGN_END'
                aln.chains.append(chain)
                chain = None
                continue
            elif line[0] == '}':  # End of state
                continue

            # Anything reaching this point is a data line of the current
            # stanza.
            tokens = line.lstrip().split()
            if state == None:
                print line
                raise Exception('Wrong')
            elif state == 'MATRIX':
                aln.matrix.append(line)
            elif state == 'FILES':
                # Two consecutive lines are read here, one per sequence:
                # token 0 is stored as filename, token 2 as length and
                # token 3 is mapped through strandDict.
                block.filenames.append(tokens[0])
                block.lengths.append(int(tokens[2]))
                block.strands.append(strandDict[tokens[3]])
                # Next line in stanza
                tokens = iFile.next().strip().split()
                block.filenames.append(tokens[0])
                block.lengths.append(int(tokens[2]))
                block.strands.append(strandDict[tokens[3]])
            elif state == 'HEADERS':
                # Two consecutive header lines; only the first token of
                # each is kept.
                block.headers.append(tokens[0])
                # Next line in stanza
                tokens = iFile.next().strip().split()
                block.headers.append(tokens[0])
            elif state == 'ALIGN':
                # 's' sets the score, 'b' and 'e' fill index 0 and 1 of
                # both intervals, 'l' adds an HSP built from the rest.
                if tokens[0] == 's':
                    chain.score = int(tokens[1])
                elif tokens[0] == 'b':
                    chain.interval1[0] = int(tokens[1])
                    chain.interval2[0] = int(tokens[2])
                elif tokens[0] == 'e':
                    chain.interval1[1] = int(tokens[1])
                    chain.interval2[1] = int(tokens[2])
                elif tokens[0] == 'l':
                    chain.hsps.append(HSP(tokens[1:]))
        return aln
Exemple #23
0
 def load(iFileHandle):
     """Parse a file of '#:lav' sections into an Alignment.

     The file is read line by line with a small state machine.  The
     first character of a line selects the stanza that follows
     ('d', 's', 'h', 'a', 'x' or 'm'); '}' closes it.  Lines inside a
     stanza are handled according to the current state.

     @param iFileHandle: Input file or filename.
     @return: Alignment with C{matrix} joined into one string and one
         entry in C{chains} per alignment stanza.
     @raise Exception: If a data line is met before any section or
         stanza marker.
     """
     iFile = smartopen(iFileHandle)
     # assumes Alignment() starts with list-valued matrix and chains
     # attributes (both are appended to below) - TODO confirm
     aln = Alignment()
     state = None
     for line in iFile:
         # Only trailing whitespace is removed: the tests on line[0]
         # below look at the original first character.
         line = line.rstrip()
         if not line:
             continue
         elif line=='#:lav': # Section break
             state = 'BLOCK'
             block = Block()
             continue
         elif line=='#:eof': # End of file
             state = 'EOF'
             break
         elif line[0]=='d': # Substitution matrix stanza
             state = 'MATRIX'
             continue
         elif line[0]=='s': # Sequence files stanza
             state = 'FILES'
             continue
         elif line[0]=='h': # Fasta headers stanza
             state = 'HEADERS'
             continue
         elif line[0]=='a': # Alignment stanza
             state = 'ALIGN'
             # Each chain starts from a copy of the current block
             chain = Chain.fromBlock(copy.copy(block))
             continue
         elif line[0] in ['x', 'm']:
             # Stanzas whose content is ignored
             state = 'BORING'
             continue
         elif state=='MATRIX' and line[0]=='}':
             state = 'MATRIX_END'
             # The collected matrix lines become a single string
             aln.matrix = '\n'.join(aln.matrix)
             continue
         elif state=='ALIGN' and line[0]=='}':
             state = 'ALIGN_END'
             aln.chains.append(chain)
             chain = None
             continue
         elif line[0]=='}': # End of state
             continue
         
         # Anything reaching this point is a data line of the current
         # stanza.
         tokens = line.lstrip().split()
         if state==None:
             print line
             raise Exception('Wrong')
         elif state=='MATRIX':
             aln.matrix.append(line)
         elif state=='FILES':
             # Two consecutive lines are read here, one per sequence:
             # token 0 is stored as filename, token 2 as length and
             # token 3 is mapped through strandDict.
             block.filenames.append(tokens[0])
             block.lengths.append(int(tokens[2]))
             block.strands.append(strandDict[tokens[3]])
             # Next line in stanza
             tokens = iFile.next().strip().split()
             block.filenames.append(tokens[0])
             block.lengths.append(int(tokens[2]))
             block.strands.append(strandDict[tokens[3]])
         elif state=='HEADERS':
             # Two consecutive header lines; only the first token of
             # each is kept.
             block.headers.append(tokens[0])
             # Next line in stanza
             tokens = iFile.next().strip().split()
             block.headers.append(tokens[0])
         elif state=='ALIGN':
             # 's' sets the score, 'b' and 'e' fill index 0 and 1 of
             # both intervals, 'l' adds an HSP built from the rest.
             if tokens[0]=='s':
                 chain.score = int(tokens[1])
             elif tokens[0]=='b':
                 chain.interval1[0] = int(tokens[1])
                 chain.interval2[0] = int(tokens[2])
             elif tokens[0]=='e':
                 chain.interval1[1] = int(tokens[1])
                 chain.interval2[1] = int(tokens[2])
             elif tokens[0]=='l':
                 chain.hsps.append(HSP(tokens[1:]))
     return aln
Exemple #24
0
 def __init__(self, iFileHandle1, iFileHandle2):
     """Open both inputs and keep the resulting file objects.

     @param iFileHandle1: First input file or filename
     @param iFileHandle2: Second input file or filename
     """
     first = smartopen(iFileHandle1)
     self.iFile1 = first
     second = smartopen(iFileHandle2)
     self.iFile2 = second
Exemple #25
0
        ('-p', 'Predicted peptides only'),
        ('-a', email),
    ]
    files = [('-u', '', ''), ('-v', '', '')]
    
    try:
        html = multipart.post(genomeScanURL, fields, files, proxy, proxyPort)
    except multipart.FormSubmissionException:
        print >> sys.stderr, "*** %s submission failed. Retry later" % description
        return
    except Exception, e:
        print e
        sys.exit('Argh!')
    
    if oFileHandle:
        oFile = smartopen(oFileHandle, 'w')
        print >> oFile, html
        oFile.close()
    
    return html


def extractSeq(feature, blastDb, dx, dy):
    """Extract the translated sequence of a feature and DNA 
    sequence of the surrounding the genomic region.
    
    @param feature: Feature object. Mandatory attributes: accession, sStart, sEnd.
    @param blastDb: Blast database.
    @param dx: Length of sequence to extract upstream.
    @param dy: Length of sequence to extract downstream.
    @returns: tuple of fasta strings (DNA, protein).
Exemple #26
0
 def __init__(self, iFileHandle):
     """Open the input and remember its name.

     @param iFileHandle: Input file or filename
     """
     handle = smartopen(iFileHandle)
     self.iFile = handle
     self.iFilename = handle.name
     # No iterator has been created yet
     self._iter = None
Exemple #27
0
def load_full(iFileHandle):
    """Load genscan predictions.
    
    Arguments:
    iFileHandle -- Input file or filename.
    
    Return values:
    data -- Annotation data (a list of lists, one list per gene, ordered
            by gene number)
    proteins -- Result of fasta.load_mfa on the rest of the file once the
                peptide section has been reached; otherwise an empty list
    meta -- Non-blank lines collected from the 'GENSCAN 1.0' line up to
            the next section marker
    
    If a line follows the 'NO EXONS/GENES PREDICTED IN SEQUENCE' message,
    ([], [], '') is returned.
    
    """
    startPredState = '----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------'
    endPredState = 'Predicted peptide sequence(s):'
    skipState = 'Slice no. '
    metaState = 'GENSCAN 1.0'
    failMessage = 'NO EXONS/GENES PREDICTED IN SEQUENCE'
    
    handle = smartopen(iFileHandle)
    byGene = {}
    proteins = []
    meta = []
    
    state = None
    for rawLine in handle:
        text = rawLine.strip()
        
        # The line announcing the program version is itself part of the
        # meta data, so this test does not end processing of the line.
        if metaState in text:
            state = 'meta'
        
        # Section markers only switch state
        if text==startPredState:
            state = 'pred'
            continue
        if text==failMessage:
            state = 'fail'
            continue
        if text==endPredState:
            state = 'prot'
            continue
        if skipState in text:
            state = 'skip'
            continue
        
        # Ordinary line: act according to the current section
        if state=='prot':
            break
        if state=='fail':
            return [], [], ''
        if not text:
            continue
        if state=='meta':
            meta.append(text)
        elif state=='pred':
            prediction = Predicted(text.split())
            geneNumber = int(prediction.gene_exon.split('.')[0])
            byGene.setdefault(geneNumber, []).append(prediction)
    
    if state=='prot':
        proteins = fasta.load_mfa(handle)
    
    data = [byGene[geneNumber] for geneNumber in sorted(byGene)]
    
    return data, proteins, meta