Exemple #1
0
from FASTA import *
import numpy
import pylab as P
P.ion()

# Nucleotide alphabet used by this script, in a fixed order.
nucleotides = ['G', 'A', 'T', 'C']

# Reverse lookup: nucleotide letter -> its position in `nucleotides`.
nucleotide_to_index = dict(
    (letter, position) for position, letter in enumerate(nucleotides))

# Build the PSSM (position-specific scoring matrix) on the yeast genome.
# FASTA presumably comes from the star-import at the top of the script; the
# scan loop further down reads the loaded data through
# yeast.accession_to_sequence.
yeast = FASTA('s_cerevisiae.fasta')

# motif is TATAwxyzuv: the fixed prefix 'TATA' followed by six more
# positions, giving a window of motif_length nucleotides in total.
motif_start = 'TATA'
motif_length = 10

# Count matrix with one row per motif position and four columns (one per
# letter of the G/A/T/C alphabet). Every cell is seeded with pseudo_count;
# using the named constant here (rather than a second literal 1) keeps the
# seed and the declared pseudo-count from drifting apart.
pseudo_count = 1
count_pssm = numpy.zeros((motif_length, 4)) + pseudo_count

# Slide a motif_length-wide window over every sequence and count the windows
# that begin with motif_start.
num_matches = 0
for chromosome_name, chromosome_sequence in yeast.accession_to_sequence.items(
):
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print('processing %s' % (chromosome_name,))
    # The last full window starts at len - motif_length, hence the "+ 1";
    # without it the final window of each sequence is never examined.
    # A sequence shorter than motif_length yields an empty range.
    for start in xrange(len(chromosome_sequence) - motif_length + 1):
        sl = chromosome_sequence[start:start + motif_length]
        if sl.startswith(motif_start):
            num_matches += 1
            # i is the position inside the window, nuc the letter found there.
            # (The outer index is named `start` so it is no longer shadowed.)
            for i, nuc in enumerate(sl):
                # NOTE(review): raises KeyError for any letter that is not a
                # key of nucleotide_to_index (e.g. 'N' or lower-case input).
                nuc_index = nucleotide_to_index[nuc]
Exemple #2
0
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


import FASTA
import sys

# Output layout: width of the title column and number of sequence
# characters printed per line.
titleWidth = 10
seqWidth = 60

# Parse the records supplied on standard input.
recs = FASTA.readFasta(sys.stdin)

# Length of the longest sequence among the records; this bounds the
# block-printing loop that follows.
sequenceLengths = [len(rec.sequence) for rec in recs]
maxLen = max(sequenceLengths)

# Print the records in interleaved blocks of seqWidth columns: for each
# block, one line per record.
start = 0
while start < maxLen:
    for rec in recs:
        # Only the first block carries the title, padded/truncated to exactly
        # titleWidth characters; later blocks get blanks of the same width so
        # the sequence columns stay aligned.
        if start == 0:
            prefix = rec.title.ljust(titleWidth)[0:titleWidth]
        else:
            prefix = ' ' * titleWidth
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print("%s%s" % (prefix, rec.sequence[start : start + seqWidth]))
    # Move on to the next block of columns. Without this step `start` never
    # changes and the loop cannot terminate.
    start += seqWidth
    anchors = {}
    for line in strm:
        fields = line.split()
        anchors[fields[0]] = fields[3:5]
    return anchors

# Abort with exit status 1 when the configured genscan parameter file is
# missing, after reporting the offending path on stderr.
genscanParamPath = options.genscanParamFile
if not os.path.exists(genscanParamPath):
    sys.stderr.write(
        "Error: Genscan parameter file %s does not exist\n" % genscanParamPath)
    sys.exit(1)

# Read in the sequences from the multi-fasta file, reporting progress on
# stderr.
sys.stderr.write("Reading in multi-fasta file...")
# open() replaces the Python-2-only file() builtin, and the with-block closes
# the handle even if FASTA.readFasta raises. As before, the file is closed as
# soon as readFasta returns.
with open(multiFastaFilename) as multiFastaFile:
    fastaRecs = FASTA.readFasta(multiFastaFile)
sys.stderr.write("done\n")

# Make protein anchors for each sequence
for rec in fastaRecs:
    # Reduce the title to the result of firstWord(); that value names the
    # per-record files written below.
    rec.title = firstWord(rec.title)

    # <title>.chroms holds a single tab-separated line: title and sequence
    # length. open() replaces the Python-2-only file() builtin, and the
    # with-block closes the handle even if the write fails.
    with open(os.path.join(workdir, rec.title + ".chroms"), 'w') as chromFile:
        chromFile.write("%s\t%d\n" % (rec.title, len(rec.sequence)))

    sys.stderr.write("Writing single-fasta file...")
    # <title>.fa receives str(rec) -- presumably the record rendered as FASTA
    # text; confirm against the FASTA module.
    fastaFilename = os.path.join(workdir, rec.title + ".fa")
    with open(fastaFilename, 'w') as fastaFile:
        fastaFile.write(str(rec))