Ejemplo n.º 1
0
def FastqIterator(files,raw=False,titleSet=None):
    """Iterate over the FASTQ records found in ``files``.

    ``files`` is handed to ``fileIterator``; every object it yields is read
    with ``readline()``.  A record is an ``@title`` line, one or more
    sequence lines, a ``+`` line (optionally repeating the title) and as
    many quality lines as there were sequence lines.

    Arguments:
        files    -- whatever ``fileIterator`` accepts.
        raw      -- if true, yield ``(titleStr, seqStr, qualityStr)`` tuples;
                    otherwise yield ``Record`` objects whose ``title``,
                    ``sequence`` and ``quality`` attributes are set, with
                    ``quality`` being ``flatten`` applied to the per-line
                    ``qualToInt`` results.
        titleSet -- optional container of titles; when not None, records
                    whose title is not in it are skipped.

    In both modes every sequence/quality line is stripped of surrounding
    whitespace before being joined, so the resulting strings contain no
    newlines.

    Raises FastqParseError if a record has no ``+`` line, or if the ``+``
    line carries a title that differs from the ``@`` title.
    """
    def readTotitle(fh, titleChar):
        """Return ([lines before the next title line], next title line).

        The title line is None when end of file is reached first.
        """
        preLines = []
        # readline() returns '' only at end of file.
        for line in iter(fh.readline, ''):
            if line.startswith(titleChar):
                return preLines, line
            preLines.append(line)
        return preLines, None

    for fh in fileIterator(files):
        # Anything that precedes the first '@' title line is ignored.
        _, nextTitleLine = readTotitle(fh, '@')

        while nextTitleLine is not None:
            seqTitle = nextTitleLine[1:].rstrip()
            seqLines, plusLine = readTotitle(fh, '+')
            if plusLine is None:
                # Truncated record: fail with a parse error instead of a
                # TypeError from slicing None.
                raise FastqParseError("Error in parsing: reached end of file before the "
                                      "+title quality entry for '%s'." % seqTitle)
            qualTitle = plusLine[1:].rstrip()
            if qualTitle.strip() and seqTitle != qualTitle:
                raise FastqParseError("Error in parsing: @title sequence entry must be immediately "
                                      "followed by corresponding +title quality entry.")

            # Quality is read line-for-line with the sequence rather than by
            # scanning for the next '@', because '@' is itself a valid
            # quality character and may start a quality line.
            qualLines = [fh.readline() for _ in seqLines]

            # Look ahead to the next record before deciding whether to skip
            # this one, so that 'continue' resumes at the right place.
            _, nextTitleLine = readTotitle(fh, '@')

            if titleSet is not None and seqTitle not in titleSet:
                continue

            seqLines = [line.strip() for line in seqLines]
            qualLines = [line.strip() for line in qualLines]
            if raw:
                yield (seqTitle, ''.join(seqLines), ''.join(qualLines))
            else:
                rec = Record()
                rec.title = seqTitle
                rec.sequence = ''.join(seqLines)
                rec.quality = flatten([qualToInt(line) for line in qualLines])
                yield rec
Ejemplo n.º 2
0
def phdQualIterator(fastaFiles, qualFiles, raw=False):
    """Iterate over FASTA records paired with their quality records.

    The i-th object yielded by ``fileIterator(fastaFiles)`` is paired with
    the i-th entry of ``getIteratable(qualFiles)``, which is opened as a
    file name.  Records are matched purely by position; the title read from
    the quality file is not compared with the sequence title.

    Arguments:
        fastaFiles -- whatever ``fileIterator`` accepts.
        qualFiles  -- whatever ``getIteratable`` accepts; indexed by position.
        raw        -- if true, yield ``(title, sequenceStr, qualityStr)``
                      tuples with newlines removed; otherwise yield
                      ``Record`` objects whose ``quality`` is the list of
                      whitespace-separated quality tokens (as strings).

    Raises Exception('Invalid number of qualities') in non-raw mode when the
    number of quality tokens differs from the sequence length.

    If a quality file holds fewer records than its FASTA file, iteration
    simply stops at that point (this is what the Python 2 ``.next()`` call
    did implicitly).
    """
    def readTotitle(fh):
        """Return ([lines before the next title line], next title line).

        The title line is None when end of file is reached first.
        """
        preLines = []
        # readline() returns '' only at end of file.
        for line in iter(fh.readline, ''):
            if line.startswith('>'):
                return preLines, line
            preLines.append(line)
        return preLines, None

    def qualityIterator(filename):
        """Yield (title, quality lines joined by single spaces) per record."""
        # 'with' guarantees the handle is released when the generator
        # finishes or is closed.
        with open(filename) as fh:
            _, nextTitleLine = readTotitle(fh)

            while nextTitleLine is not None:
                title = nextTitleLine[1:].rstrip()
                preLines, nextTitleLine = readTotitle(fh)

                yield (title, ' '.join(preLines))

    qualFiles = getIteratable(qualFiles)
    for idx, fastaFh in enumerate(fileIterator(fastaFiles)):
        qualIter = qualityIterator(qualFiles[idx])
        try:
            for seqTitle, sequence in fasta.FastaIterator(fastaFh, raw=True):
                try:
                    qTitle, qualities = next(qualIter)
                except StopIteration:
                    # Quality file ran out first.  End the whole iteration
                    # explicitly; letting StopIteration escape a generator
                    # body is a RuntimeError under PEP 479.
                    return
                sequence = sequence.replace('\n', '')
                qualities = qualities.replace('\n', '')
                if raw:
                    yield (seqTitle, sequence, qualities)
                else:
                    qualities = qualities.split()
                    if len(sequence) != len(qualities):
                        raise Exception('Invalid number of qualities')
                    rec = Record()
                    rec.title = seqTitle
                    rec.sequence = sequence
                    rec.quality = qualities
                    yield rec
        finally:
            # Close the quality generator (and with it the open file) even
            # when the consumer stops early or an error is raised.
            qualIter.close()
Ejemplo n.º 3
0
def makematrix(**kwargs):
    """Build a row-normalised word/context co-occurrence matrix and pickle it.

    The corpus is read twice through ``utils.fileIterator(corpusfile,
    num_shards)``: once to count words, once to count contexts.  Each item
    produced by the iterator is treated as a sentence, i.e. an indexable
    sequence of hashable words.

    Keyword arguments (all required; a missing one raises KeyError):
        corpusfile -- first argument to ``utils.fileIterator``.
        num_shards -- second argument to ``utils.fileIterator``.
        c_length   -- context offset: for a word at position ``pos`` the
                      words at ``pos - c_length`` and ``pos + c_length`` are
                      counted as its contexts.
        threshold  -- a word enters the vocabulary only if its corpus count
                      is strictly greater than this value.
        outfile    -- path for the pickled matrix: a list with one dict per
                      vocabulary index, mapping context index to the
                      fraction of that word's context counts.
        vocfile    -- path for the pickled vocabulary (word -> index).

    Progress is reported on stdout and stderr.  Returns None.
    """

    corpusfile = kwargs["corpusfile"]
    outfile = kwargs["outfile"]
    c_length = kwargs["c_length"]
    threshold = kwargs["threshold"]
    num_shards = kwargs["num_shards"]
    vocfile = kwargs["vocfile"]

    # Pass 1: count every distinct word.
    print("Generating Wordcounts...")
    iterator = utils.fileIterator(corpusfile, num_shards)
    counts = defaultdict(int)
    num_sentences = 0
    for sentence in iterator:
        num_sentences += 1
        for word in sentence:
            counts[word] += 1.0
    print("Counts generated from %d sentences." % num_sentences)

    print("Initializing Adj Matrix...")
    # Vocabulary maps each sufficiently common word to a row/column index.
    frequent_words = [word for word, count in counts.items() if count > threshold]
    vocab = {word: idx for idx, word in enumerate(frequent_words)}
    vocab_size = len(vocab)
    # One sparse row (context index -> count) per vocabulary word.
    adj = [defaultdict(int) for _ in range(vocab_size)]
    print("Matrix initialized with vocab size of {0}.".format(vocab_size))

    # Pass 2: accumulate context counts.
    iterator = utils.fileIterator(corpusfile, num_shards)
    print("Starting Matrix Computation...")
    # Report roughly once per percent.  Clamp to 1 so that a corpus with
    # fewer than 100 sentences does not cause a modulo-by-zero.
    progress_step = max(1, num_sentences // 100)
    sentence_count = 0
    for sentence in iterator:
        sentence_count += 1
        if sentence_count % progress_step == 0:
            sys.stderr.write("Progress: {0}%\r".format((100 * sentence_count) // num_sentences))

        sentence_length = len(sentence)
        for pos in range(sentence_length):
            # Disregard uncommon words.
            row = vocab.get(sentence[pos])
            if row is None:
                continue
            # NOTE(review): only the two offsets -c_length and +c_length are
            # visited, not every offset in between.  The 'offset == 0' guard
            # suggests a full window (range(-c_length, c_length + 1)) may
            # have been intended -- confirm before changing, since it alters
            # the resulting matrix.
            for offset in (-c_length, c_length):
                neighbour = pos + offset
                if offset == 0 or neighbour < 0 or neighbour >= sentence_length:
                    continue
                # Disregard uncommon contexts.
                col = vocab.get(sentence[neighbour])
                if col is None:
                    continue
                adj[row][col] += 1.0

    # Normalise each row so its values sum to 1.  Rows without any context
    # are empty, so no division by a zero total takes place.
    for row in range(vocab_size):
        total_counts = sum(adj[row].values())
        adj[row] = {context: count / total_counts for context, count in adj[row].items()}

    # Pickle streams are binary data: open in binary mode and close promptly.
    with open(outfile, "wb") as fh:
        pickle.dump(adj, fh)
    with open(vocfile, "wb") as fh:
        pickle.dump(vocab, fh)
    return