# Command-line entry point: print an upper-triangular matrix of the pairwise
# Levenshtein distances between the adaptors given on the command line.
parser = argparse.ArgumentParser(
    description=('Print a (Levenshtein) distance matrix for a set of '
                 'known adaptors'))

parser.add_argument(
    'adaptors', nargs='+', metavar='adaptor',
    help='the set of adaptors that were used in sequencing')

args = parser.parse_args()

adaptors = args.adaptors

# Adaptors are not required to share a length, so size the layout by the
# longest one and pad the shorter ones with spaces. Sizing by the first
# adaptor alone raised IndexError for a shorter later adaptor and truncated
# (and misaligned) a longer one. For equal-length adaptors the output is
# unchanged.
length = max(len(adaptor) for adaptor in adaptors)
padded = [adaptor.ljust(length) for adaptor in adaptors]
spaces = ' ' * length

# Column headers: every adaptor is written vertically, one character per
# output line, to the right of a blank margin as wide as the row labels.
for i in range(length):
    print(spaces, end=' ')
    for adaptor in padded:
        print(adaptor[i], end=' ')
    print()

# One row per adaptor, labelled with the adaptor itself. Cells left of the
# diagonal are printed blank (presumably because the distance is symmetric,
# so those values would repeat the upper triangle).
for i, rowAdaptor in enumerate(adaptors):
    print(padded[i], end=' ')
    for j, columnAdaptor in enumerate(adaptors):
        if j < i:
            print(' ', end=' ')
        else:
            print(levenshtein(rowAdaptor, columnAdaptor), end=' ')
    print()
def splitFASTAByAdaptor(knownAdaptors, adaptorLen, adaptorOffset,
                        maximumDistance, outputPrefix, dryRun, verbose):
    """
    Read FASTA sequences from standard input, group them by the adaptor
    found in each one, and write (or, on a dry run, just describe) one FASTA
    file per adaptor class.

    An adaptor that is not in C{knownAdaptors} is assigned to its nearest
    known adaptor (by Levenshtein distance) when that neighbor is unique and
    no further than C{maximumDistance} away; otherwise its reads are grouped
    as unclassifiable. Each read is written with everything up to and
    including the adaptor removed. If no sequences are read, nothing is
    printed or written.

    @param knownAdaptors: A C{set} of expected adaptor sequences.
    @param adaptorLen: The C{int} length of each adaptor sequence.
    @param adaptorOffset: The zero-based C{int} offset of the adaptor in
        each sequence.
    @param maximumDistance: The maximum distance an unknown adaptor will be
        mapped to in an attempt to find its nearest known adaptor.
    @param outputPrefix: A C{str} prefix that should be used in the file
        names that are written out.
    @param dryRun: A C{bool}, if C{True} only print what would be done,
        don't create any new FASTA files.
    @param verbose: A C{bool}, if C{True} output additional information
        about adaptor classes found and assigned.
    """
    # How many times each (upper-cased) adaptor was seen in the input.
    adaptors = defaultdict(int)
    # Maps every observed adaptor to the class its reads belong to. Known
    # adaptors are their own class; unknown ones are filled in below.
    classes = dict(zip(knownAdaptors, knownAdaptors))
    reads = []

    for seq in SeqIO.parse(sys.stdin, 'fasta'):
        reads.append(seq)
        adaptor = str(seq.seq)[adaptorOffset:][:adaptorLen].upper()
        adaptors[adaptor] += 1

    # Process (and, when verbose, report) the most frequent adaptors first.
    order = sorted(adaptors, key=adaptors.get, reverse=True)

    for adaptor in order:
        if adaptor in knownAdaptors:
            if verbose:
                print('%s: %s. Known adaptor' % (adaptor, adaptors[adaptor]))
            continue

        distances = sorted((levenshtein(adaptor, known), known)
                           for known in knownAdaptors)
        nearest, correctedAdaptor = distances[0]
        # Treat the read as unclassifiable if it's too far from its
        # nearest neighbor or if its nearest neighbor is ambiguous.
        if nearest > maximumDistance or (
                len(knownAdaptors) > 1 and nearest == distances[1][0]):
            classes[adaptor] = UNKNOWN
            if verbose:
                print(
                    '%s: %s. Unknown, distances %r' %
                    (adaptor, adaptors[adaptor], [d[0] for d in distances]))
        else:
            classes[adaptor] = correctedAdaptor
            if verbose:
                print('%s: %s. Assigned to class %s, at dist %d' %
                      (adaptor, adaptors[adaptor], correctedAdaptor,
                       nearest))

    # Collect reads into classes, dropping the adaptor (and anything before
    # it) from each read.
    readGroups = defaultdict(list)
    for read in reads:
        adaptor = str(read.seq)[adaptorOffset:][:adaptorLen].upper()
        readGroups[classes[adaptor]].append(read[adaptorOffset + adaptorLen:])

    # With no input there is nothing to report or write, and the width
    # calculations below would have no largest group to measure.
    if not readGroups:
        return

    # The number of digits in the size of the biggest read group, so we can
    # nicely align the output. Counting the characters of the decimal form
    # is exact; ceil(log10(n)) is one short when n is a power of ten.
    width = len(str(max(len(group) for group in readGroups.values())))

    # The width of the count of files we'll write, so file names have zero
    # padded numeric prefixes.
    filesWidth = len(str(len(readGroups)))

    # Write out the FASTA files for each adaptor class (this includes the
    # unclassifiable reads if any unknown adaptors were found).
    for count, adaptor in enumerate(sorted(readGroups), start=1):
        group = readGroups[adaptor]
        filename = '%s%0*d-%s.fasta' % (outputPrefix, filesWidth, count,
                                        adaptor)
        description = ('unrecognized adaptors' if adaptor == UNKNOWN
                       else 'adaptor %s' % adaptor)
        if dryRun:
            print('Would write %*d sequences for %s to %s' %
                  (width, len(group), description, filename))
        else:
            with open(filename, 'w') as fp:
                SeqIO.write(group, fp, 'fasta')
            print('Wrote %*d sequences for %s to %s' %
                  (width, len(group), description, filename))
def splitFASTAByAdaptor(knownAdaptors, adaptorLen, adaptorOffset,
                        maximumDistance, outputPrefix, dryRun, verbose):
    """
    Read FASTA sequences from standard input, group them by the adaptor
    found in each one, and write (or, on a dry run, just describe) one FASTA
    file per adaptor class.

    An adaptor that is not in C{knownAdaptors} is assigned to its nearest
    known adaptor (by Levenshtein distance) when that neighbor is unique and
    no further than C{maximumDistance} away; otherwise its reads are grouped
    as unclassifiable. Each read is written with everything up to and
    including the adaptor removed. If no sequences are read, nothing is
    printed or written.

    @param knownAdaptors: A C{set} of expected adaptor sequences.
    @param adaptorLen: The C{int} length of each adaptor sequence.
    @param adaptorOffset: The zero-based C{int} offset of the adaptor in
        each sequence.
    @param maximumDistance: The maximum distance an unknown adaptor will be
        mapped to in an attempt to find its nearest known adaptor.
    @param outputPrefix: A C{str} prefix that should be used in the file
        names that are written out.
    @param dryRun: A C{bool}, if C{True} only print what would be done,
        don't create any new FASTA files.
    @param verbose: A C{bool}, if C{True} output additional information
        about adaptor classes found and assigned.
    """
    # How many times each (upper-cased) adaptor was seen in the input.
    adaptors = defaultdict(int)
    # Maps every observed adaptor to the class its reads belong to. Known
    # adaptors are their own class; unknown ones are filled in below.
    classes = dict(zip(knownAdaptors, knownAdaptors))
    reads = []

    for seq in SeqIO.parse(sys.stdin, 'fasta'):
        reads.append(seq)
        adaptor = str(seq.seq)[adaptorOffset:][:adaptorLen].upper()
        adaptors[adaptor] += 1

    # Process (and, when verbose, report) the most frequent adaptors first.
    order = sorted(adaptors, key=adaptors.get, reverse=True)

    for adaptor in order:
        if adaptor in knownAdaptors:
            if verbose:
                print('%s: %s. Known adaptor' % (adaptor, adaptors[adaptor]))
            continue

        distances = sorted((levenshtein(adaptor, known), known)
                           for known in knownAdaptors)
        nearest, correctedAdaptor = distances[0]
        # Treat the read as unclassifiable if it's too far from its
        # nearest neighbor or if its nearest neighbor is ambiguous.
        if nearest > maximumDistance or (
                len(knownAdaptors) > 1 and nearest == distances[1][0]):
            classes[adaptor] = UNKNOWN
            if verbose:
                print('%s: %s. Unknown, distances %r' % (
                    adaptor, adaptors[adaptor], [d[0] for d in distances]))
        else:
            classes[adaptor] = correctedAdaptor
            if verbose:
                print('%s: %s. Assigned to class %s, at dist %d' % (
                    adaptor, adaptors[adaptor], correctedAdaptor, nearest))

    # Collect reads into classes, dropping the adaptor (and anything before
    # it) from each read.
    readGroups = defaultdict(list)
    for read in reads:
        adaptor = str(read.seq)[adaptorOffset:][:adaptorLen].upper()
        readGroups[classes[adaptor]].append(read[adaptorOffset + adaptorLen:])

    # With no input there is nothing to report or write, and the width
    # calculations below would have no largest group to measure.
    if not readGroups:
        return

    # The number of digits in the size of the biggest read group, so we can
    # nicely align the output. Counting the characters of the decimal form
    # is exact; ceil(log10(n)) is one short when n is a power of ten.
    width = len(str(max(len(group) for group in readGroups.values())))

    # The width of the count of files we'll write, so file names have zero
    # padded numeric prefixes.
    filesWidth = len(str(len(readGroups)))

    # Write out the FASTA files for each adaptor class (this includes the
    # unclassifiable reads if any unknown adaptors were found).
    for count, adaptor in enumerate(sorted(readGroups), start=1):
        group = readGroups[adaptor]
        filename = '%s%0*d-%s.fasta' % (outputPrefix, filesWidth, count,
                                        adaptor)
        description = ('unrecognized adaptors' if adaptor == UNKNOWN
                       else 'adaptor %s' % adaptor)
        if dryRun:
            print('Would write %*d sequences for %s to %s' % (
                width, len(group), description, filename))
        else:
            with open(filename, 'w') as fp:
                SeqIO.write(group, fp, 'fasta')
            print('Wrote %*d sequences for %s to %s' % (
                width, len(group), description, filename))
def testIdentical(self):
    """
    The distance from a string to itself must be zero.
    """
    sequence = 'BLAH'
    distance = levenshtein(sequence, sequence)
    self.assertEqual(0, distance)
# Command-line entry point: print an upper-triangular matrix of the pairwise
# Levenshtein distances between the adaptors given on the command line.
parser = argparse.ArgumentParser(
    description=('Print a (Levenshtein) distance matrix for a set of '
                 'known adaptors'))

parser.add_argument(
    'adaptors', type=str, nargs='+', metavar='adaptor',
    help='the set of adaptors that were used in sequencing')

args = parser.parse_args()

adaptors = args.adaptors

# Adaptors are not required to share a length, so size the layout by the
# longest one and pad the shorter ones with spaces. Sizing by the first
# adaptor alone raised IndexError for a shorter later adaptor and truncated
# (and misaligned) a longer one. For equal-length adaptors the output is
# unchanged.
length = max(len(adaptor) for adaptor in adaptors)
padded = [adaptor.ljust(length) for adaptor in adaptors]
spaces = ' ' * length

# Every output line is built as a list of fields and emitted with a single
# one-argument print(...). That spelling is valid both as a Python 2 print
# statement and as a Python 3 print function call, and joining the fields
# with one space reproduces what the Python 2 trailing-comma print
# statements wrote (fields separated by a space, no trailing space).

# Column headers: every adaptor is written vertically, one character per
# output line, to the right of a blank margin as wide as the row labels.
for i in range(length):
    print(' '.join([spaces] + [adaptor[i] for adaptor in padded]))

# One row per adaptor, labelled with the adaptor itself. Cells left of the
# diagonal are printed blank (presumably because the distance is symmetric,
# so those values would repeat the upper triangle).
for i, rowAdaptor in enumerate(adaptors):
    fields = [padded[i]]
    for j, columnAdaptor in enumerate(adaptors):
        if j < i:
            fields.append(' ')
        else:
            fields.append(str(levenshtein(rowAdaptor, columnAdaptor)))
    print(' '.join(fields))
def testInsert(self):
    """
    Two strings related by an insertion must be at distance 2: the second
    string is the first with a 'C' inserted after the leading 'A' and the
    trailing 'G' removed.
    """
    original = 'AGTACACACTG'
    shifted = 'ACGTACACACT'
    distance = levenshtein(original, shifted)
    self.assertEqual(2, distance)
def testMutation(self):
    """
    Two strings that differ in a single character (here, the last one)
    must be at distance 1.
    """
    original = 'ACGTACACACG'
    mutated = 'ACGTACACACT'
    distance = levenshtein(original, mutated)
    self.assertEqual(1, distance)