def build_distance_matrix(self, fname) :
    """Score every pair of sequences read from a FASTQ file.

    Records are read through ``FastqFile``; each label is ``seq.id`` with
    its first character removed.  Every unordered pair of records is scored
    once with ``self.distance([seq1, seq2])`` and the value is stored under
    both ``(label1, label2)`` and ``(label2, label1)``.  The ``(label, label)``
    entries are set to 1.0 without calling ``self.distance``.

    Args:
        fname: file name passed to ``FastqFile``.

    Returns:
        A tuple ``(dist, labels)`` where ``dist`` is a dict keyed by label
        pairs and ``labels`` is the list of labels in the order they were read.
    """
    fq = FastqFile(fname)
    fq.open()
    try :
        records = [ (seq.id[1:], seq.sequence) for seq in fq ]
    finally :
        # always release the file, even if reading a record raises
        fq.close()

    total = len(records)
    # one progress tick per unordered pair: n * (n - 1) / 2
    p = Progress("Calculating distance matrix", (total * (total - 1)) / 2.0, False)
    p.start()

    dist = {}
    for index, (label1, seq1) in enumerate(records) :
        dist[(label1, label1)] = 1.0

        # only look at later records so each pair is computed once
        for label2, seq2 in records[index + 1:] :
            score = self.distance([seq1, seq2])
            dist[(label1, label2)] = score
            dist[(label2, label1)] = score
            p.increment()

    p.end()

    return dist, [ label for label, _ in records ]
def create_clusters(self, keys=None, homopolymer_correction=True) :
    """Greedily assign sequences to ``self.clusters``, most duplicated first.

    Each key is looked up in ``self.db`` and weighted by the record's
    ``duplicates`` attribute.  Keys are then visited in
    ``Counter.most_common()`` order.  A key joins the first cluster whose
    first member (``cluster[0]``) has an ``alignment_similarity`` with it of
    at least ``self.similarity_threshold``; otherwise it starts a new
    single-member cluster.  Clusters are lists of keys and are modified in
    place on ``self.clusters``; nothing is returned.

    Args:
        keys: iterable of keys to cluster; when None, ``self.db`` itself
            is iterated.
        homopolymer_correction: forwarded unchanged as the third argument
            of ``self.alignment_similarity``.
    """
    # identity test: "== None" could invoke a custom __eq__ on keys
    if keys is None :
        keys = self.db

    seqcount = collections.Counter()
    for key in keys :
        seqcount[key] = self.db.get(key).duplicates

    p = Progress("Clustering", len(seqcount))
    p.start()

    for key, _ in seqcount.most_common() :
        seq = self.db.get(key)

        for cluster in self.clusters :
            representative = self.db.get(cluster[0])
            if self.alignment_similarity(representative, seq, homopolymer_correction) >= self.similarity_threshold :
                cluster.append(key)
                break
        else :
            # no existing cluster matched (loop finished without break)
            self.clusters.append([key])

        # keep the largest clusters first so they are tested first
        # NOTE(review): the source indentation was lost; the sort is assumed
        # to run on every iteration rather than only after a new cluster
        # is created - confirm against the original file
        self.clusters.sort(key=len, reverse=True)

        p.increment()

    p.end()
def preprocess(self) :
    """Build a ``Sample`` for every input file and report read counts.

    Logs "nothing to do" and exits the process with status 1 when
    ``self.options['input-files']`` is empty.  Otherwise a fresh
    ``SequenceDB`` is stored on ``self.seqdb``, one ``Sample`` is created
    (and printed via ``print_sample``) per file yielded by
    ``self.__get_files``, the totals are printed, and, if any samples were
    produced, the summary file named by ``self.options['summary-file']`` is
    updated and written.

    Returns:
        0 on completion.
    """
    input_files = self.options['input-files']

    if len(input_files) == 0 :
        self.log.error("nothing to do")
        sys.exit(1)

    self.seqdb = SequenceDB(preprocessed=False)

    p = Progress("Preprocessing", len(input_files))
    p.start()

    samples = []

    for f in self.__get_files(input_files) :
        mid = self.__mid_fastq(f)
        sample = Sample(f,
                        self.options['outdir'],
                        self.seqdb,
                        self.__filters(mid),
                        chimeras=self.options['chimeras'])
        sample.print_sample()
        samples.append(sample)

        p.increment()

    p.end()

    # rejected reads are the sum of every sample's per-filter counts
    rejected_reads = sum(sum(s.filters.counts) for s in samples)
    accepted_reads = sum(len(s) for s in samples)
    unique_seq = sum(len(s.seqcounts) for s in samples)

    # a single parenthesised expression, so the output is the same as the
    # plain print statement
    print("processed %s reads, accepted %d (of which %d are unique)" %
          (rejected_reads + accepted_reads, accepted_reads, unique_seq))

    if samples :
        summary_data = self.__build_summary(samples)
        summary_file = Summary(self.options['summary-file'])
        summary_file.update(summary_data)
        summary_file.write(self.options['summary-file'])

    # NOTE(review): the source indentation was lost; this check is assumed
    # to sit at function level rather than under "if samples" - confirm
    if self.options['verbose'] :
        self.summary()

    return 0
def read_nematodes(self, fastq_fname, fprimer, rprimer, diffs, length) :
    """Extract the region following the forward primer from 'Nematoda' records.

    Records whose ``id`` does not contain 'Nematoda' are skipped.  For the
    rest, ``seq.ungap()`` and ``seq.back_translate()`` are called, and the
    record is stored under the first whitespace-delimited token of its id
    with the leading character removed.  Each stored sequence is then
    searched with ``IUPAC.seq_position(fprimer, seq, diffs)``; on a hit
    (result other than -1) the ``length`` characters immediately after the
    primer are kept, unless that slice contains 'N'.

    Args:
        fastq_fname: file name passed to ``FastqFile``.
        fprimer: forward primer passed to ``IUPAC.seq_position``.
        rprimer: reverse primer; currently unused (see note below).
        diffs: passed through as the third argument of
            ``IUPAC.seq_position``.
        length: maximum number of characters kept after the primer.

    Returns:
        A tuple ``(hits, acc2name)``: ``hits`` is a list of
        ``(label, subsequence)`` pairs and ``acc2name`` maps every label
        read (matched or not) to the part of the original id starting at
        'Nematoda'.
    """
    candidates = []
    acc2name = {}

    # read in sequences
    fq = FastqFile(fastq_fname)
    fq.open()
    try :
        for seq in fq :
            if 'Nematoda' not in seq.id :
                continue

            seq.ungap()
            seq.back_translate()

            new_id = seq.id.split()[0][1:]
            candidates.append((new_id, seq.sequence))
            acc2name[new_id] = seq.id[seq.id.find('Nematoda'):]
    finally :
        # always release the file, even if a record fails to process
        fq.close()

    # test sequences
    p = Progress("Looking for primer sequences", len(candidates), False)
    p.start()

    primer_len = len(fprimer)
    hits = []

    for label, sequence in candidates :
        findex = IUPAC.seq_position(fprimer, sequence, diffs)

        # NOTE: the reverse-primer check (IUPAC.seq_position_reverse with
        # rprimer) is disabled, so only the forward primer is required
        if findex != -1 :
            start = findex + primer_len
            shortseq = sequence[start : start + length]

            if 'N' not in shortseq :
                hits.append((label, shortseq))

        p.increment()

    p.end()

    return hits, acc2name
def assign_scores(self, dist, keys, threshold, tree) :
    """List, for every key, the keys whose ``dist`` entry meets ``threshold``.

    Args:
        dist: mapping indexed by ``(k1, k2)`` tuples; must contain an entry
            for every ordered pair of ``keys``, including ``(k, k)``.
        keys: sized, re-iterable collection of keys.
        threshold: a key ``k2`` is kept for ``k1`` when
            ``dist[(k1, k2)] >= threshold``.
        tree: not used by the active code.

    Returns:
        A list of ``(key, 1.0, matches)`` tuples in ``keys`` order, where
        ``matches`` is the list of keys that met the threshold.
    """
    p = Progress("Calculating scores", len(keys), False)
    p.start()

    scores = []

    for k1 in keys :
        matches = [ k2 for k2 in keys if dist[(k1, k2)] >= threshold ]

        p.increment()

        # the score is fixed at 1.0; the disabled alternatives were a
        # phylogenetic metric computed from tree and 1 / len(matches)
        scores.append((k1, 1.0, matches))

    p.end()

    return scores
if re.match(".+\.\d+", desc[3]) : names[name].append((desc[3], fields[2])) except IndexError : self.log.warn("could not split line from blastn: %s" % str(fields)) continue p = Progress("OTU naming (%s)" % method, sum([len(i) for i in names.values()]) if method == 'taxonomy' else len(names)) p.start() # now generate labels if method == 'blast' : for name in names : tmp = names[name][0] names[name] = "%s_%s" % (self.__get_desc_wrapper(tmp[0], method, 10), tmp[1]) p.increment() elif method == 'taxonomy' : for name in names : names[name] = "%s" % (self.__get_desc_common(names[name], method, p)) elif method == 'blastlocal' : for name in names : names[name] = "%s" % (self.__merge_taxonomy(names[name])) p.increment() p.end() return names class PyroDist(ExternalProgram) : def __init__(self) : super(PyroDist, self).__init__('PyroDist')