def build_distance_matrix(self, fname):
    """Build a symmetric pairwise matrix for every record in a FASTQ file.

    Each record read from ``fname`` is labelled with its id minus the
    first character (``seq.id[1:]``).  Every unordered pair of records is
    passed to ``self.distance`` once and the result is stored under both
    key orders.

    Parameters:
        fname: path handed to ``FastqFile``.

    Returns:
        A tuple ``(dist, labels)`` where ``dist`` maps
        ``(label_a, label_b)`` to the value returned by ``self.distance``
        and ``labels`` lists the labels in file order.  The diagonal
        entries ``(label, label)`` are fixed at 1.0.
    """
    fq = FastqFile(fname)
    fq.open()
    try:
        records = [(seq.id[1:], seq.sequence) for seq in fq]
    finally:
        # Release the file even if reading a record raises.
        fq.close()

    # One progress tick per unordered pair: n * (n - 1) / 2.
    pair_count = (len(records) * (len(records) - 1)) / 2.0
    p = Progress("Calculating distance matrix", pair_count, False)
    p.start()

    dist = {}
    for index, (label1, seq1) in enumerate(records):
        # NOTE(review): the self-entry is 1.0 rather than 0.0, so
        # self.distance presumably yields a similarity-like value - confirm.
        dist[(label1, label1)] = 1.0

        # Only visit records after this one; the mirrored key is filled in
        # at the same time so each pair is computed once.
        for label2, seq2 in records[index + 1:]:
            value = self.distance([seq1, seq2])
            dist[(label1, label2)] = value
            dist[(label2, label1)] = value
            p.increment()

    p.end()

    return dist, [label for label, _ in records]
def create_clusters(self, keys=None, homopolymer_correction=True):
    """Greedily assign sequences to clusters, most duplicated first.

    Keys are processed in descending order of their ``duplicates`` count.
    A sequence joins the first cluster in ``self.clusters`` whose first
    member has an ``alignment_similarity`` of at least
    ``self.similarity_threshold``; otherwise it starts a new
    single-member cluster.

    Parameters:
        keys: iterable of keys understood by ``self.db.get``.  When
            ``None`` (the default), ``self.db`` itself is iterated.
        homopolymer_correction: forwarded unchanged to
            ``self.alignment_similarity``.

    Returns:
        None.  ``self.clusters`` (a list of lists of keys) is updated in
        place and kept sorted largest cluster first.
    """
    # Identity test: "== None" would invoke a custom __eq__ on keys.
    if keys is None:
        keys = self.db

    seqcount = collections.Counter()
    for key in keys:
        seqcount[key] = self.db.get(key).duplicates

    p = Progress("Clustering", len(seqcount))
    p.start()

    for key, _ in seqcount.most_common():
        seq = self.db.get(key)

        for cluster in self.clusters:
            # The first member of a cluster acts as its representative.
            representative = self.db.get(cluster[0])
            similarity = self.alignment_similarity(representative, seq, homopolymer_correction)
            if similarity >= self.similarity_threshold:
                cluster.append(key)
                break
        else:
            # No existing cluster was similar enough: seed a new one.
            self.clusters.append([key])

        # Keep the largest clusters at the front so they are tried first
        # for the next sequence.
        self.clusters.sort(key=len, reverse=True)

        p.increment()

    p.end()
def preprocess(self):
    """Preprocess every configured input file and record a summary.

    Exits the process with status 1 (after logging an error) when
    ``self.options['input-files']`` is empty.  Otherwise a fresh
    ``SequenceDB`` is stored on ``self.seqdb``, one ``Sample`` is built
    and printed per file yielded by ``self.__get_files``, read totals are
    printed, and - if at least one sample was built - the summary file
    named by ``self.options['summary-file']`` is updated and rewritten.

    Returns:
        0 on completion.
    """
    if len(self.options['input-files']) == 0:
        self.log.error("nothing to do")
        sys.exit(1)

    self.seqdb = SequenceDB(preprocessed=False)

    p = Progress("Preprocessing", len(self.options['input-files']))
    p.start()

    samples = []
    for f in self.__get_files(self.options['input-files']):
        mid = self.__mid_fastq(f)
        sample = Sample(
            f,
            self.options['outdir'],
            self.seqdb,
            self.__filters(mid),
            chimeras=self.options['chimeras'],
        )
        sample.print_sample()
        samples.append(sample)
        p.increment()

    p.end()

    # Totals across all samples: reads dropped by filters, reads kept,
    # and distinct sequences among the kept reads.
    rejected_reads = sum(sum(s.filters.counts) for s in samples)
    accepted_reads = sum(len(s) for s in samples)
    unique_seq = sum(len(s.seqcounts) for s in samples)

    message = "processed %s reads, accepted %d (of which %d are unique)" % \
        (rejected_reads + accepted_reads, accepted_reads, unique_seq)
    print(message)

    if samples:
        summary_file = Summary(self.options['summary-file'])
        summary_file.update(self.__build_summary(samples))
        summary_file.write(self.options['summary-file'])

    # NOTE(review): the summary is shown in verbose mode whether or not
    # any sample was processed - confirm this is the intended nesting.
    if self.options['verbose']:
        self.summary()

    return 0
def read_nematodes(self, fastq_fname, fprimer, rprimer, diffs, length):
    """Extract the region after the forward primer from Nematoda records.

    Records whose id does not contain ``'Nematoda'`` are ignored.  Each
    remaining record has ``ungap()`` and ``back_translate()`` called on
    it and is labelled with the first whitespace-separated token of its
    id, minus the leading character.  For every record in which
    ``IUPAC.seq_position`` locates ``fprimer`` (any result other than
    -1), up to ``length`` characters immediately following the primer
    are kept, provided that slice contains no ``'N'``.

    Parameters:
        fastq_fname: path handed to ``FastqFile``.
        fprimer: forward primer sequence to search for.
        rprimer: reverse primer sequence.  Currently unused: the
            reverse-primer check is disabled.
        diffs: forwarded to ``IUPAC.seq_position``.
        length: maximum number of characters to keep after the primer.

    Returns:
        A tuple ``(hits, acc2name)``.  ``hits`` is a list of
        ``(label, subsequence)`` pairs; ``acc2name`` maps every Nematoda
        label (matched or not) to the part of the original id starting
        at ``'Nematoda'``.
    """
    candidates = []
    acc2name = {}

    # Read in sequences.
    fq = FastqFile(fastq_fname)
    fq.open()
    try:
        for seq in fq:
            if 'Nematoda' not in seq.id:
                continue

            seq.ungap()
            seq.back_translate()

            new_id = seq.id.split()[0][1:]
            candidates.append((new_id, seq.sequence))
            acc2name[new_id] = seq.id[seq.id.find('Nematoda'):]
    finally:
        # Release the file even if a record fails to parse or convert.
        fq.close()

    # Test sequences for the forward primer.
    p = Progress("Looking for primer sequences", len(candidates), False)
    p.start()

    primer_length = len(fprimer)
    hits = []
    for label, seq in candidates:
        findex = IUPAC.seq_position(fprimer, seq, diffs)
        if findex != -1:
            # The slice starts right after the primer; Python slicing
            # silently yields fewer than `length` characters when the
            # sequence ends early.
            begin = findex + primer_length
            shortseq = seq[begin:begin + length]
            if 'N' not in shortseq:
                hits.append((label, shortseq))
        p.increment()

    p.end()

    return hits, acc2name
def assign_scores(self, dist, keys, threshold, tree):
    """List, for every key, the keys whose matrix entry meets a threshold.

    Parameters:
        dist: mapping indexed by ``(key_a, key_b)`` tuples; it must
            contain an entry for every ordered pair drawn from ``keys``,
            including ``(k, k)``.
        keys: sequence of keys to score.
        threshold: a key ``k2`` is kept for ``k1`` when
            ``dist[(k1, k2)] >= threshold``.
        tree: unused by the current implementation.

    Returns:
        A list of ``(key, 1.0, matches)`` tuples in ``keys`` order, where
        ``matches`` lists the qualifying keys.  The score component is
        always the constant 1.0.
    """
    p = Progress("Calculating scores", len(keys), False)
    p.start()

    scores = []
    for k1 in keys:
        matches = [k2 for k2 in keys if dist[(k1, k2)] >= threshold]
        p.increment()
        # Every key gets the same fixed score; only the match list varies.
        scores.append((k1, 1.0, matches))

    p.end()

    return scores
# now generate labels if method == 'blast' : for name in names : tmp = names[name][0] names[name] = "%s_%s" % (self.__get_desc_wrapper(tmp[0], method, 10), tmp[1]) p.increment() elif method == 'taxonomy' : for name in names : names[name] = "%s" % (self.__get_desc_common(names[name], method, p)) elif method == 'blastlocal' : for name in names : names[name] = "%s" % (self.__merge_taxonomy(names[name])) p.increment() p.end() return names class PyroDist(ExternalProgram) : def __init__(self) : super(PyroDist, self).__init__('PyroDist') self.command = "PyroDist -in %s -out %s -rin %s" def __lookup_file(self) : f = join(dirname(ExternalProgram.get_path(self.programname)), "LookUp.dat") if not os.path.exists(f) : raise ExternalProgramError("%s - cannot find LookUp.dat (it needs to be in the same dir as %s)" % (self.programname, self.programname)) return f