Ejemplo n.º 1
0
Archivo: scores.py Proyecto: ajm/seance
    def build_distance_matrix(self, fname) :
        """Compute a pairwise score for every pair of sequences in a FASTQ file.

        All records are read from ``fname`` first; ``self.distance`` is then
        called once per unordered pair and the result is stored under both
        key orders so lookups do not depend on argument order.

        Args:
            fname: path handed to ``FastqFile``.

        Returns:
            A ``(dist, labels)`` tuple. ``dist`` maps ``(label_a, label_b)``
            to the value returned by ``self.distance``; every
            ``(label, label)`` entry is fixed at 1.0. ``labels`` lists the
            record labels in the order they were read.
        """
        fq = FastqFile(fname)
        fq.open()

        # close the file even if reading a record fails part-way through
        try :
            # seq.id[1:] drops the first character of the record id
            # (presumably the FASTQ '@' marker -- confirm against FastqFile)
            records = [ (seq.id[1:], seq.sequence) for seq in fq ]
        finally :
            fq.close()

        # one progress tick per unordered pair: n * (n - 1) / 2
        p = Progress("Calculating distance matrix", (len(records)*(len(records)-1)) / 2.0, False)
        p.start()

        dist = {}
        for index, (label1, seq1) in enumerate(records) :
            dist[(label1,label1)] = 1.0

            # only pair with records that come later, each pair is computed once
            for label2, seq2 in records[index+1:] :
                score = self.distance([seq1, seq2])
                dist[(label1,label2)] = score
                dist[(label2,label1)] = score

                p.increment()

        p.end()

        return dist, [ label for label, _ in records ]
Ejemplo n.º 2
0
    def create_clusters(self, keys=None, homopolymer_correction=True) :
        """Greedily assign sequences to clusters, most-duplicated first.

        Sequences are visited in descending order of their ``duplicates``
        count. Each one is compared against the first member of every
        existing cluster and joins the first cluster whose
        ``self.alignment_similarity`` reaches ``self.similarity_threshold``;
        if none matches it starts a new single-member cluster. Results are
        accumulated in ``self.clusters`` (a list of lists of keys).

        Args:
            keys: iterable of keys into ``self.db``; when omitted,
                ``self.db`` itself is iterated.
            homopolymer_correction: forwarded unchanged to
                ``self.alignment_similarity``.
        """
        # identity test: "== None" would invoke any custom __eq__ on keys
        if keys is None :
            keys = self.db

        seqcount = collections.Counter()
        for key in keys :
            seqcount[key] = self.db.get(key).duplicates

        p = Progress("Clustering", len(seqcount))
        p.start()

        for key, _ in seqcount.most_common() :
            seq = self.db.get(key)

            for c in self.clusters :
                # c[0] is the cluster's first member and acts as its representative
                if self.alignment_similarity(self.db.get(c[0]), seq, homopolymer_correction) >= self.similarity_threshold :
                    c.append(key)
                    break
            else :
                # no existing cluster was similar enough
                self.clusters.append([key])

            # re-sort after every assignment so the next sequence is tested
            # against the largest clusters first
            self.clusters.sort(key=len, reverse=True)
            p.increment()

        p.end()
Ejemplo n.º 3
0
    def preprocess(self) :
        """Build a Sample for every input file and record the run summary.

        Logs an error and exits the process with status 1 when no input
        files are configured. Otherwise creates a fresh ``SequenceDB`` on
        ``self.seqdb``, constructs and prints one ``Sample`` per file,
        prints overall read counts, and, if any samples were built, updates
        and writes the summary file named by ``self.options['summary-file']``.

        Returns:
            0 on completion.
        """
        input_files = self.options['input-files']

        # truthiness also covers a missing (None) file list, not just an empty one
        if not input_files :
            self.log.error("nothing to do")
            sys.exit(1)

        self.seqdb = SequenceDB(preprocessed=False)

        p = Progress("Preprocessing", len(input_files))
        p.start()

        samples = []

        for f in self.__get_files(input_files) :
            mid = self.__mid_fastq(f)

            sample = Sample(f,
                        self.options['outdir'],
                        self.seqdb,
                        self.__filters(mid),
                        chimeras=self.options['chimeras'])

            sample.print_sample()
            samples.append(sample)

            p.increment()

        p.end()

        rejected_reads = sum(sum(s.filters.counts) for s in samples)
        accepted_reads = sum(len(s) for s in samples)
        unique_seq = sum(len(s.seqcounts) for s in samples)

        # a single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function
        print("processed %s reads, accepted %d (of which %d are unique)" % \
                (rejected_reads + accepted_reads, accepted_reads, unique_seq))

        if samples :
            summary_data = self.__build_summary(samples)

            summary_file = Summary(self.options['summary-file'])
            summary_file.update(summary_data)
            summary_file.write(self.options['summary-file'])

        if self.options['verbose'] :
            self.summary()

        return 0
Ejemplo n.º 4
0
Archivo: scores.py Proyecto: ajm/seance
    def read_nematodes(self, fastq_fname, fprimer, rprimer, diffs, length) :
        """Extract the region following the forward primer from Nematoda records.

        Records whose id does not contain 'Nematoda' are ignored. The rest
        are ungapped and back-translated, then searched for ``fprimer``.

        Args:
            fastq_fname: path handed to ``FastqFile``.
            fprimer: forward primer, located with ``IUPAC.seq_position``.
            rprimer: reverse primer; accepted but not used by this method
                at present.
            diffs: passed through to ``IUPAC.seq_position`` (presumably the
                number of tolerated mismatches -- confirm).
            length: maximum number of characters to keep after the primer.

        Returns:
            A ``(fragments, acc2name)`` tuple. ``fragments`` is a list of
            ``(label, subsequence)`` for records where the primer was found
            and the extracted subsequence contains no 'N'. ``acc2name`` maps
            the label of every Nematoda record to the part of its id from
            'Nematoda' onwards.
        """
        records = []
        acc2name = {}

        # read in sequences
        fq = FastqFile(fastq_fname)
        fq.open()

        # close the file even if a record fails to parse or convert
        try :
            for seq in fq :
                if 'Nematoda' not in seq.id :
                    continue

                seq.ungap()
                seq.back_translate()

                # label = first whitespace-delimited token of the id, minus
                # its leading character
                new_id = seq.id.split()[0][1:]
                records.append((new_id, seq.sequence))

                acc2name[new_id] = seq.id[seq.id.find('Nematoda'):]
        finally :
            fq.close()

        # test sequences
        p = Progress("Looking for primer sequences", len(records), False)
        p.start()

        primer_len = len(fprimer)
        fragments = []

        for label, sequence in records :
            findex = IUPAC.seq_position(fprimer, sequence, diffs)

            # -1 means the forward primer was not found
            if findex != -1 :
                start = findex + primer_len
                shortseq = sequence[start : start + length]

                # discard fragments containing ambiguous 'N' positions
                if 'N' not in shortseq :
                    fragments.append((label, shortseq))

            p.increment()

        p.end()

        return fragments, acc2name
Ejemplo n.º 5
0
Archivo: scores.py Proyecto: ajm/seance
    def assign_scores(self, dist, keys, threshold, tree) :
        """Pair every key with the keys whose dist entry meets the threshold.

        Args:
            dist: mapping indexed by ``(key_a, key_b)`` tuples.
            keys: keys to score; each is compared against all keys,
                itself included.
            threshold: minimum ``dist`` value for a key to count as a match.
            tree: not used by the current scoring.

        Returns:
            A list of ``(key, 1.0, matches)`` tuples in the order of
            ``keys``; the score is the constant 1.0 for every key.
        """
        p = Progress("Calculating scores", len(keys), False)
        p.start()

        scores = []

        for current in keys :
            matches = [ other for other in keys if dist[(current, other)] >= threshold ]
            p.increment()

            # fixed placeholder score; neither the tree nor the number of
            # matches influences it
            scores.append((current, 1.0, matches))

        p.end()

        return scores
Ejemplo n.º 6
0
Archivo: tools.py Proyecto: ajm/seance

            if method == 'blastlocal' :
                names[name].append(acc2tax.get(fields[1], "unknown"))
            else :
                try :
                    desc = fields[1].split('|')

                    if re.match(".+\.\d+", desc[3]) :
                        names[name].append((desc[3], fields[2]))

                except IndexError :
                    self.log.warn("could not split line from blastn: %s" % str(fields))
                    continue

        p = Progress("OTU naming (%s)" % method, sum([len(i) for i in names.values()]) if method == 'taxonomy' else len(names))
        p.start()

        # now generate labels
        if method == 'blast' :
            for name in names :
                tmp = names[name][0]
                names[name] = "%s_%s" % (self.__get_desc_wrapper(tmp[0], method, 10), tmp[1])
                p.increment()
        elif method == 'taxonomy' :
            for name in names :
                names[name] = "%s" % (self.__get_desc_common(names[name], method, p))
        elif method == 'blastlocal' :
            for name in names :
                names[name] = "%s" % (self.__merge_taxonomy(names[name]))
                p.increment()