def token_count_unique(self, token, estimate=False):
    """Count how many distinct streams contain *token*.

    The suffix-array hits are walked in sorted file-offset order; any
    hit that lands before the end of the stream already counted is
    skipped, so each stream contributes at most once.  With
    estimate=True the stream boundary is approximated as an even split
    of the file (self.length / self.numstreams) instead of being
    resolved exactly through offset_to_index().
    """
    # No match in the suffix array at all: the token occurs nowhere.
    if not pysary.saryer_search2(self.sary, token):
        return 0
    pysary.saryer_sort_occurrences(self.sary)
    total = pysary.saryer_count_occurrences(self.sary)
    boundary = 0      # file offset where the current stream ends
    unique = 0
    stream = 0
    for _ in xrange(total):
        offset = pysary.saryer_get_next_offset(self.sary)
        assert offset >= 0
        if offset < boundary:
            # Still inside a stream that was already counted.
            continue
        unique += 1
        if estimate:
            # hack: pretend every stream is an equal slice of the file
            boundary = offset + int(self.length / self.numstreams)
        else:
            (stream, boundary) = self.offset_to_index(offset)
        # NOTE(review): a falsy boundary appears to signal that no
        # further stream can be attributed -- confirm against
        # offset_to_index()'s contract.
        if not boundary:
            break
    return unique
def token_count_unique(self, token, estimate=False):
    """Count the number of distinct streams in which *token* occurs.

    Occurrences reported by the suffix array are scanned in sorted
    file-offset order; every occurrence that falls before the end of
    the stream already counted is skipped, so each stream is counted
    at most once.

    token    -- string to look up in the suffix array
    estimate -- if True, approximate each stream's extent as an even
                split of the file (self.length / self.numstreams)
                instead of resolving the exact boundary with
                offset_to_index(); faster but inexact
    Returns 0 when the token does not occur at all.
    """
    # NOTE(review): an identical token_count_unique is defined earlier
    # in this file; Python keeps only this later definition.
    if not pysary.saryer_search2(self.sary, token):
        return 0
    pysary.saryer_sort_occurrences(self.sary)
    count = pysary.saryer_count_occurrences(self.sary)
    # Hoisted loop invariant: the estimated per-stream span does not
    # change while we scan, so compute it once.
    if estimate:
        stride = int(self.length / self.numstreams)  # hack: even split
    nextone = 0       # file offset at which the current stream ends
    uniquecount = 0
    streami = 0
    for _ in xrange(count):
        fileoffset = pysary.saryer_get_next_offset(self.sary)
        assert fileoffset >= 0
        if fileoffset < nextone:
            continue  # still inside an already-counted stream
        uniquecount += 1
        if estimate:
            nextone = fileoffset + stride
        else:
            (streami, nextone) = self.offset_to_index(fileoffset)
        # NOTE(review): a falsy boundary appears to mean no further
        # stream can be attributed -- confirm offset_to_index()'s
        # contract before relying on this.
        if not nextone:
            break
    return uniquecount
def token_count(self, token):
    """Return the total number of occurrences of *token*, 0 if absent."""
    found = pysary.saryer_search2(self.sary, token)
    if not found:
        return 0
    return pysary.saryer_count_occurrences(self.sary)