def save(self, filename):
    """Write every stem's records to ``filename`` as CSV rows.

    Stems are taken from ``self.words`` and emitted in descending order of
    their ``count`` attribute. Each record yielded by a stem's ``dump()``
    becomes one row: ``[stem.stem, stem.count]`` followed by the fields of
    that record. The rows are handed to ``ascmini.csv_save``.

    Always returns True.
    """
    ranked = sorted(
        (self.words[key] for key in self.words),
        key=lambda entry: entry.count,
        reverse=True,
    )
    rows = []
    for entry in ranked:
        # The same two leading columns are shared by every record of a stem.
        prefix = [entry.stem, entry.count]
        rows.extend(prefix + list(record) for record in entry.dump())
    ascmini.csv_save(rows, filename)
    return True
def test3():
    """Rebuild lemma tables from 'bnc-clear.csv'.

    Reads rows with ``ascmini.csv_load`` and uses column 0 as the root,
    column 1 as the root's total count, column 3 as the word form
    (lower-cased) and column 4 as that form's count. Forms identical to
    their root are skipped. Every remaining form is added to a ``WordRoot``
    under the '*' tag.

    Outputs:
      * 'bnc-lemma.txt' -- one line per root that has at least one dumped
        form, formatted ``root/count -> form/count,form/count,...``, sorted
        in descending order of (count, root, text).
      * 'bnc-test.csv'  -- tuples ``(root, root count, tag, form, count)``
        sorted in descending order of (root count, root).

    Prints the number of distinct roots and returns 0.
    """
    import ascmini

    rows = ascmini.csv_load('bnc-clear.csv')
    words = {}
    for row in rows:
        root = row[0]
        size = int(row[1])
        # row[2] is not used here: every form is filed under '*'.
        word = row[3].lower()
        count = int(row[4])
        if word == root:
            continue
        if root not in words:
            stem = WordRoot(root)
            words[root] = stem
        else:
            stem = words[root]
        stem.add('*', word, count)
        # Assigned after add() so the value from the CSV is what remains
        # in stem.count.
        stem.count = size

    output = []
    lemmas = []
    for stem in words.values():
        part = []
        for c5, word, count in stem.dump():
            output.append((stem.root, stem.count, c5, word, count))
            part.append('%s/%d' % (word, count))
        if not part:
            continue
        text = '%s/%d -> ' % (stem.root, stem.count)
        lemmas.append((stem.count, stem.root, text + ','.join(part)))

    output.sort(key=lambda x: (x[1], x[0]), reverse=True)
    lemmas.sort(reverse=True)

    # 'with' guarantees the file is flushed and closed, including when a
    # write raises; the handle was previously never closed explicitly.
    with open('bnc-lemma.txt', 'w') as fp:
        for _, _, text in lemmas:
            fp.write(text + '\n')

    ascmini.csv_save(output, 'bnc-test.csv')
    # Single pre-formatted argument: prints the same text whether print is
    # a statement (Python 2) or a function (Python 3).
    print('count %d' % len(words))
    return 0