def test_cryo_diff_pipe_init(self):
    pipeline = Pipeline(BasicCleaner())
    output1 = pipeline(self.docs)

    pipeline = Pipeline(BasicCleaner())
    output2 = pipeline(self.docs)
    self.assertEqual(output1, output2)

    # Make sure cryo picks up on differently initialized classes
    pipeline = Pipeline(BasicCleaner(lowercase=False))
    output3 = pipeline(self.docs)
    self.assertNotEqual(output1, output3)
def test_clean(self):
    doc = '''
    Goats are like mushrooms.
    If you shoot a duck, I'm scared of toasters.
    My site's are https://google.com.
    '''
    expected_doc = '''
    goats are like mushrooms
    if you shoot a duck im scared of toasters
    my site are
    '''
    doc = BasicCleaner().preprocess([doc])[0]
    self.assertEqual(doc, expected_doc.strip())
def test_nested_pipeline(self):
    docs = ['<div>{}</div>'.format(d) for d in self.docs]
    expected = [[
        'time', 'vast', 'empty', 'space', 'time', 'continue', 'dimension',
        'hold', 'nonexistence', 'great', 'spring', 'displeased',
        'nicolas cage', 'existence'
    ], [
        'galactic', 'ocean', 'float', 'hand', 'grasp', 'look', 'glorious',
        'eye', 'instantaneously', 'begin', 'stretch', 'bend', 'find',
        'amusement', 'handling', 'sacred', 'galactic', 'sea', 'mighty',
        'hand', 'ocean', 'sacred', 'warmth', 'mighty', 'palm', 'cage reach',
        'nicolas cage', 'reach'
    ]]
    nested_pipeline = Pipeline(HTMLCleaner(), BasicCleaner(), refresh=True)
    pipeline = Pipeline(nested_pipeline, OverkillTokenizer(), refresh=True)
    output = pipeline(docs)
    for o, e in zip(output, expected):
        self.assertEqual(set(o), set(e))
def test_nested_multipipeline(self):
    docs = ['<div>{}</div>'.format(d) for d in self.docs]
    expected = [[[
        'vast', 'empty', 'space', 'time', 'continue', 'dimension', 'hold',
        'nonexistence', 'great', 'spring', 'displeased', 'nicolas cage',
        'existence'
    ], [
        'galactic', 'ocean', 'float', 'hand', 'grasp', 'look', 'glorious',
        'eye', 'instantaneously', 'begin', 'stretch', 'bend', 'find',
        'amusement', 'handling', 'sacred', 'galactic', 'sea', 'mighty',
        'hand', 'ocean', 'sacred', 'warmth', 'mighty', 'palm', 'cage reach',
        'nicolas cage', 'reach'
    ]], [[
        'great nicolas cage', 'vast empty', 'sprung', 'nonexistence',
        'dimensions', 'held', 'existence', 'displeased', 'continue', 'time',
        'space'
    ], [
        'sacred galactic seas', 'galactic ocean floated',
        'nicolas cage reached', 'cage reached', 'sacred warmth',
        'glorious eyes', 'mighty palms', 'found amusement',
        'instantaneously began', 'mighty hand', 'ocean', 'hand', 'looked',
        'stretch', 'grasped', 'handling', 'bend'
    ]]]
    nested_multipipeline = Pipeline(
        BasicCleaner(),
        [OverkillTokenizer(min_count=1, threshold=0.1), RAKETokenizer()],
        refresh=True)
    pipeline = Pipeline(HTMLCleaner(), nested_multipipeline, refresh=True)
    outputs = pipeline(docs)
    for i, output in enumerate(outputs):
        for o, e in zip(output, expected[i]):
            self.assertEqual(set(o), set(e))
def test_incompatible_pipeline(self):
    # A tokenizer's output is not valid input for a cleaner,
    # so constructing this pipeline should raise.
    self.assertRaises(Exception, Pipeline,
                      OverkillTokenizer(), BasicCleaner(),
                      refresh=True)
from time import time
from glob import glob

from broca import Pipeline
from broca.preprocess import HTMLCleaner, BasicCleaner
from broca.tokenize.keyword import OverkillTokenizer
from broca.knowledge.idf import train_idf
from broca.knowledge.util import files_stream

s = time()

print('Loading documents...')
files = glob('bodies/*.txt')
docs = [d for d in files_stream(files)]

tkn = OverkillTokenizer(n_jobs=-1)
pipeline = Pipeline(
    HTMLCleaner(n_jobs=-1),
    BasicCleaner(n_jobs=-1),
    tkn,
    refresh=True)

print('Computing pipeline...')
tokens = pipeline(docs)

print('Training IDF...')
train_idf(tokens, out='nyt_idf.json')

print('Took {:.2f}s'.format(time() - s))

# Persist the learned phrase models so they can be reused later.
tkn.bigram.save('nyt.bigram')
tkn.trigram.save('nyt.trigram')
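# The saved phrase models can be reloaded in later runs to skip recomputing
# phrase detection. A minimal sketch, assuming tkn.bigram / tkn.trigram are
# gensim Phrases models (which expose a matching .load()) and that
# OverkillTokenizer accepts precomputed models via bigram=/trigram= keyword
# arguments -- check the broca source for the exact parameter names.
#
#   from gensim.models import Phrases
#
#   bigram = Phrases.load('nyt.bigram')
#   trigram = Phrases.load('nyt.trigram')
#   tkn = OverkillTokenizer(bigram=bigram, trigram=trigram, n_jobs=-1)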