Code example #1
0
    def test_cryo_diff_pipe_init(self):
        """Pipelines built from identically-configured components give the
        same output, while a differently-initialized component does not
        (i.e. the cryo cache keys on constructor arguments)."""
        first = Pipeline(BasicCleaner())(self.docs)
        second = Pipeline(BasicCleaner())(self.docs)
        self.assertEqual(first, second)

        # A non-default constructor argument must be seen as a different
        # component, producing different output.
        third = Pipeline(BasicCleaner(lowercase=False))(self.docs)
        self.assertNotEqual(first, third)
Code example #2
0
    def test_clean(self):
        """BasicCleaner lowercases, drops punctuation/contractions/possessives,
        and removes URLs."""
        raw = '''
        Goats are like mushrooms. If you shoot a duck, I'm scared of toasters. My site's are https://google.com.
        '''
        expected = '''
        goats are like mushrooms if you shoot a duck im scared of toasters my site are
        '''
        cleaned = BasicCleaner().preprocess([raw])[0]
        self.assertEqual(cleaned, expected.strip())
Code example #3
0
 def test_nested_pipeline(self):
     """A Pipeline used as a stage of another Pipeline behaves like a
     single preprocessing step (cleaning pipeline feeding a tokenizer)."""
     docs = ['<div>{}</div>'.format(d) for d in self.docs]
     expected = [
         ['time', 'vast', 'empty', 'space', 'time', 'continue',
          'dimension', 'hold', 'nonexistence', 'great', 'spring',
          'displeased', 'nicolas cage', 'existence'],
         ['galactic', 'ocean', 'float', 'hand', 'grasp', 'look',
          'glorious', 'eye', 'instantaneously', 'begin', 'stretch',
          'bend', 'find', 'amusement', 'handling', 'sacred', 'galactic',
          'sea', 'mighty', 'hand', 'ocean', 'sacred', 'warmth', 'mighty',
          'palm', 'cage reach', 'nicolas cage', 'reach'],
     ]
     cleaning = Pipeline(HTMLCleaner(), BasicCleaner(), refresh=True)
     full = Pipeline(cleaning, OverkillTokenizer(), refresh=True)
     # Token order is not guaranteed, so compare as sets.
     for tokens, want in zip(full(docs), expected):
         self.assertEqual(set(tokens), set(want))
Code example #4
0
 def test_nested_multipipeline(self):
     """A branching ("multi") stage inside a nested Pipeline fans out into
     one output stream per branch, in branch order."""
     docs = ['<div>{}</div>'.format(d) for d in self.docs]
     # Expected tokens per document for the OverkillTokenizer branch.
     overkill_expected = [
         ['vast', 'empty', 'space', 'time', 'continue', 'dimension',
          'hold', 'nonexistence', 'great', 'spring', 'displeased',
          'nicolas cage', 'existence'],
         ['galactic', 'ocean', 'float', 'hand', 'grasp', 'look',
          'glorious', 'eye', 'instantaneously', 'begin', 'stretch',
          'bend', 'find', 'amusement', 'handling', 'sacred', 'galactic',
          'sea', 'mighty', 'hand', 'ocean', 'sacred', 'warmth', 'mighty',
          'palm', 'cage reach', 'nicolas cage', 'reach'],
     ]
     # Expected tokens per document for the RAKETokenizer branch.
     rake_expected = [
         ['great nicolas cage', 'vast empty', 'sprung', 'nonexistence',
          'dimensions', 'held', 'existence', 'displeased', 'continue',
          'time', 'space'],
         ['sacred galactic seas', 'galactic ocean floated',
          'nicolas cage reached', 'cage reached', 'sacred warmth',
          'glorious eyes', 'mighty palms', 'found amusement',
          'instantaneously began', 'mighty hand', 'ocean', 'hand',
          'looked', 'stretch', 'grasped', 'handling', 'bend'],
     ]
     expected = [overkill_expected, rake_expected]
     branching = Pipeline(
         BasicCleaner(),
         [OverkillTokenizer(min_count=1, threshold=0.1), RAKETokenizer()],
         refresh=True)
     full = Pipeline(HTMLCleaner(), branching, refresh=True)
     # Token order is not guaranteed, so compare per-document as sets.
     for branch_output, branch_expected in zip(full(docs), expected):
         for tokens, want in zip(branch_output, branch_expected):
             self.assertEqual(set(tokens), set(want))
Code example #5
0
 def test_incompatible_pipeline(self):
     """Chaining stages in an invalid order (tokenizer before cleaner)
     must raise at Pipeline construction time."""
     with self.assertRaises(Exception):
         Pipeline(OverkillTokenizer(), BasicCleaner(), refresh=True)
Code example #6
0
from time import time
from glob import glob
from broca import Pipeline
from broca.preprocess import HTMLCleaner, BasicCleaner
from broca.tokenize.keyword import OverkillTokenizer
from broca.knowledge.idf import train_idf
from broca.knowledge.util import files_stream

# Train an IDF model (and the tokenizer's phrase models) over a corpus of
# text files in ./bodies, timing the whole run.
start = time()

print('Loading documents...')
files = glob('bodies/*.txt')
# Materialize the stream once up front: the docs are fed through the
# pipeline below. (list(...) instead of a pass-through comprehension.)
docs = list(files_stream(files))

# n_jobs=-1: use every available core in each stage.
tkn = OverkillTokenizer(n_jobs=-1)

pipeline = Pipeline(HTMLCleaner(n_jobs=-1),
                    BasicCleaner(n_jobs=-1),
                    tkn,
                    refresh=True)

print('Computing pipeline...')
tokens = pipeline(docs)

print('Training IDF...')
train_idf(tokens, out='nyt_idf.json')

print('Took {:.2f}s'.format(time() - start))

# Persist the phrase detectors learned while tokenizing so later runs can
# reuse them without retraining.
tkn.bigram.save('nyt.bigram')
tkn.trigram.save('nyt.trigram')