def test_sample_db(self):
    """Load the bundled ``sample.discodb`` file and check it yields 150 records."""
    # The sample database sits in the same directory as this test module.
    here = os.path.dirname(__file__)
    sample_path = os.path.join(here, "sample.discodb")

    # NOTE(review): this DataSet instance is never used below -- confirm
    # whether the construction is needed before removing it.
    dataset = DataSet()

    server = DiscoDBServer(docs=sample_path)
    rows = server.evaluate(LoadOp('docs'))

    eq_(len(list(rows)), 150)
def test_discodb_server(self):
    """Evaluating a bare ``LoadOp`` on the users table returns every record."""
    expected_rows = [
        ('John', 2),
        ('Bob', 1),
    ]

    # NOTE(review): this DataSet instance is never used below -- confirm
    # whether the construction is needed before removing it.
    dataset = DataSet()

    server = DiscoDBServer(users=self.db)

    # Return all records from the users table.
    load_users = LoadOp('users')
    actual_rows = list(server.evaluate(load_users))

    assert_sequence_equal(actual_rows, expected_rows)
def test_sample_db(self):
    """Check that the bundled ``sample.discodb`` file loads as 150 records."""
    # Resolve the sample database relative to this test module.
    module_dir = os.path.dirname(__file__)
    sample_path = os.path.join(module_dir, "sample.discodb")

    # NOTE(review): this DataSet instance is never used below -- confirm
    # whether the construction is needed before removing it.
    dataset = DataSet()

    server = DiscoDBServer(docs=sample_path)
    loaded = server.evaluate(LoadOp('docs'))
    record_count = len(list(loaded))

    eq_(record_count, 150)
def test_discodb_server(self):
    """A plain ``LoadOp('users')`` should yield all rows of the users table."""
    # NOTE(review): this DataSet instance is never used below -- confirm
    # whether the construction is needed before removing it.
    dataset = DataSet()

    server = DiscoDBServer(users=self.db)

    # Return all records from the users table.
    rows = list(server.evaluate(LoadOp('users')))

    expected = [('John', 2), ('Bob', 1)]
    assert_sequence_equal(rows, expected)
def init(**tables):
    """
    Build a DataSet backed by DiscoDB tables plus the bundled sample files.

    The keyword arguments in ``tables`` are forwarded unchanged to
    ``DiscoDBServer``. Two ``FileServer`` instances are then registered,
    exposing ``common_crawl`` and ``top_sites`` from the ``data`` directory
    next to this module. Finally the ``top_10``, ``outbound_links``,
    ``scripts`` and ``servers`` views are created.

    Returns the configured dataset.
    """
    # Both file-backed tables read from the "data" directory beside this module.
    data_dir = join(dirname(__file__), 'data')

    # Raw documents from http://commoncrawl.org
    common_crawl_table = {
        'root_dir': data_dir,
        'pattern': "sample.arc.gz",
        'decode': "application/x-arc",
    }

    # Top Sites as reported by Alexa
    top_sites_table = {
        'root_dir': data_dir,
        'pattern': "alexa-top1m-{date}.csv",
        'decode': "auto",
        'schema': {
            'fields': [
                {'name': "date", 'type': "DATE"},
                {'name': "rank", 'type': "STRING"},
                {'name': "site", 'type': "STRING"},
            ],
        },
    }

    dataset = DataSet()
    dataset.add_server(DiscoDBServer(**tables))
    dataset.add_server(FileServer(common_crawl=common_crawl_table))
    dataset.add_server(FileServer(top_sites=top_sites_table))

    # View limited to 10 rows of top_sites.
    top_sites_query = dataset.frm('top_sites')
    top_sites_query.limit(10).create_view('top_10')

    # Views over the flattened ``docs`` table, created in this order.
    view_queries = (
        (
            'outbound_links',
            "select link_to, count() "
            "from flatten(docs, 'link_to') "
            "group by link_to order by count desc",
        ),
        (
            'scripts',
            "select scripts as script, count() "
            "from flatten(docs, 'scripts') "
            "group by script order by count desc",
        ),
        (
            'servers',
            "select headers_value as server_name, count() "
            "from flatten(docs, 'headers') "
            "where headers_name = 'Server' "
            "group by server_name order by count desc",
        ),
    )
    for view_name, query in view_queries:
        dataset.create_view(view_name, query)

    return dataset