Example #1
0
    def setUp(self):
        self._url = 'https://www.google.co.jp/search?q=ruby'
        self._testdatadir = "./tests/data/"
        self._transaction_id = "transaction_test"
        self._validator = CategoryValidator(self._url, self._transaction_id)
        self._scrapyer = GoogSearchScrapyer()
        self._scoreler = StdScoreler()

        self._relatedurls = ["https://facebook.com", "https://instagram.com"]
        #categorysetterexe
        exepath = "ls"
        self._infile = self._testdatadir + self._transaction_id + "1.scraped"
        self._outfile = self._testdatadir + self._transaction_id + "1.categorized"
        self._categorysetter = CategorySetterExe(exepath, self._infile,
                                                 self._outfile)
        rm_quoat = lambda val: re.sub(r'\"', '', val)
    def setUp(self):
	self._url = 'https://www.google.co.jp/search?q=ruby'
        self._testdatadir    = "./tests/data/"
        self._transaction_id = "transaction_test"
        self._validator= CategoryValidator(self._url,self._transaction_id)
        self._scrapyer       = GoogSearchScrapyer()
        self._scoreler       = StdScoreler()

        self._relatedurls = ["https://facebook.com","https://instagram.com"]
        #categorysetterexe
        exepath = "ls"
        self._infile  = self._testdatadir + self._transaction_id + "1.scraped"
        self._outfile = self._testdatadir + self._transaction_id + "1.categorized"
        self._categorysetter = CategorySetterExe(exepath,self._infile,self._outfile)
        rm_quoat   = lambda val: re.sub(r'\"','',val)
Example #3
0
class TestCategoryValidator(unittest.TestCase):
    def setUp(self):
        self._url = 'https://www.google.co.jp/search?q=ruby'
        self._testdatadir = "./tests/data/"
        self._transaction_id = "transaction_test"
        self._validator = CategoryValidator(self._url, self._transaction_id)
        self._scrapyer = GoogSearchScrapyer()
        self._scoreler = StdScoreler()

        self._relatedurls = ["https://facebook.com", "https://instagram.com"]
        #categorysetterexe
        exepath = "ls"
        self._infile = self._testdatadir + self._transaction_id + "1.scraped"
        self._outfile = self._testdatadir + self._transaction_id + "1.categorized"
        self._categorysetter = CategorySetterExe(exepath, self._infile,
                                                 self._outfile)
        rm_quoat = lambda val: re.sub(r'\"', '', val)

    def test_categorysetterexe(self):
        #set relatedurls(list)
        self._categorysetter.setData(self._relatedurls)
        self._categorysetter.do()
        categorized_urls = self._categorysetter.items()
        self.assertEqual(len(categorized_urls), 4)

    #success
    #use mock in all classes
    def test_success_all_mock(self):
        #scraper mock
        self._scrapyer.target(self._url)
        self._scrapyer.do = Mock()
        self._scrapyer.do.return_value = 0
        self._scrapyer.getRelatedUrl = Mock()
        self._scrapyer.getRelatedUrl.return_value = self._relatedurls
        #scoreler mock
        self._scoreler.analyze = Mock()
        self._scoreler.analyze.return_value = "searchengine"

        #categorysetter mock
        self._categorysetter.setData(self._relatedurls)
        self._categorysetter.do = Mock()
        self._categorysetter.return_value = 0
        self._categorysetter._getCategorizedUrls()

        category = self._validator.do(self._scrapyer, self._scoreler,
                                      self._categorysetter)
        self.assertEqual(category, "searchengine")

        #outputjson
        writer = Url2JsonWriter()
        writer.output(self._transaction_id + ".json", self._validator)

    #use mock in scoreler and categorysetter
    def test_success_scoreler_categorysetter_mock(self):
        #scrapyer real internet access.
        self._scrapyer.target(self._url)

        #scoreler mock
        self._scoreler.analyze = Mock()
        self._scoreler.analyze.return_value = "searchengine"

        #categorysetter mock
        self._categorysetter.setData(self._relatedurls)
        self._categorysetter.do = Mock()
        self._categorysetter.return_value = 0
        self._categorysetter._getCategorizedUrls()

        category = self._validator.do(self._scrapyer, self._scoreler,
                                      self._categorysetter)
        print self._validator.getDetail()
        self.assertEqual(category, "searchengine")

    #use mock in categorysetter
    def test_success_categorysetter_mock(self):
        #scrapyer real internet access.
        self._scrapyer.target(self._url)

        #categorysetter mock
        self._categorysetter.setData(self._relatedurls)
        self._categorysetter.do = Mock()
        self._categorysetter.return_value = 0
        self._categorysetter._getCategorizedUrls()
        #self._categorysetter.do()
        category = self._validator.do(self._scrapyer, self._scoreler,
                                      self._categorysetter)

        print self._validator.getDetail()
        self.assertEqual(category, "lang")

    #error
    #use mock in all classes
    def test_error_categoryvalidator(self):
        #scraper mock cause exception
        self._scrapyer.target(self._url)
        self._scrapyer.do = Mock()
        self._scrapyer.do.side_effect = Exception
        self._scrapyer.getRelatedUrl = Mock()
        self._scrapyer.getRelatedUrl.return_value = self._relatedurls
        #scoreler mock
        self._scoreler.analyze = Mock()
        self._scoreler.analyze.return_value = "searchengine"

        #categorysetter mock
        self._categorysetter.do = Mock()
        self._categorysetter.return_value = 0
        self._categorysetter._getCategorizedUrls()

        self._categorysetter.setData(self._relatedurls)
        #self._categorysetter.do()
        with self.assertRaises(Exception) as cnmgr:
            self._validator.do(self._scrapyer, self._scoreler,
                               self._categorysetter)
        print cnmgr.exception

    #use mock in scoreler and categorysetter
    def test_error_categoryvalidator2(self):
        #scraper mock
        self._scrapyer.target(self._url)
        self._scrapyer.do = Mock()
        self._scrapyer.do.return_value = 0
        self._scrapyer.getRelatedUrl = Mock()
        self._scrapyer.getRelatedUrl.return_value = self._relatedurls
        #scoreler mock cause exception
        self._scoreler.analyze = Mock()
        self._scoreler.analyze.side_effect = Exception

        #categorysetter mock
        self._categorysetter.do = Mock()
        self._categorysetter.return_value = 0
        self._categorysetter._getCategorizedUrls()

        self._categorysetter.setData(self._relatedurls)
        #self._categorysetter.do()
        with self.assertRaises(Exception) as cnmgr:
            self._validator.do(self._scrapyer, self._scoreler,
                               self._categorysetter)
        print cnmgr.exception
class TestCategoryValidator(unittest.TestCase):
    def setUp(self):
	self._url = 'https://www.google.co.jp/search?q=ruby'
        self._testdatadir    = "./tests/data/"
        self._transaction_id = "transaction_test"
        self._validator= CategoryValidator(self._url,self._transaction_id)
        self._scrapyer       = GoogSearchScrapyer()
        self._scoreler       = StdScoreler()

        self._relatedurls = ["https://facebook.com","https://instagram.com"]
        #categorysetterexe
        exepath = "ls"
        self._infile  = self._testdatadir + self._transaction_id + "1.scraped"
        self._outfile = self._testdatadir + self._transaction_id + "1.categorized"
        self._categorysetter = CategorySetterExe(exepath,self._infile,self._outfile)
        rm_quoat   = lambda val: re.sub(r'\"','',val)

    def test_categorysetterexe(self):
        #set relatedurls(list)
        self._categorysetter.setData(self._relatedurls)
        self._categorysetter.do()
        categorized_urls = self._categorysetter.items()
        self.assertEqual(len(categorized_urls),4)
      
    #success
    #use mock in all classes
    def test_success_all_mock(self):
        #scraper mock
        self._scrapyer.target(self._url)
        self._scrapyer.do = Mock()
        self._scrapyer.do.return_value = 0
        self._scrapyer.getRelatedUrl = Mock()
        self._scrapyer.getRelatedUrl.return_value = self._relatedurls
        #scoreler mock
        self._scoreler.analyze = Mock()
        self._scoreler.analyze.return_value = "searchengine"

        #categorysetter mock
        self._categorysetter.setData(self._relatedurls)
        self._categorysetter.do = Mock()
        self._categorysetter.return_value = 0
        self._categorysetter._getCategorizedUrls()             
       
        category = self._validator.do(self._scrapyer,self._scoreler,self._categorysetter)
        self.assertEqual(category,"searchengine")

        #outputjson
        writer = Url2JsonWriter()
        writer.output(self._transaction_id +".json",self._validator)
        
    #use mock in scoreler and categorysetter 
    def test_success_scoreler_categorysetter_mock(self):
        #scrapyer real internet access.
        self._scrapyer.target(self._url)
        
        #scoreler mock
        self._scoreler.analyze = Mock()
        self._scoreler.analyze.return_value = "searchengine"

        #categorysetter mock
        self._categorysetter.setData(self._relatedurls)
        self._categorysetter.do = Mock()
        self._categorysetter.return_value = 0
        self._categorysetter._getCategorizedUrls()             
 

        category = self._validator.do(self._scrapyer,self._scoreler,self._categorysetter)
        print self._validator.getDetail()
        self.assertEqual(category,"searchengine")

    #use mock in categorysetter 
    def test_success_categorysetter_mock(self):
        #scrapyer real internet access.
        self._scrapyer.target(self._url)

        #categorysetter mock
        self._categorysetter.setData(self._relatedurls)
        self._categorysetter.do = Mock()
        self._categorysetter.return_value = 0
        self._categorysetter._getCategorizedUrls()             
        #self._categorysetter.do()
        category = self._validator.do(self._scrapyer,self._scoreler,self._categorysetter)

        print self._validator.getDetail()
        self.assertEqual(category,"lang")

    #error
    #use mock in all classes
    def test_error_categoryvalidator(self):
        #scraper mock cause exception
        self._scrapyer.target(self._url)
        self._scrapyer.do = Mock()
        self._scrapyer.do.side_effect = Exception
        self._scrapyer.getRelatedUrl = Mock()
        self._scrapyer.getRelatedUrl.return_value = self._relatedurls
        #scoreler mock
        self._scoreler.analyze = Mock()
        self._scoreler.analyze.return_value = "searchengine"

        #categorysetter mock
        self._categorysetter.do = Mock()
        self._categorysetter.return_value = 0
        self._categorysetter._getCategorizedUrls()             
        
        self._categorysetter.setData(self._relatedurls)
        #self._categorysetter.do()
        with self.assertRaises(Exception) as cnmgr:
            self._validator.do(self._scrapyer,self._scoreler,self._categorysetter)
        print cnmgr.exception

    #use mock in scoreler and categorysetter 
    def test_error_categoryvalidator2(self):
        #scraper mock
        self._scrapyer.target(self._url)
        self._scrapyer.do = Mock()
        self._scrapyer.do.return_value = 0
        self._scrapyer.getRelatedUrl = Mock()
        self._scrapyer.getRelatedUrl.return_value = self._relatedurls
        #scoreler mock cause exception
        self._scoreler.analyze = Mock()
        self._scoreler.analyze.side_effect = Exception

        #categorysetter mock
        self._categorysetter.do = Mock()
        self._categorysetter.return_value = 0
        self._categorysetter._getCategorizedUrls()             
        
        self._categorysetter.setData(self._relatedurls)
        #self._categorysetter.do()
        with self.assertRaises(Exception) as cnmgr:
            self._validator.do(self._scrapyer,self._scoreler,self._categorysetter)
        print cnmgr.exception
Example #5
0
import re
import time
from crawler.apps.categorymatcher import CategorySetterExe,CategoryValidator
from crawler.apps.scoreler import StdScoreler
from crawler.apps.scrapyer import GoogSearchScrapyer
from crawler.apps.outputwriter import Url2JsonWriter
from crawler.config.crawlersetting import configureCrawler
#import contrib.JSONStreamWriter.JSONStreamWriter as JSONStreamWriter

GLOBAL_SETTINGS = configureCrawler()

if __name__ == "__main__":

    print u"main start!" 
    try:
        scraper        = GoogSearchScrapyer()
        categorysetter = CategorySetterExe()
        scoreler       = StdScoreler()
        writer         = Url2JsonWriter()
        exepath        = GLOBAL_SETTINGS["subprocess"]["name"]
        tmpdir         = GLOBAL_SETTINGS["directory"]["transaction"]
        result_json    = GLOBAL_SETTINGS["directory"]["jsonoutput"]
        no_cat_url_list = GLOBAL_SETTINGS["directory"]["noncategorizedurls"]
    except Exception as e:
        raise Exception

    i=0
    with open(self._no_cat_url_list) as fo:
        with JSONStreamWriter.ArrayWriter(result_json) as jstream:
            for url in fo:
                if url.strip()== "" : continue