def setUp(self):
    """Build the fixtures shared by every test in this case.

    Creates a CategoryValidator for a fixed Google search URL, a scraper,
    a scoreler, a pair of related URLs, and a CategorySetterExe wired to
    per-transaction temp files under ./tests/data/.
    """
    self._url = 'https://www.google.co.jp/search?q=ruby'
    self._testdatadir = "./tests/data/"
    self._transaction_id = "transaction_test"
    self._validator = CategoryValidator(self._url, self._transaction_id)
    self._scrapyer = GoogSearchScrapyer()
    self._scoreler = StdScoreler()
    self._relatedurls = ["https://facebook.com", "https://instagram.com"]
    # CategorySetterExe fixture: "ls" is a harmless stand-in executable so the
    # subprocess wrapper can be constructed without the real categorizer binary.
    exepath = "ls"
    self._infile = self._testdatadir + self._transaction_id + "1.scraped"
    self._outfile = self._testdatadir + self._transaction_id + "1.categorized"
    self._categorysetter = CategorySetterExe(exepath, self._infile, self._outfile)
    # NOTE(review): removed dead local `rm_quoat = lambda val: re.sub(r'\"', '', val)`
    # — it was never used and was discarded when setUp returned. If quote stripping
    # is needed by tests, it should be promoted to a helper on the class instead.
import time
from crawler.apps.categorymatcher import CategorySetterExe,CategoryValidator
from crawler.apps.scoreler import StdScoreler
from crawler.apps.scrapyer import GoogSearchScrapyer
from crawler.apps.outputwriter import Url2JsonWriter
from crawler.config.crawlersetting import configureCrawler
#import contrib.JSONStreamWriter.JSONStreamWriter as JSONStreamWriter

# Crawler configuration loaded once at import time.
GLOBAL_SETTINGS = configureCrawler()

if __name__ == "__main__":
    # Python 2 print statement with a unicode literal — this module targets Python 2.
    print u"main start!"
    try:
        # Wire up the pipeline components and pull paths from the loaded settings.
        scraper = GoogSearchScrapyer()
        # NOTE(review): constructed with no arguments here, but elsewhere
        # CategorySetterExe is called with (exepath, infile, outfile) — confirm
        # a zero-argument constructor exists.
        categorysetter = CategorySetterExe()
        scoreler = StdScoreler()
        writer = Url2JsonWriter()
        exepath = GLOBAL_SETTINGS["subprocess"]["name"]
        tmpdir = GLOBAL_SETTINGS["directory"]["transaction"]
        result_json = GLOBAL_SETTINGS["directory"]["jsonoutput"]
        no_cat_url_list = GLOBAL_SETTINGS["directory"]["noncategorizedurls"]
    except Exception as e:
        # NOTE(review): `raise Exception` re-raises the bare Exception class and
        # discards the caught error `e` (and its traceback). A plain `raise`
        # would preserve the original failure.
        raise Exception
    i=0
    # NOTE(review): `self` is undefined at module level — this line will raise
    # NameError; presumably the local `no_cat_url_list` read above was intended.
    with open(self._no_cat_url_list) as fo:
        # NOTE(review): the JSONStreamWriter import is commented out above, so
        # this name is unresolved at runtime — NameError unless provided elsewhere.
        with JSONStreamWriter.ArrayWriter(result_json) as jstream:
            for url in fo:
                # Skip blank lines in the URL list; the rest of the loop body
                # continues beyond this excerpt.
                if url.strip()== "" :
                    continue