Example 1
import re
import unittest

from crawler.apps.categorymatcher import CategorySetterExe, CategoryValidator
from crawler.apps.scoreler import StdScoreler
from crawler.apps.scrapyer import GoogSearchScrapyer


class CategoryMatcherTest(unittest.TestCase):  # class name assumed; only setUp() appears in the snippet

    def setUp(self):
        self._url = 'https://www.google.co.jp/search?q=ruby'
        self._testdatadir = "./tests/data/"
        self._transaction_id = "transaction_test"
        self._validator = CategoryValidator(self._url, self._transaction_id)
        self._scrapyer = GoogSearchScrapyer()
        self._scoreler = StdScoreler()

        self._relatedurls = ["https://facebook.com", "https://instagram.com"]
        # CategorySetterExe fixture: "ls" stands in for the real external
        # command so the test can run without the production binary.
        exepath = "ls"
        self._infile = self._testdatadir + self._transaction_id + "1.scraped"
        self._outfile = self._testdatadir + self._transaction_id + "1.categorized"
        self._categorysetter = CategorySetterExe(exepath, self._infile,
                                                 self._outfile)
        # Helper to strip double quotes from a value.
        rm_quote = lambda val: re.sub(r'"', '', val)
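The excerpt ends after setUp(), so none of these fixtures are actually exercised. Below is a minimal sketch of a follow-on test method, assuming the class wrapper above; the method name, CategorySetterExe's run() call, and the output-file assertion are illustrative guesses, not part of the original suite.

    # Hypothetical test method: run() is an assumed API, not shown in the source.
    def test_categorysetter_writes_outfile(self):
        self._categorysetter.run()
        self.assertTrue(os.path.exists(self._outfile))  # requires: import os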
Example 2
import re
import time
from crawler.apps.categorymatcher import CategorySetterExe, CategoryValidator
from crawler.apps.scoreler import StdScoreler
from crawler.apps.scrapyer import GoogSearchScrapyer
from crawler.apps.outputwriter import Url2JsonWriter
from crawler.config.crawlersetting import configureCrawler
# JSONStreamWriter is needed below for the streaming JSON output.
import contrib.JSONStreamWriter.JSONStreamWriter as JSONStreamWriter

GLOBAL_SETTINGS = configureCrawler()
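# Shape implied by the lookups in the try-block below; the concrete values
# here are illustrative assumptions, not taken from crawlersetting:
#   GLOBAL_SETTINGS = {
#       "subprocess": {"name": "/path/to/categorizer"},
#       "directory": {"transaction": "/tmp/crawler",
#                     "jsonoutput": "result.json",
#                     "noncategorizedurls": "no_category_urls.txt"},
#   }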

if __name__ == "__main__":

    print u"main start!"
    try:
        scraper        = GoogSearchScrapyer()
        categorysetter = CategorySetterExe()
        scoreler       = StdScoreler()
        writer         = Url2JsonWriter()
        exepath        = GLOBAL_SETTINGS["subprocess"]["name"]
        tmpdir         = GLOBAL_SETTINGS["directory"]["transaction"]
        result_json    = GLOBAL_SETTINGS["directory"]["jsonoutput"]
        no_cat_url_list = GLOBAL_SETTINGS["directory"]["noncategorizedurls"]
    except Exception:
        # Re-raise the original error; raising a fresh bare Exception would
        # discard the message and traceback.
        raise

    i = 0
    with open(no_cat_url_list) as fo, \
            JSONStreamWriter.ArrayWriter(result_json) as jstream:
        for url in fo:
            # Skip blank lines in the URL list.
            if url.strip() == "":
                continue
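The loop body is truncated at this point in the source. Purely as a sketch of where it is headed, given the objects built above, the continuation might look like the following; every method name here (scrape, run, score, write_record) is an assumption, since none of these APIs appear in the excerpt.

            # Hypothetical continuation: all APIs below are assumed, not shown
            # in the original source.
            url = url.strip()
            scraped     = scraper.scrape(url)                 # assumed scrapyer API
            categorized = categorysetter.run(scraped)         # assumed categorymatcher API
            scored      = scoreler.score(categorized)         # assumed scoreler API
            jstream.write(writer.write_record(url, scored))   # assumed writer API
            i += 1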