Esempio n. 1
0
    def setUp(self):
        """Create the fixtures used by the tests of this case.

        Builds a ``CategoryValidator`` for a fixed search URL and transaction
        id, a ``GoogSearchScrapyer``, a ``StdScoreler`` and a
        ``CategorySetterExe`` whose input/output paths live under the test
        data directory.
        """
        self._url = 'https://www.google.co.jp/search?q=ruby'
        self._testdatadir = "./tests/data/"
        self._transaction_id = "transaction_test"
        self._validator = CategoryValidator(self._url, self._transaction_id)
        self._scrapyer = GoogSearchScrapyer()
        self._scoreler = StdScoreler()

        self._relatedurls = ["https://facebook.com", "https://instagram.com"]
        # CategorySetterExe fixture.
        # NOTE(review): "ls" looks like a stand-in for the real categorizer
        # executable -- confirm against CategorySetterExe.
        exepath = "ls"
        # Paths are built by plain concatenation, giving
        # "./tests/data/transaction_test1.scraped" / "...1.categorized".
        self._infile = self._testdatadir + self._transaction_id + "1.scraped"
        self._outfile = self._testdatadir + self._transaction_id + "1.categorized"
        self._categorysetter = CategorySetterExe(exepath, self._infile,
                                                 self._outfile)
        # Helper that removes every double-quote character from a string.
        rm_quoat = lambda val: re.sub(r'\"', '', val)
Esempio n. 2
0
    # Validate every URL listed in self._no_cat_url_list, one URL per line.
    # `i` counts the URLs processed without error; it is also used as a
    # suffix when building the per-URL temp-file names and transaction id.
    i = 0
    with open(self._no_cat_url_list) as fo:
        with JSONStreamWriter.ArrayWriter(result_json) as jstream:
            for url in fo:
                # Skip blank lines in the URL list.
                if url.strip() == "":
                    continue

                # Timestamp-based id, with "." replaced so it is one token.
                transaction_id = str(time.time()).replace(".", "_")
                try:
                    # Set up the category setter for this URL.
                    # NOTE(review): these paths use self._transaction_id and
                    # i+1, while the validator below uses the local
                    # transaction_id and i -- confirm this is intended.
                    infile = tmpdir + "/" + self._transaction_id + str(i + 1) + ".scraped"
                    outfile = tmpdir + "/" + self._transaction_id + str(i + 1) + ".categorized"
                    categorysetter = CategorySetterExe(exepath, infile, outfile)

                    # Validate the URL.
                    # NOTE(review): `url` still carries the trailing newline
                    # read from the file -- confirm CategoryValidator copes.
                    wkValidator = CategoryValidator(url, tmpdir + "/" + transaction_id + str(i))
                    category = wkValidator.do(scraper, scoreler, categorysetter)

                    # Output the result to a JSON-formatted file.
                    writer.output(result_json, wkValidator)
                    # Disabled alternative that streams through `jstream`:
                    #evt = fmter.Urls2Json(url,category,wkValidator.getDetail(),transaction_id)
                    #jstream.write(evt)

                    # Delete the temporary .scraped and .categorized files
                    # (marked "under construction" by the author). They are
                    # not removed when an earlier step in this try raises.
                    os.remove(infile)
                    os.remove(outfile)

                    i = i + 1
                except Exception as e:
                    # Best effort: report the failure and go on to the next
                    # URL. print(e) gives the same output on Python 2 and 3.
                    print(e)