Example No. 1
0
        # NOTE(review): fragment of a larger scraping-worker loop — the enclosing
        # function starts above this view and the `output = {` literal at the end
        # is cut off below it. `getUrl`, `EMPTY_REQUEST`, `scraper`, and
        # `compressHTML` are defined elsewhere in the file — presumably a
        # job-queue URL builder and a scraper object; confirm against the full file.
        # Send empty request to get first job
        url = getUrl(EMPTY_REQUEST)
        while True:
            # Logging time
            # Timestamp is only for the log line below, not for the output file.
            ts = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
            logging.info("[{}] URL = {}".format(ts, url))

            # Output file time
            # Unix epoch seconds; presumably used to name/label the output record.
            t = int(time())

            # Scrape and process data
            # Pre-set all three to None so the `all([...])` check below can tell
            # whether any stage failed.
            html, log, data = None, None, None
            try:
                html = scraper.processUrl(url)
                try:
                    # These two only run if the HTML fetch succeeded; a failure
                    # here is logged but leaves log/data as None.
                    log = scraper.processLog(url)
                    data = scraper.getQuestion(html)
                except Exception as e:
                    logging.error("Bad processing:")
                    logging.error(e)
            except Exception as e:
                logging.error("Unable to get HTML:")
                logging.error(e)

            # Any stage returning None (or an empty/falsy value) marks this
            # iteration as an error; the record is still written below if HTML
            # was retrieved.
            error = False
            if not all([html, log, data]):
                error = True
            if html:
                compressed_html = compressHTML(html)
                # Data sent written to disk
                # NOTE(review): dict literal truncated here in this view.
                output = {
Example No. 2
0
import selenium.common

# NOTE(review): top-level rescrape script fragment — the body of the final
# `with open(...)` at the bottom is cut off in this view. `QuoraScraper`,
# `os`, `gzip`, `binascii`, `hashlib`, and `StringIO` come from imports
# elsewhere in the file.
# Default log directory, overridable via environment variable.
basePath = '/export/a04/wpovell/'
if 'SCRAPE_LOG_BASEPATH' in os.environ:
    basePath = os.environ['SCRAPE_LOG_BASEPATH']

# `inp`: every URL ever scraped; `done`: URLs already rescraped.
# The work list below is the set difference of the two.
with open(os.path.join(basePath, 'scrapedUrls.txt')) as f:
    inp = set(f.read().strip().split('\n'))
with open(os.path.join(basePath, 'rescrapedUrls.txt')) as f:
    done = set(f.read().strip().split('\n'))
# Opened in append mode so progress survives restarts; presumably each
# finished URL is appended further down (past this view) — confirm.
doneF = open(os.path.join(basePath, 'rescrapedUrls.txt'), 'a')

qs = QuoraScraper()
for line in list(inp - done):
    try:
        ret = qs.processLog(line)
    except Exception:
        # Best-effort: report the failing URL and move on.
        print("BAD: {}".format(line))
        continue
    # Gzip the page HTML into an in-memory buffer.
    out = StringIO()
    try:
        with gzip.GzipFile(fileobj=out, mode="w") as f:
            f.write(ret['html'].encode('utf-8'))
    except TypeError:
        # Presumably raised when ret['html'] is None / not a str — confirm.
        print("BAD: {}".format(line))
        continue

    # Hex-encode the gzipped bytes so the payload is plain text on disk.
    compressed_html = out.getvalue()
    compressed_html = binascii.b2a_hex(compressed_html).decode('utf-8')
    # Output filename is the MD5 of the URL, giving a stable per-URL name.
    fn = hashlib.md5(line.encode('utf-8')).hexdigest()
    # NOTE(review): write body truncated here in this view.
    with open(os.path.join(basePath, 'logPages/{}'.format(fn)), 'w') as f:
Example No. 3
0
	# NOTE(review): interior of an unseen function (likely a main(args)) — the
	# `def` line is above this view; `args`, `json`, and `QuoraScraper` come
	# from the surrounding file.
	# Two scraper instances: s_nl is constructed with no credentials, s with
	# login arguments — presumably logged-out vs. logged-in sessions; confirm
	# against QuoraScraper's constructor.
	s_nl = QuoraScraper()
	s = QuoraScraper(True, args.EMAIL, args.PASS)

	# Input records, output sink (append), and the set of already-processed
	# file hashes loaded from the seen-file.
	f = open(args.read)
	o = open(args.output, 'a')
	with open(args.seen) as seenF:
		data = set(seenF.read().split('\n'))

	# Reopen the seen-file in append mode to record progress as we go.
	seen = open(args.seen, 'a')
	c = 0
	try:
		# Each input line is a JSON array: [fn, url, hasData, hasTime].
		for line in f.readlines():
			fn, url, hasData, hasTime = json.loads(line)
			# Skip records already handled or with no data to enrich.
			if fn in data or not hasData:
				continue
			# Backfill the missing time field via the logged-out scraper.
			if not hasTime:
				hasTime = s_nl.processLog(url)
			# Topics require the logged-in session.
			topics = s.getRealTopics(url)
			o.write(json.dumps((fn, url, hasTime, topics)) + '\n')
			# Record fn immediately so an interrupted run does not redo it.
			seen.write(fn + '\n')
			print(c)
			c += 1
	except KeyboardInterrupt:
		# Ctrl-C is a normal way to stop; fall through to cleanup.
		pass
	finally:
		# Release scraper resources (presumably browser/session teardown)
		# and flush/close all file handles even on interrupt.
		s_nl.close()
		s.close()

		f.close()
		o.close()
		seen.close()