Example #1
from pyquery import PyQuery as pq
from lxml.html import tostring

from Quora.QuoraScraper import QuoraScraper

def parseLogPage(html):
    '''Parse a Quora question-log page into a list of revision entries.'''
    qs = QuoraScraper()
    print("Parsing html")
    parsed = pq(html)
    entries = []
    for i in parsed.find('.QuestionLog')[0].getchildren()[:-1]:
        top, bottom = i.getchildren()[0].getchildren()
        if 'AddQuestionRedirectOperationView' in top.attrib['class']:
            text = dehtml(''.join(
                map(tostring,
                    top.getchildren()[0].getchildren()[:2])))
            user = top.getchildren()[0].getchildren()[3].attrib['href']
            actionType = "MERGED"
        else:
            text = None
            if len(top) > 1:
                text = dehtml(tostring(top[1]))

            actionType = top.getchildren()[0].text
            userElm = top.getchildren()[0].getchildren()
            if len(userElm) > 1:
                user = top.getchildren()[0].getchildren()[1].attrib['href']
            else:
                user = None

        # split "<revision> • <date>" on the bullet separator
        revision, date = bottom.text_content().split(u' \xe2\x80\xa2 ')
        entry = {
            "date": qs.processDate(date),
            "revision": revision,
            "user": user,
            "actionType": actionType,
            "text": text
        }
        entries.append(entry)
    return entries
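A minimal usage sketch for parseLogPage(); the saved-page filename below is an assumption, since the function only needs the raw HTML of a question's revision-log page.

# Hypothetical usage: parse a previously saved question-log page.
with open('question_log.html') as f:
    log_html = f.read()
for entry in parseLogPage(log_html):
    print(entry['date'], entry['actionType'], entry['user'])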
Example #2
import json
from binascii import a2b_hex
from gzip import GzipFile
from StringIO import StringIO

from Quora.QuoraScraper import QuoraScraper

def reparse(fn):
    '''Reparses question from stored HTML and rewrites the entry file'''
    with open(fn) as f:
        entry = json.load(f)
    # The stored HTML is hex-encoded gzip: decode, decompress, then reparse.
    html_data = GzipFile(fileobj=StringIO(a2b_hex(entry['html']))).read()
    entry['data'] = QuoraScraper.getQuestion(html_data)
    with open(fn, 'w') as f:
        json.dump(entry, f)
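A hedged batch driver for reparse(); the 'data' directory and the .json suffix are assumptions about how the entries are laid out on disk.

# Hypothetical batch reparse over stored entries (directory layout assumed).
import os
for name in os.listdir('data'):
    if name.endswith('.json'):
        reparse(os.path.join('data', name))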
Example #3
    # An empty string must be used here: binding to localhost or the
    # hostname prevents remote clients from connecting.
    HOST = ''
    PORT = args.PORT

    logging.info("Starting server on {}:{}".format(HOST, PORT))
    server = socketserver.TCPServer((HOST, PORT), ScrapeServer)

    ### ADD GLOBAL DATA TO SERVER INSTANCE ###

    # Seen links
    server.directory = {}
    # Queue
    server.urls_to_use = set()
    # Seeder
    server.urlGen = QuoraScraper.getQuestionPage()

    # Load previously scraped links
    if os.path.isfile(DIRECTORY_FILE):
        with open(DIRECTORY_FILE) as f:
            data = f.read().strip().split('\n')
            server.directory = {}
            for entry in data:
                entry = json.loads(entry)
                key = list(entry.keys())[0]
                server.directory[key] = entry[key]
    else:
        f = open(DIRECTORY_FILE, 'w')
        f.close()

    # Load previously saved seed links
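The load loop above expects DIRECTORY_FILE to contain one single-key JSON object per line; the sketch below mirrors that format when appending. The helper name and what the value holds are assumptions, not part of the source.

def recordSeen(url, value):
    # Hypothetical helper: append one {url: value} JSON object per line,
    # matching the format parsed by the load loop above.
    with open(DIRECTORY_FILE, 'a') as f:
        f.write(json.dumps({url: value}) + '\n')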
Example #4
import binascii
import gzip
import os
import urllib2
from StringIO import StringIO

import selenium.common

from Quora.QuoraScraper import QuoraScraper

basePath = '/export/a04/wpovell/'
if 'SCRAPE_LOG_BASEPATH' in os.environ:
    basePath = os.environ['SCRAPE_LOG_BASEPATH']

with open(os.path.join(basePath, 'scrapedUrls.txt')) as f:
    inp = set(f.read().strip().split('\n'))
with open(os.path.join(basePath, 'rescrapedUrls.txt')) as f:
    done = set(f.read().strip().split('\n'))
doneF = open(os.path.join(basePath, 'rescrapedUrls.txt'), 'a')

qs = QuoraScraper()
for line in list(inp - done):
    try:
        ret = qs.processLog(line)
    except Exception:
        print("BAD: {}".format(line))
        continue
    out = StringIO()
    try:
        with gzip.GzipFile(fileobj=out, mode="w") as f:
            f.write(ret['html'].encode('utf-8'))
    except TypeError:
        print("BAD: {}".format(line))
        continue

    compressed_html = out.getvalue()
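The fragment stops after building compressed_html. Because Examples 2 and 6 read stored HTML back with a2b_hex followed by GzipFile, the write side presumably hex-encodes the gzip bytes; the sketch below shows only that assumed step, and the surrounding storage code is not shown in the source.

# Hypothetical continuation: hex-encode the gzipped page so it can round-trip
# through a2b_hex + GzipFile as in Examples 2 and 6, then mark the URL done.
hex_html = binascii.b2a_hex(compressed_html)
doneF.write(line + '\n')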
Example #5
                        type=int,
                        default=7,
                        help="how long to wait between requests")
    args = parser.parse_args()

    HOST = args.HOST
    PORT = args.PORT

    # Directory to write output files to
    if args.output is None:
        OUTPUT_DIRECTORY = "data"
    else:
        OUTPUT_DIRECTORY = args.output[0]

    scraper = QuoraScraper(wait=args.wait)
    logging.info("Connecting to {} on port {}".format(HOST, PORT))
    try:
        # Send empty request to get first job
        url = getUrl(EMPTY_REQUEST)
        while True:
            # Logging time
            ts = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
            logging.info("[{}] URL = {}".format(ts, url))

            # Output file time
            t = int(time())

            # Scrape and process data
            html, log, data = None, None, None
            try:
Example #6
    if args.m:
        toGetTimes = open(args.m, 'w')

    files = getFiles(INPUT_DIR)
    files = binFiles(files)

    for fileHash, fileList in files.items():
        print(fileHash)
        outPath = '{}/{}/{}'.format(OUTPUT_DIR, fileHash[0], fileHash[1])
        if not os.path.isdir(outPath):
            os.makedirs(outPath)
        tarf = tarfile.open('{}/{}.tar.gz'.format(outPath, fileHash[2]),
                            'w:gz')
        for fn, fullHash in fileList:
            with open(fn) as f:
                data = json.load(f)
            if 'data' not in data:
                continue
            t = data['time']
            html = data['html']
            html = a2b_hex(html)
            strFile = StringIO(html)
            html = GzipFile(fileobj=strFile).read()
            info = QuoraScraper.getQuestion(html, t)
            data['data'] = info
            if args.m and "log" not in data:
                toGetTimes.write(fn + '\n')
            createEntry(fullHash, data, tarf)
        tarf.close()
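A read-back sketch for the archives written above; the archive path and whatever createEntry() stored inside the tar are not shown in the source, so both are assumptions.

# Hypothetical read-back of one of the .tar.gz bundles produced above.
import tarfile

with tarfile.open('out/a/b/cdef.tar.gz', 'r:gz') as tarf:
    for member in tarf.getmembers():
        if member.isfile():
            raw = tarf.extractfile(member).read()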
Example #7
import urllib2

from Quora.QuoraScraper import QuoraScraper

if __name__ == '__main__':
	import argparse

	parser = argparse.ArgumentParser(description='Complete previously scraped data.')
	parser.add_argument('EMAIL', help='Quora account email.')
	parser.add_argument('PASS', help='Quora account password.')
	parser.add_argument('-r', '--read', default="fileInfo.txt", help='File to read data info from.')
	parser.add_argument('-o', '--output', default="completeData.txt", help="File to write results to.")
	parser.add_argument('-s', '--seen', default="seenData.txt", help="File to write seen files to.")
	args = parser.parse_args()

	s_nl = QuoraScraper()
	s = QuoraScraper(True, args.EMAIL, args.PASS)

	f = open(args.read)
	o = open(args.output, 'a')
	with open(args.seen) as seenF:
		data = set(seenF.read().split('\n'))

	seen = open(args.seen, 'a')
	c = 0
	try:
		for line in f.readlines():
			fn, url, hasData, hasTime = json.loads(line)
			if fn in data or not hasData:
				continue
			if not hasTime: