Example #1
def flush_row(db, bucket, items):
    # json, logger and full_stack() are assumed to be provided by the surrounding module
    try:
        col = db[bucket]

        for k, lines in items.items():
            # each value is a list of JSON strings; parse them before the bulk insert
            parsed_lines = [json.loads(line) for line in lines]
            col.insert_many(parsed_lines)

    except Exception:
        logger.error(full_stack())

    return True
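If the expected shapes are unclear, here is a minimal, hypothetical way flush_row might be driven with pymongo; the client URI, database name, bucket name, and sample documents are assumptions for illustration, not part of the original project:

import json
import logging

from pymongo import MongoClient

logger = logging.getLogger(__name__)

# hypothetical buffered input: each key maps to a list of JSON strings
items = {
    'partition-0': ['{"id": 1, "text": "hello"}', '{"id": 2, "text": "world"}'],
}

client = MongoClient('mongodb://localhost:27017')  # assumes a local MongoDB instance
flush_row(client['tweets_db'], 'timeline', items)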
Example #2
def flush_file(output_folder, bucket, items):
    try:
        # the per-bucket folder is assumed to already exist
        bucket_folder = os.path.abspath(os.path.join(output_folder, bucket))

        for k, lines in items.items():
            filename = os.path.abspath(os.path.join(bucket_folder, k))
            # append in text mode; each buffered line goes on its own line
            with open(filename, 'a') as f:
                for line in lines:
                    f.write('%s\n' % line)

            logger.debug("flushed %d lines to %s" % (len(lines), filename))

    except Exception:
        logger.error(full_stack())

    return True
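Purely illustrative as well: a sketch of how flush_file might be invoked; the output folder, bucket, and buffered lines are placeholders, and the per-bucket folder is created up front here because the function itself only opens files inside it:

import logging
import os

logger = logging.getLogger(__name__)

output_folder = '/tmp/crawler_output'                        # placeholder path
os.makedirs(os.path.join(output_folder, 'timeline'), exist_ok=True)

items = {
    'user_123': ['{"id": 1}', '{"id": 2}'],                  # one output file per key
}
flush_file(output_folder, 'timeline', items)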
Example #3
def redistribute_crawler_queue(self, crawler_id):
    if crawler_id in self.crawlers:
        logger.warning('%s just failed... redistributing its workload' % crawler_id)
        try:
            self.node_coordinator.distribute_to_nodes(self.crawlers[crawler_id]['crawler_queue'])

            # wait until the crawler process dies (i.e. has flushed all of its data),
            # checking once a minute for at most three minutes
            wait_timer = 180
            while self.crawlers[crawler_id]['crawler'].is_alive() and wait_timer > 0:
                time.sleep(60)
                wait_timer -= 60

            self.crawlers[crawler_id]['retry_timer_start_ts'] = int(time.time())
        except Exception:
            logger.error(full_stack())
    else:
        logger.warning("what are you trying to do? crawler_id: [%s] is not valid..." % crawler_id)
Example #5
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--config',
        help="config.json that contains a) twitter api keys; b) redis connection string;",
        required=True)
    parser.add_argument('-p', '--proxies', help="the proxies.json file")

    args = parser.parse_args()

    proxies = None
    if args.proxies:
        with open(os.path.abspath(args.proxies), 'r') as proxy_f:
            proxies = json.load(proxy_f)['proxies']

    with open(os.path.abspath(args.config), 'r') as config_f:
        config = json.load(config_f)

        try:
            start_server(config, proxies)
        except KeyboardInterrupt:
            print()
            logger.error('You pressed Ctrl+C!')
        except Exception as exc:
            logger.error(exc)
            logger.error(full_stack())
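For completeness, the only structural requirement the script places on proxies.json is a top-level "proxies" key (it is read via json.load(proxy_f)['proxies'] above); the entries written here are made-up placeholders, not a format the project documents:

import json

example_proxies = {
    "proxies": ["http://127.0.0.1:8080", "http://127.0.0.1:8081"]  # placeholder entries
}
with open("proxies.json", "w") as f:
    json.dump(example_proxies, f, indent=2)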