def flush_row(db, bucket, items):
    """Insert buffered JSON lines into the collection named *bucket*.

    Args:
        db: a Mongo database (or any mapping of collection name -> object
            exposing ``insert_many``) -- assumed from the ``insert_many``
            call; confirm against the caller.
        bucket: name of the collection inside ``db``.
        items: mapping of key -> list of JSON-encoded strings.

    Returns:
        True always -- the flush is best-effort; failures are only logged.
    """
    try:
        col = db[bucket]
        # .items() works on both Py2 and Py3; iteritems() is Py2-only.
        for k, lines in items.items():
            parsed_lines = [json.loads(line) for line in lines]
            col.insert_many(parsed_lines)
    except Exception:
        # Narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; failure is logged, not raised.
        logger.error(full_stack())
    return True
def flush_file(output_folder, bucket, items):
    """Append buffered lines to per-key files under ``output_folder/bucket``.

    Args:
        output_folder: root output directory.
        bucket: sub-folder name under ``output_folder``.
        items: mapping of key (used as the filename) -> list of strings.

    Returns:
        True always -- best-effort; errors are logged, never raised.

    NOTE(review): the bucket folder is not created here -- if it does not
    exist, open() fails and the error is only logged; confirm the caller
    pre-creates it.
    """
    try:
        bucket_folder = os.path.abspath('%s/%s' % (output_folder, bucket))
        # .items() works on both Py2 and Py3; iteritems() is Py2-only.
        for k, lines in items.items():
            filename = os.path.abspath('%s/%s' % (bucket_folder, k))
            # 'a' (text append) instead of 'ab+': the payload is str and
            # writing str to a binary handle breaks on Python 3; the unused
            # '+' read capability is dropped.
            with open(filename, 'a') as f:
                for line in lines:
                    f.write('%s\n' % line)
            logger.debug("flushed %d lines to %s" % (len(lines), filename))
    except Exception:
        # Narrowed from a bare ``except:``.
        logger.error(full_stack())
    return True
def redistribute_crawler_queue(self, crawler_id):
    """Hand a failed crawler's pending queue to the remaining nodes.

    After redistributing, waits (up to 3 minutes, polling every minute)
    for the dead crawler process to finish flushing its data, then stamps
    the time the retry timer started. Unknown ids are only logged.
    """
    if crawler_id not in self.crawlers:
        logger.warn("whatever are you trying to do? crawler_id: [%s] is not valid..."%(crawler_id))
        return

    logger.warn('%s just failed... redistributing its workload'%(crawler_id))
    try:
        entry = self.crawlers[crawler_id]
        self.node_coordinator.distribute_to_nodes(entry['crawler_queue'])
        # Give the dying crawler up to 180s to flush everything it buffered.
        remaining = 180
        while entry['crawler'].is_alive() and remaining > 0:
            time.sleep(60)
            remaining -= 60
        entry['retry_timer_start_ts'] = int(time.time())
    except Exception as exc:
        logger.error(full_stack())
def flush_file(output_folder, bucket, items):
    """Append buffered lines to per-key files under ``output_folder/bucket``.

    Args:
        output_folder: root output directory.
        bucket: sub-folder name under ``output_folder``.
        items: mapping of key (used as the filename) -> list of strings.

    Returns:
        True always -- best-effort; errors are logged, never raised.

    NOTE(review): the bucket folder is not created here -- if it does not
    exist, open() fails and the error is only logged; confirm the caller
    pre-creates it.
    """
    try:
        bucket_folder = os.path.abspath("%s/%s" % (output_folder, bucket))
        # .items() works on both Py2 and Py3; iteritems() is Py2-only.
        for k, lines in items.items():
            filename = os.path.abspath("%s/%s" % (bucket_folder, k))
            # "a" (text append) instead of "ab+": the payload is str and
            # writing str to a binary handle breaks on Python 3; the unused
            # "+" read capability is dropped.
            with open(filename, "a") as f:
                for line in lines:
                    f.write("%s\n" % line)
            logger.debug("flushed %d lines to %s" % (len(lines), filename))
    except Exception:
        # Narrowed from a bare ``except:``.
        logger.error(full_stack())
    return True
# Script entry: parse CLI arguments, load the optional proxies file and the
# required config file, then run the server until interrupted.
# NOTE(review): no ``if __name__ == "__main__"`` guard is visible in this
# chunk -- it may exist just above; confirm before relying on import safety.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-c', '--config',
    help="config.json that contains a) twitter api keys; b) redis connection string;",
    required=True)
parser.add_argument('-p', '--proxies', help="the proxies.json file")
args = parser.parse_args()

proxies = None
if args.proxies:
    with open(os.path.abspath(args.proxies), 'r') as proxy_f:
        proxies = json.load(proxy_f)['proxies']

with open(os.path.abspath(args.config), 'r') as config_f:
    config = json.load(config_f)

try:
    start_server(config, proxies)
except KeyboardInterrupt:
    # Clean shutdown on Ctrl+C; redundant trailing ``pass`` removed.
    print()
    logger.error('You pressed Ctrl+C!')
except Exception as exc:
    # Last-resort boundary: log the error and the full traceback.
    # (Empty ``finally: pass`` removed -- it did nothing.)
    logger.error(exc)
    logger.error(full_stack())
# NOTE(review): `cmd` and `scheduler` are not defined at module level in this
# chunk -- this first statement is presumably the tail of an enclosing
# function whose header is outside this view; confirm before restructuring.
if cmd:
    scheduler.enqueue(cmd)

if __name__=="__main__":
    # Command-line interface: a config file is mandatory, proxies optional.
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help="config.json that contains a) twitter api keys; b) redis connection string;", required = True)
    parser.add_argument('-p', '--proxies', help="the proxies.json file")
    args = parser.parse_args()

    proxies = None
    if args.proxies:
        # 'rb' mode: fine on Py2; on Py3 json.load also accepts binary file
        # objects (it decodes the bytes itself).
        with open(os.path.abspath(args.proxies), 'rb') as proxy_f:
            proxies = json.load(proxy_f)['proxies']

    with open(os.path.abspath(args.config), 'rb') as config_f:
        config = json.load(config_f)

    # Run the server until interrupted; all failures are logged, not raised.
    try:
        start_server(config, proxies)
    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        pass
    except Exception as exc:
        logger.error(exc)
        logger.error(full_stack())
    finally:
        pass