def run_search(self): """ One-time Graph API search for historical collections """ self.running = True while self.running: # First, check to see if the thread has been set to shut down. If so, break thread_status = self.e.isSet() if thread_status: self.c.log('Collection thread set to shut down. Shutting down.', thread=self.thread) self.running = False break # Loop thru each term and queries the API for id in self.id_list: since = self.params['since'] until = self.params['until'] # Check to see if there's a last value less than until. If so, replace b/c got cut off # mid collection last time last = self.params['last'] if last: if until is None or last < until: until = last # Loop thru until the paging has finished paging_url = 'none' while paging_url is not None: try: if paging_url is not 'none': resp = requests.get(paging_url) resp = json.loads(resp.content) else: resp = self.fb.get_object_feed(id, since=since, until=until) # If there is not data and no more pages, the historical search has completed, so shut down if 'paging' not in resp.keys() or not resp['data']: self.c.log('Historical query collection completed for term: %s.' % str(id), thread=self.thread) paging_url = None pass # If there's data, process it elif resp['data']: self.on_data(resp['data']) # Set the paging url for the next loop thru paging_url = resp['paging']['next'] # On error, logs and records in DB except FacebookError as e: self.c.log('Query for term ID %s failed.' % str(id), thread=self.thread, level='error') now = time.strftime('%Y-%m-%d %H:%M:%S') self.project_db.update({'_id': ObjectId(self.collector_id)}, {'$push': {'error_codes': {'code': e[0]['code'], 'message': e[0]['message'], 'date': now}}}) self.c.log('Historical query collection completed for all terms. Shutting down.', thread=self.thread) db = DB() db.set_collector_status(self.c.project_id, self.collector_id, collector_status=0) self.running = False
class Controller(object): """ Controller - A class for controlling STACK processes. Calls the Process() class to start and stop STACK processes. """ def __init__(self, process, cmdline=False, home_dir='.', umask=0o22, verbose=1, **kwargs): self.db = DB() self.process = process self.cmdline = cmdline self.usage_message = 'controller collect|process|insert start|stop|restart project_id collector_id' self.home_dir = home_dir self.umask = umask self.verbose = verbose if self.cmdline is False: # Grab information from Flask user object self.project = kwargs['project'] self.project_id = self.project['project_id'] self.project_name = self.project['project_name'] else: # Command is coming from the command line, look up info self.project_id = kwargs['project_id'] resp = self.db.get_project_detail(self.project_id) if resp['status']: self.project_name = resp['project_name'] else: print('Project w/ ID %s not found!' % self.project_id) print('') print('USAGE: python %s %s' % (sys.argv[0], self.usage_message)) sys.exit(1) # Project account DB connection project_info = self.db.get_project_detail(self.project_id) configdb = project_info['project_config_db'] project_config_db = self.db.connection[configdb] self.projectdb = project_config_db.config # Loads info for process based on type: collector, processor, inserter if self.process in ['process', 'insert']: # Only module type needed for processor / inserter self.module = kwargs['network'] self.collector_id = None # Set name for worker based on gathered info self.process_name = self.project_name + '-' + self.process + '-' + self.module + '-' + self.project_id elif process == 'collect': # For collectors, also grabs: collector_id, api, collector_name self.collector_id = kwargs['collector_id'] resp = self.db.get_collector_detail(self.project_id, self.collector_id) if resp['status']: collector = resp['collector'] self.module = collector['network'] self.api = collector['api'] self.collector_name = collector['collector_name'] else: print('Collector (ID: %s) not found!' % self.collector_id) print('') print('USAGE: python %s %s' % (sys.argv[0], self.usage_message)) sys.exit(1) # Set name for worker based on gathered info self.process_name = self.project_name + '-' + self.collector_name + '-' + self.process + '-' + self.module + \ '-' + self.collector_id # Sets out directories self.piddir = app.config['LOGDIR'] + '/' + self.project_name + '-' + self.project_id + '/pid' self.logdir = app.config['LOGDIR'] + '/' + self.project_name + '-' + self.project_id + '/logs' self.stddir = app.config['LOGDIR'] + '/' + self.project_name + '-' + self.project_id + '/std' # Sets data dirs # TODO - deprecate w/ Facebook self.rawdir = app.config[ 'DATADIR'] + '/' + self.project_name + '-' + self.project_id + '/' + self.module + '/raw' self.archdir = app.config['DATADIR'] + '/' + self.project_name + '-' + self.project_id + '/' + self.module + \ '/archive' self.insertdir = app.config['DATADIR'] + '/' + self.project_name + '-' + self.project_id + '/' + self.module + \ '/insert_queue' # self.rawdir = "Users/harshita/Downloads/20200608-09-ClimateChange1-5ec8d268f57962092d668731\-5ec8d3cef57962093753ca79-tweets_out.json" # self.archdir = app.config['DATADIR'] + '/' + self.project_name + '-' + self.project_id + '/' + self.module + \ # '/archive' # self.insertdir = app.config['DATADIR'] + '/' + self.project_name + '-' + self.project_id + '/' + self.module + \ # '/insert_queue' # Creates dirs if they don't already exist if not os.path.exists(self.piddir): os.makedirs(self.piddir) if not os.path.exists(self.stddir): os.makedirs(self.stddir) # These directories only need be created for Twitter # TODO - deprecate w/ Facebook if self.module == 'twitter': if not os.path.exists(self.logdir): os.makedirs(self.logdir) if not os.path.exists(self.rawdir): os.makedirs(self.rawdir) if not os.path.exists(self.archdir): os.makedirs(self.archdir) if not os.path.exists(self.insertdir): os.makedirs(self.insertdir) # Sets outfiles self.pidfile = self.piddir + '/%s.pid' % self.process_name self.stdout = self.stddir + '/%s-stdout.txt' % self.process_name self.stderr = self.stddir + '/%s-stderr.txt' % self.process_name self.stdin = self.stddir + '/%s-stdin.txt' % self.process_name # Creates the std files for the daemon if not os.path.isfile(self.stdout): create_file = open(self.stdout, 'w') create_file.close() if not os.path.isfile(self.stdin): create_file = open(self.stdin, 'w') create_file.close() if not os.path.isfile(self.stderr): create_file = open(self.stderr, 'w') create_file.close() def process_command(self, cmd): """ Prases the passed command (start / stop / restart) and initiates daemonization """ # Makes sure the command is relevant if self.cmdline and cmd not in ['start', 'stop', 'restart']: print('Invalid command: %s' % cmd) print('') print('USAGE: python %s %s' % (sys.argv[0], self.usage_message)) sys.exit(1) elif cmd == 'start': self.start() elif cmd == 'stop': self.stop() elif cmd == 'restart': self.restart() else: print('USAGE: python %s %s' % (sys.argv[0], self.usage_message)) if self.cmdline: sys.exit(1) def start(self): """ Method that starts the daemon process """ print('Initializing the STACK daemon: %s' % self.process_name) # Sets flags for given process resp = '' if self.process == 'collect': resp = self.db.set_collector_status(self.project_id, self.collector_id, collector_status=1) elif self.process == 'process': resp = self.db.set_network_status(self.project_id, self.module, run=1, process=True) elif self.process == 'insert': resp = self.db.set_network_status(self.project_id, self.module, run=1, insert=True) if 'status' in resp and resp['status']: print('Flags set.') # Check to see if running based on pidfile pid = self.get_pid() if pid: message = "pidfile %s already exists. Is it already running?\n" sys.stderr.write(message % self.pidfile) sys.exit(1) # Start the daemon self.daemonize() self.run() else: print('Failed to successfully set flags, try again.') def stop(self): """ Method that sets flags and stops the daemon process """ print('Stop command received.') print('Step 1) Setting flags on the STACK process to stop.') if self.process == 'collect': # Set flags for the STACK process to stop resp = self.db.set_collector_status(self.project_id, self.collector_id, collector_status=0) # Grab active flag from collector's Mongo document collector_conf = self.projectdb.find_one({'_id': ObjectId(self.collector_id)}) active = collector_conf['active'] else: module_conf = self.projectdb.find_one({'module': self.module}) if self.process == 'process': resp = self.db.set_network_status(self.project_id, self.module, run=0, process=True) active = module_conf['processor_active'] else: resp = self.db.set_network_status(self.project_id, self.module, run=0, insert=True) active = module_conf['inserter_active'] # TODO - mongo error handling if resp['status']: print('Step 1 complete.') # If the daemon has already stopped, then set flags and break pid = self.get_pid() if not pid: print("STACK daemon already terminated.") # Extra clean up, just in case if os.path.exists(self.pidfile): os.remove(self.pidfile) if self.process in ['process', 'insert']: if self.process == 'process': self.projectdb.update({'module': self.module}, {'$set': {'processor_active': 0}}) else: self.projectdb.update({'module': self.module}, {'$set': {'inserter_active': 0}}) else: self.projectdb.update({'_id': ObjectId(self.collector_id)}, {'$set': {'active': 0}}) return # Step 2) Check for task / STACK process completion; loops through 15 times to check print('Step 2) Check for STACK process completion and shutdown the daemon.') wait_count = 0 while active == 1: wait_count += 1 if self.process in ['process', 'insert']: module_conf = self.projectdb.find_one({'module': self.module}) if self.process == 'process': active = module_conf['processor_active'] else: active = module_conf['inserter_active'] else: collector_conf = self.projectdb.find_one({'_id': ObjectId(self.collector_id)}) active = collector_conf['active'] print('Try %d / 15' % wait_count) print('Active Status: %d' % active) print('Trying again in 5 seconds.') print('') if wait_count > 15: break time.sleep(5) # Get the pid from the pidfile pid = self.get_pid() if not pid: print("Daemon successfully stopped via thread termination.") # Just to be sure. A ValueError might occur if the PID file is # empty but does actually exist if os.path.exists(self.pidfile): os.remove(self.pidfile) return # Not an error in a restart # Try killing the daemon process print('Daemon still running w/ loose thread. Stopping now...') try: i = 0 while 1: os.kill(pid, signal.SIGTERM) time.sleep(0.1) i = i + 1 if i % 10 == 0: os.kill(pid, signal.SIGHUP) except OSError as err: err = str(err) if err.find("No such process") > 0: if os.path.exists(self.pidfile): os.remove(self.pidfile) else: print(str(err)) sys.exit(1) # Had to kill the daemon, so set the active status flag accordingly. if self.process in ['process', 'insert']: if self.process == 'process': self.projectdb.update({'module': self.module}, {'$set': {'processor_active': 0}}) else: self.projectdb.update({'module': self.module}, {'$set': {'inserter_active': 0}}) else: self.projectdb.update({'_id': ObjectId(self.collector_id)}, {'$set': {'active': 0}}) print('Stopped.') def restart(self): """ Simple restart of the daemon """ # TODO - restart w/out shutting down daemon as part of extensible processor modules self.stop() self.start() def run(self): """ Calls the process logic scripts and runs """ # Backwards compatibility for older Twitter scripts if self.module == 'twitter': if self.process == 'collect': ThreadedCollector.go(self.api, self.project_id, self.collector_id, self.rawdir, self.logdir) elif self.process == 'process': preprocess.go(self.project_id, self.rawdir, self.archdir, self.insertdir, self.logdir) elif self.process == 'insert': mongoBatchInsert.go(self.project_id, self.rawdir, self.insertdir, self.logdir) # New approach via extensible collectors else: # Dynamically import collect from os.chdir(app.config['BASEDIR']) if self.process == 'collect': _temp = __import__('app.%s.collect' % self.module, globals(), locals(), ['Collector'], -1) Collector = _temp.Collector c = Collector(self.project_id, self.collector_id, self.process_name) c.go() elif self.process == 'process': _temp = __import__('app.%s.process' % self.module, globals(), locals(), ['Processor'], -1) Processor = _temp.Processor c = Processor(self.project_id, self.process_name, self.module) c.go() elif self.process == 'insert': _temp = __import__('app.%s.insert' % self.module, globals(), locals(), ['Inserter'], -1) Inserter = _temp.Inserter c = Inserter(self.project_id, self.process_name, self.module) c.go() def daemonize(self): """ Do the UNIX double-fork magic, see Stevens' "Advanced Programming in the UNIX Environment" for details (ISBN 0201563177) http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16 """ try: pid = os.fork() if pid > 0: # Exit first parent sys.exit(0) except OSError as e: sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror)) sys.exit(1) # Decouple from parent environment os.chdir(self.home_dir) os.setsid() os.umask(self.umask) # Do second fork try: pid = os.fork() if pid > 0: # Exit from second parent sys.exit(0) except OSError as e: sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror)) sys.exit(1) sys.stdout.flush() sys.stderr.flush() si = open(self.stdin, 'r+') so = open(self.stdout, 'a+') if self.stderr: se = open(self.stderr, 'a+') else: se = so if self.cmdline: os.dup2(si.fileno(), sys.stdin.fileno()) os.dup2(so.fileno(), sys.stdout.fileno()) os.dup2(se.fileno(), sys.stderr.fileno()) sys.stderr.flush() sys.stdout.flush() def sigtermhandler(signum, frame): self.daemon_alive = False signal.signal(signal.SIGTERM, sigtermhandler) signal.signal(signal.SIGINT, sigtermhandler) if self.verbose >= 1: print("Started") # Write pidfile atexit.register( self.delpid) # Make sure pid file is removed if we quit pid = str(os.getpid()) open(self.pidfile, 'w+').write("%s\n" % pid) def delpid(self): os.remove(self.pidfile) def get_pid(self): try: pf = open(self.pidfile, 'r') pid = int(pf.read().strip()) pf.close() except IOError: pid = None except SystemExit: pid = None return pid