def download_media(self, url, f_name, size):
    # set format
    url = "{}:{}".format(url, size)
    try:
        urllib.request.urlretrieve(url, f_name)
    except Exception as e:
        report_error(self.logger, exception=True)
def get_readme():
    plugs = {}
    try:
        response = urlopen(
            u"https://api.github.com/repos/Dan-in-CA/SIP_plugins/readme"
        )
        data = response.read()
        d = json.loads(data.decode('utf-8'))
        text = base64.b64decode(d[u"content"]).decode(u'utf-8')
        t_list = text.split()
        sep = [i for i, s in enumerate(t_list) if u"***" in s][0]
        plug_list = t_list[sep + 1:]
        breaks = [i for i, s in enumerate(plug_list) if u"---" in s]
        for i in range(len(breaks)):
            if i < len(breaks) - 1:
                plugs[plug_list[breaks[i] - 1]] = u" ".join(
                    plug_list[breaks[i] + 1:breaks[i + 1] - 1]
                )
            else:
                plugs[plug_list[breaks[i] - 1]] = u" ".join(plug_list[breaks[i] + 1:])
    except IOError as e:
        report_error(u"Could not get readme file from GitHub", e)
    return plugs
def predict(self, texts):
    # run prediction in batches
    num_docs = len(texts)
    output = []
    for i in range(0, num_docs, self.batch_size):
        texts_slice = texts[i:(i + self.batch_size)]
        texts_slice = self.preprocess_text(texts_slice)
        resp = self.sagemaker.predict(self.endpoint_name, {'text': texts_slice})
        status_code = resp['ResponseMetadata']['HTTPStatusCode']
        if status_code != 200:
            report_error(
                logger,
                msg=f'Prediction on endpoint {self.endpoint_name} unsuccessful.'
            )
        preds = json.loads(resp['Body'].read())['predictions']
        _output = [{
            'labels': _pred['labels'],
            'probabilities': _pred['probabilities']
        } for _, _pred in zip(texts_slice, preds)]
        output.extend(_output)
    label_vals = [
        self.labels_to_int(_output['labels']) for _output in output
    ]
    if all(label_vals):
        output = [{
            'label_vals': _label_vals,
            **_output
        } for _output, _label_vals in zip(output, label_vals)]
    return output
def update(self, tweet_id, user_id):
    """Track the fact that user user_id classified tweet_id.

    This method updates the score of the item in the priority queue
    and adds the user_id to the tweet's Redis set.

    :param tweet_id:
    :param user_id:
    """
    if not self.pq:
        report_error(self.logger, msg='Priority queue does not exist. Aborting.')
        return
    if not self.pq.exists(tweet_id):
        # This may happen relatively often when multiple people are working on the same tweet
        report_error(
            self.logger,
            msg='Key {} does not exist anymore. Aborting.'.format(tweet_id),
            level='warning')
        return
    # Change priority in queue
    self.pq.increment_priority(tweet_id)
    # Remove from priority queue once the score reaches the threshold
    score = self.pq.get_score(tweet_id)
    if score >= self.priority_threshold:
        self.remove(tweet_id)
        return
    # Add user to the set of users who have seen tweet_id
    self.rset.add(tweet_id, user_id)
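# A hedged usage sketch of how these TweetIdQueue methods fit together. The
# constructor and the get/update calls mirror those used elsewhere in this code;
# the project name, tweet ID and user ID below are placeholders.
tid = TweetIdQueue('my_project')

tweet_id = tid.get(user_id='user_42')  # fetch a tweet this user has not classified yet
if tweet_id is not None:
    # ... the user submits a classification ...
    tid.update(tweet_id, 'user_42')    # bump the tweet's priority and remember the user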
def test_connection(self):
    test = self.redis_conn.ping()
    if test:
        self.logger.info('Successfully connected to Redis host {}'.format(
            app.config['REDIS_HOST']))
    else:
        report_error(
            self.logger,
            msg='FAILURE: Connection to Redis host {} not successful'.format(
                app.config['REDIS_HOST']))
    return test
def download_file(self, local_path, key):
    try:
        self._s3_client.download_file(self.bucket, key, local_path)
    except Exception as e:
        report_error(logger, exception=True)
        return False
    else:
        return True
def test_rollbar():
    d = {}
    try:
        d['missing_key']
    except KeyError:
        report_error(logger, exception=True)
    report_error(logger, msg='Test: report an arbitrary error message')
    return 'error reported'
def upload_to_s3(self, content, key):
    try:
        self._s3_client.put_object(Body=content, Bucket=self.bucket, Key=key)
    except Exception as e:
        report_error(logger, exception=True)
        return False
    else:
        return True
def file_exists(self, key):
    try:
        self._s3_client.head_object(Bucket=self.bucket, Key=key)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            return False
        else:
            report_error(logger, exception=True)
            return False
    else:
        return True
def parse_manifest(plugin):
    try:
        with open(u"plugins/manifests/" + plugin + u".manifest") as mf:
            mf_list = mf.readlines()
            sep = [i for i, s in enumerate(mf_list) if u"###" in s][0]
            desc = u"".join(mf_list[:sep]).rstrip()
            f_list = [line.strip() for line in mf_list[sep + 2:]]
        return (desc, f_list)
    except IOError as e:
        report_error(u"parse_manifest IOError", e)
        return (u"", [])
def remove_from_pq(project):
    """Remove a tweet which is now private"""
    data = request.get_json()
    logger.debug('Incoming request with data {}'.format(data))
    if data is None or 'tweet_id' not in data:
        report_error(logger, msg='No tweet_id was passed when updating')
        return Response(None, status=400, mimetype='text/plain')
    logger.info(f"Removing {data['tweet_id']} for project {project}")
    tid = TweetIdQueue(project)
    tid.remove(data['tweet_id'])
    return Response('Successfully removed.', status=200, mimetype='text/plain')
def add_to_pq(project):
    """Update priority score in queue and remember that a user has already classified a tweet"""
    data = request.get_json()
    logger.debug('Incoming request with data {}'.format(data))
    if data is None or 'user_id' not in data or 'tweet_id' not in data:
        report_error(logger, msg='No user_id or tweet_id was passed when updating')
        return Response(None, status=400, mimetype='text/plain')
    logger.info(f"Adding {data['tweet_id']} to project {project} for user {data['user_id']}")
    tid = TweetIdQueue(project)
    tid.update(data['tweet_id'], data['user_id'])
    return Response('Update successful.', status=200, mimetype='text/plain')
def test_connection(self):
    test = self._r.ping()
    if test:
        logger.info(
            f'Successfully connected to Redis host {self.host}:{self.port}'
        )
    else:
        report_error(
            logger,
            msg=f'FAILURE: Connection to Redis host {self.host}:{self.port} not successful'
        )
    return test
def test_connection(self):
    """test_connection"""
    test = self.es.ping()
    if test:
        logger.info(
            'Successfully connected to Elasticsearch host {}'.format(
                self.config['ELASTICSEARCH_HOST']))
    else:
        report_error(
            logger,
            msg='Connection to Elasticsearch host {} not successful!'.format(
                self.config['ELASTICSEARCH_HOST']))
    return test
def upload_file(self, local_path, key, make_public=False):
    extra_args = None
    if make_public:
        extra_args = {'ACL': 'public-read'}
    try:
        self._s3_client.upload_file(local_path, self.bucket, key, ExtraArgs=extra_args)
    except Exception as e:
        report_error(logger, exception=True)
        return False
    else:
        return True
def put_template(self, template_path, template_name):
    """Put template to ES"""
    # read template file
    if not os.path.exists(template_path):
        report_error(
            logger,
            msg='No template file found under {}'.format(template_path))
        return
    with open(template_path, 'r') as f:
        template = json.load(f)
    res = self.es.indices.put_template(template_name, body=template, include_type_name=True)
    logger.info("Template {} added to Elasticsearch".format(template_path))
def on_error(self, status_code):
    if status_code in ERROR_CODES:
        msg = 'Error {}: {} {}'.format(
            status_code, ERROR_CODES[status_code]['text'],
            ERROR_CODES[status_code]['description'])
        report_error(self.logger, msg=msg)
        if status_code == 420:
            self.logger.info('Waiting for a bit...')
            self.rate_error_count += 1
            # wait at least 15min
            time.sleep(self.rate_error_count * 15 * 60)
    else:
        report_error(
            self.logger,
            msg='Received unknown error code {}'.format(status_code))
    return True  # To continue listening
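# For reference, on_error assumes an ERROR_CODES lookup mapping status codes to a
# short text and a description. The entries below are only an illustrative sketch
# of that shape, not the project's actual table.
ERROR_CODES = {
    420: {
        'text': 'Enhance Your Calm',
        'description': 'The client is being rate limited for making too many requests.'
    },
    503: {
        'text': 'Service Unavailable',
        'description': 'The servers are up but overloaded with requests.'
    },
}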
def bulk_actions_in_batches(self, actions, batch_size=1000):
    num_actions = len(actions)
    logger.info(f'Processing {num_actions:,} bulk actions...')
    for i in range(0, num_actions, batch_size):
        batch = actions[i:(i + batch_size)]
        num_actions_in_batch = len(batch)
        try:
            self.bulk_action(batch)
        except Exception:
            logger.error(
                f'Elasticsearch failed to process batch of {num_actions_in_batch:,} actions'
            )
            report_error(logger, exception=True)
            return False
        else:
            logger.info(
                f'Successfully processed batch of {num_actions_in_batch:,} actions'
            )
    return True
def get(self, user_id=None):
    """Get new tweet ID to classify for user ID"""
    # If no user is defined, simply pop the queue
    if user_id is None:
        tweet_id = self.pq.pop()
        if tweet_id is None:
            report_error(self.logger, msg='Queue is empty')
            return None
        else:
            return tweet_id
    else:
        tweet_id = self.retrieve_for_user(user_id)
        if tweet_id is None:
            report_error(
                self.logger,
                msg='No new tweet could be found for user_id {}'.format(user_id))
        else:
            return tweet_id
def index_tweet(self, tweet, index_name):
    """Index new tweet in index name given by tweet['project'].
    Will not re-index already existing doc with same ID.

    :tweet: tweet to index
    """
    try:
        self.es.index(index=index_name, id=tweet['id'], doc_type='tweet',
                      body=tweet, op_type='create')
    except elasticsearch.ConflictError as e:
        # This usually happens when a document with the same ID already exists.
        logger.warning('Conflict Error')
    except elasticsearch.TransportError as e:
        report_error(logger, exception=True)
    else:
        logger.debug('Tweet with id {} sent to index {}'.format(
            tweet['id'], index_name))
def get_permissions():
    global installed
    try:
        permissions = []
        files = subprocess.check_output([u"ls", u"plugins"])
        files = files.decode(u'utf-8')  # to unicode string
        installed = [f for f in list(files.split(u"\n")) if re.match(r"[^_].+\.py$", f)]
        pm = installed.index(u"plugin_manager.py")
        del installed[pm]  # Remove this plugin from list
        for p in installed:
            mod = subprocess.check_output([u"stat", u"-c %a", u"plugins/" + p])
            mod = mod.decode(u'utf-8')
            permissions.append(int(list(mod.strip())[1]) % 2)
        settings = dict(list(zip(installed, permissions)))
        return settings
    except IOError as e:
        report_error(u"get_permissions IOError", e)
        settings = {}
        return settings
def main():
    """Here we instantiate the stream manager, listener and connect to the Twitter streaming API."""
    global stream
    # setting things up...
    logger = logging.getLogger('stream')
    listener = Listener()
    auth = get_auth()
    # wait for a bit before connecting, in case container will be paused
    logger.debug('Streaming container is ready, sleeping for a bit...')
    time.sleep(10)
    time_last_error = 0
    error_count_last_hour = 0
    while run:
        logger.debug('Trying to connect to Twitter API...')
        stream = StreamManager(auth, listener)
        try:
            stream.start()
        except KeyboardInterrupt:
            sys.exit()
        except IncompleteRead:
            # This error occurs sometimes under high volume, simply reconnect
            stream.stop()
        except (TweepError, ConnectionError, ConnectionResetError, ProtocolError) as e:
            stream.stop()
            report_error(logger, exception=True)
            error_count_last_hour = update_error_count(error_count_last_hour, time_last_error)
            time_last_error = time.time()
        except Exception as e:
            stream.stop()
            report_error(logger, msg='Uncaught stream exception.', exception=True)
            error_count_last_hour = update_error_count(error_count_last_hour, time_last_error)
            time_last_error = time.time()
        # if error_count_last_hour > 10:
        #     report_error(logger, msg='Failing to reconnect. Aborting.')
        #     sys.exit()
        wait_some_time(time_last_error, error_count_last_hour)
    logger.info('Shutting down...')
def parse_dates(self, *dates, input_format='%Y-%m-%d %H:%M:%S',
                output_format='%a %b %d %H:%M:%S %z %Y'):
    """Parse dates into Twitter's unusual created_at date format"""
    res = []
    for d in dates:
        if isinstance(d, str) and 'now' in d:
            res.append(d)
            continue
        try:
            d_date = datetime.strptime(d, input_format)
        except (ValueError, TypeError):
            report_error(
                logger,
                msg='Date {} is not of format {}. Using "now" instead'.format(
                    d, input_format))
            res.append('now')
        else:
            d_date = d_date.replace(tzinfo=timezone.utc)
            res.append(d_date.strftime(output_format))
    return res
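# A standalone illustration of the conversion parse_dates performs, using only the
# standard library. The sample date is arbitrary.
from datetime import datetime, timezone

d = datetime.strptime('2020-01-01 12:00:00', '%Y-%m-%d %H:%M:%S')
d = d.replace(tzinfo=timezone.utc)
print(d.strftime('%a %b %d %H:%M:%S %z %Y'))  # Wed Jan 01 12:00:00 +0000 2020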
def sync(self):
    num_new_data = len(self)
    if num_new_data == 0:
        logger.info('No new data was collected. Aborting.')
        return
    logger.info(f'Writing {num_new_data:,} ids to file {self.data_dump_key}...')
    with open(self.local_file_tmp, 'w') as f:
        for chunk in self.pop_all_iter():
            chunk = list(set(chunk))
            if len(chunk) > 0:
                f.write('\n'.join(chunk) + '\n')
    logger.info(f'Collected {num_new_data:,} ids')
    if self.s3_handler.file_exists(self.data_dump_key):
        success = self.download_existing_data_dump()
        if not success:
            logger.error('Something went wrong when trying to download the existing data. Aborting.')
            return
        # decompress data
        decompress(self.local_file_compr, self.local_file)
        os.remove(self.local_file_compr)
        # concatenate new data with existing data
        with open(self.local_file, 'a') as f, open(self.local_file_tmp, 'r') as f_tmp:
            shutil.copyfileobj(f_tmp, f)
    else:
        # There is no existing data, simply rename file
        os.rename(self.local_file_tmp, self.local_file)
    # re-upload file
    logger.info('Compressing file...')
    compress(self.local_file, self.local_file_compr)
    logger.info(f'Uploading file to S3 under key {self.data_dump_key}')
    success = self.s3_handler.upload_file(self.local_file_compr, self.data_dump_key, make_public=True)
    if not success:
        report_error(logger, msg='Uploading data dump Ids file to S3 unsuccessful.')
    # cleanup
    logger.info('Cleaning up temporary files...')
    for f in [self.local_file, self.local_file_tmp, self.local_file_compr]:
        if os.path.isfile(f):
            os.remove(f)
def get_random_document(self, index_name, doc_type='tweet'):
    body = {
        'query': {
            'function_score': {
                'functions': [{
                    'random_score': {}
                }]
            }
        }
    }
    res = self.es.search(index=index_name, doc_type=doc_type, body=body,
                         size=1, filter_path=['hits.hits'])
    hits = res['hits']['hits']
    if len(hits) == 0:
        report_error(
            logger,
            msg='Could not find a random document in index {}'.format(index_name))
        return None
    return hits[0]['_source']
def get_new_tweet(project):
    """Get new tweet from priority queue"""
    user_id = request.args.get('user_id', None)
    fields = request.args.get('fields', ['id', 'text'])
    logger.info(f"Getting tweet for project {project} for user {user_id}")
    tid = TweetIdQueue(project)
    tweet = tid.get_tweet(user_id=user_id)
    if tweet is None:
        msg = 'Could not get tweet id from priority queue. Getting random tweet from ES instead.'
        report_error(logger, msg=msg)
        # get a random tweet instead
        tweet = es.get_random_document(project)
        if tweet is None:
            msg = 'Could not get random tweet from elasticsearch.'
            report_error(logger, msg=msg)
            return jsonify({'error': msg}), 400
    tweet = {k: tweet.get(k) for k in fields}
    if 'id' in tweet:
        # rename fields
        tweet['tweet_id'] = str(tweet.pop('id'))
        tweet['tweet_text'] = tweet.pop('text')
    logger.info(f"Retrieving tweet {tweet['tweet_id']} for user {user_id}")
    return jsonify(tweet)
def remove_lowest_priority(self, random_deletion=True):
    """Remove key with the lowest priority"""
    if not random_deletion:
        # Just delete lowest priority key
        items = self._r.zrevrange(self.key, 0, 0, withscores=True)
        num_deleted = self._r.zremrangebyrank(self.key, 0, 0)
        if num_deleted == 0:
            report_error(
                self.logger,
                msg='Tried to remove lowest ranking element but queue is empty.',
                level='warning')
        return items
    # Remove a random lowest priority key
    items = self._r.zrevrange(self.key, 0, 0, withscores=True)
    if len(items) == 0:
        report_error(
            self.logger,
            msg='Tried to remove lowest ranking element but queue is empty.',
            level='warning')
        return
    lowest_score = items[0][1]
    num_elements = self._r.zcount(self.key, lowest_score, lowest_score)
    if num_elements == 0:
        msg = 'Element with score {} could not be found. Possibly it has been removed before. Aborting.'.format(
            lowest_score)
        self.logger.warning(msg)
        return
    elif num_elements == 1:
        items = self._r.zrevrange(self.key, 0, 0, withscores=True)
        self._r.zremrangebyrank(self.key, 0, 0)
    else:
        # multiple elements with the same lowest score, randomly remove one
        rand_index = random.randint(0, num_elements - 1)
        self.logger.debug(
            'Picked {} as randindex between {} and {}'.format(
                rand_index, 0, num_elements - 1))
        items = self._r.zrange(self.key, rand_index, rand_index, withscores=True)
        num_deleted = self._r.zremrangebyrank(self.key, rand_index, rand_index)
        if num_deleted != 1:
            report_error(
                self.logger,
                msg='Random key could not be deleted because it does not exist anymore')
    return items
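# To make the sorted-set bookkeeping above easier to follow, a small standalone
# illustration of the redis-py calls involved. The key and member names are made up.
import redis

r = redis.Redis()
r.zadd('demo_queue', {'tweet_a': 1, 'tweet_b': 1, 'tweet_c': 3})

r.zrevrange('demo_queue', 0, 0, withscores=True)  # [(b'tweet_c', 3.0)]: highest score first
r.zrange('demo_queue', 0, 0, withscores=True)     # [(b'tweet_a', 1.0)]: lowest score first
r.zcount('demo_queue', 1, 1)                      # 2: members sharing the lowest score
r.zremrangebyrank('demo_queue', 0, 0)             # removes rank 0, i.e. the lowest-scored member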
def remove(self, item):
    """Remove key by keyname"""
    if self._r.zrem(self.key, item) == 0:
        report_error(self.logger,
                     msg='Element {} could not be deleted'.format(item.decode()))
def on_warning(self, notice):
    report_error(self.logger, msg=notice, level='warning')
def on_timeout(self):
    report_error(self.logger, msg='Stream listener has timed out')
    return True  # To continue listening