Example #1
 def download_media(self, url, f_name, size):
     # append the requested size to the media URL
     url = "{}:{}".format(url, size)
     try:
         urllib.request.urlretrieve(url, f_name)
     except Exception as e:
         report_error(self.logger, exception=True)
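
Note: most examples in this listing report failures through a report_error helper whose implementation is not shown. Judging from the call sites (a logger as first argument, an optional msg, exception=True to log the exception currently being handled, and an optional level), a minimal sketch could look like the following; the signature and body are inferred from usage, not the actual implementation:

import sys

def report_error(logger, msg='', exception=False, level='error'):
    """Sketch of the assumed error-reporting helper (inferred from call sites)."""
    log = getattr(logger, level, logger.error)
    if exception:
        # include the traceback of the exception currently being handled
        log(msg or repr(sys.exc_info()[1]), exc_info=True)
    else:
        log(msg)

Examples #2, #10 and #21 (SIP plugin code) instead call a two-argument form, report_error(message, exception), so two distinct helpers of the same name appear in this listing.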
Example #2
def get_readme():
    plugs = {}
    try:
        response = urlopen(
            u"https://api.github.com/repos/Dan-in-CA/SIP_plugins/readme"
        )
        data = response.read()
        d = json.loads(data.decode('utf-8'))
        text = base64.b64decode(d[u"content"]).decode(u'utf-8')
        t_list = text.split()
        sep = [i for i, s in enumerate(t_list) if u"***" in s][0]
        plug_list = t_list[sep + 1 :]
        breaks = [i for i, s in enumerate(t_list) if u"---" in s and False] if False else [i for i, s in enumerate(plug_list) if u"---" in s]

        for i in range(len(breaks)):
            if i < len(breaks) - 1:
                plugs[plug_list[breaks[i] - 1]] = u" ".join(
                    plug_list[breaks[i] + 1 : breaks[i + 1] - 1]
                )
            else:
                plugs[plug_list[breaks[i] - 1]] = u" ".join(plug_list[breaks[i] + 1 :])
    except IOError as e:
        report_error(U"We couldn't get readme file for github", e)

    return plugs
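
The parser above relies on a fixed README layout: after whitespace tokenization, a token containing "***" separates the header from the plugin list, and each plugin name is immediately followed by a "---" token and its description. A self-contained illustration (plugin names and descriptions are hypothetical):

# Hypothetical README body in the layout get_readme() expects
text = u'SIP plugin list *** pump_control --- Controls a master pump relay mqtt --- Publishes station status'
t_list = text.split()
sep = [i for i, s in enumerate(t_list) if u'***' in s][0]     # -> 3
plug_list = t_list[sep + 1:]
breaks = [i for i, s in enumerate(plug_list) if u'---' in s]  # -> [1, 8]
# Applying the loop from get_readme() then yields:
# {'pump_control': 'Controls a master pump relay',
#  'mqtt': 'Publishes station status'}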
Example #3
 def predict(self, texts):
     # run prediction in batches
     num_docs = len(texts)
     output = []
     can_convert_label_to_int = True
     for i in range(0, num_docs, self.batch_size):
         texts_slice = texts[i:(i + self.batch_size)]
         texts_slice = self.preprocess_text(texts_slice)
         resp = self.sagemaker.predict(self.endpoint_name, {'text': texts_slice})
         status_code = resp['ResponseMetadata']['HTTPStatusCode']
         if status_code != 200:
             report_error(
                 logger,
                 msg=f'Prediction on endpoint {self.endpoint_name} unsuccessful.'
             )
         preds = json.loads(resp['Body'].read())['predictions']
         _output = [{
             'labels': _pred['labels'],
             'probabilities': _pred['probabilities']
         } for _, _pred in zip(texts_slice, preds)]
         output.extend(_output)
     label_vals = [
         self.labels_to_int(_output['labels']) for _output in output
     ]
     if all(label_vals):
         output = [{
             'label_vals': _label_vals,
             **_output
         } for _output, _label_vals in zip(output, label_vals)]
     return output
Example #4
    def update(self, tweet_id, user_id):
        """Track the fact that user user_id classified tweet_id.
        This method updates the score of the item in the priority queue and adds the user_id to the tweet's Redis set.

        :param tweet_id:
        :param user_id:
        """
        if not self.pq:
            report_error(self.logger,
                         msg='Priority queue does not exist. Aborting.')
            return
        if not self.pq.exists(tweet_id):
            # This may happen relatively often when multiple people are working on the same tweet
            report_error(self.logger,
                         msg='Key {} does not exist anymore. Aborting.'.format(
                             tweet_id),
                         level='warning')
            return
        # Change priority in queue
        self.pq.increment_priority(tweet_id)
        # remove from pqueue once the score reaches the priority threshold
        score = self.pq.get_score(tweet_id)
        if score >= self.priority_threshold:
            self.remove(tweet_id)
            return
        # add user to set of tweet_id
        self.rset.add(tweet_id, user_id)
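
For context, self.pq wraps a Redis sorted set (see Examples #27 and #28) and self.rset a Redis set. A standalone sketch of the same update flow using redis-py 3.x directly; key names, threshold and connection settings are assumptions for illustration:

import redis

r = redis.Redis(host='localhost', port=6379)
PQ_KEY = 'pq:my_project'          # hypothetical sorted-set key
SEEN_KEY = 'seen:my_project:{}'   # hypothetical per-tweet user set
PRIORITY_THRESHOLD = 3            # hypothetical threshold

def update(tweet_id, user_id):
    if r.zscore(PQ_KEY, tweet_id) is None:
        return  # key no longer exists, e.g. removed by a concurrent worker
    score = r.zincrby(PQ_KEY, 1, tweet_id)  # bump the tweet's priority score
    if score >= PRIORITY_THRESHOLD:
        r.zrem(PQ_KEY, tweet_id)  # enough classifications collected, drop it
        return
    r.sadd(SEEN_KEY.format(tweet_id), user_id)  # remember who classified it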
Example #5
 def test_connection(self):
     test = self.redis_conn.ping()
     if test:
         self.logger.info('Successfully connected to Redis host {}'.format(app.config['REDIS_HOST']))
     else:
         report_error(self.logger, msg='FAILURE: Connection to Redis host {} not successful'.format(app.config['REDIS_HOST']))
     return test
Example #6
 def download_file(self, local_path, key):
     try:
         self._s3_client.download_file(self.bucket, key, local_path)
     except Exception as e:
         report_error(logger, exception=True)
         return False
     else:
         return True
Example #7
def test_rollbar():
    d = {}
    try:
        d['missing_key']  # deliberately raises a KeyError
    except KeyError:
        report_error(logger, exception=True)
    report_error(logger, msg='Test: report an arbitrary error message')
    return 'error reported'
Example #8
 def upload_to_s3(self, content, key):
     try:
         self._s3_client.put_object(Body=content,
                                    Bucket=self.bucket,
                                    Key=key)
     except Exception as e:
         report_error(logger, exception=True)
         return False
     else:
         return True
Example #9
 def file_exists(self, key):
     try:
         self._s3_client.head_object(Bucket=self.bucket, Key=key)
     except botocore.exceptions.ClientError as e:
         if e.response['Error']['Code'] == "404":
             return False
         else:
             report_error(logger, exception=True)
             return False
     else:
         return True
Example #10
def parse_manifest(plugin):
    try:
        with open(u"plugins/manifests/" + plugin + u".manifest") as mf:
            mf_list = mf.readlines()
            sep = [i for i, s in enumerate(mf_list) if u"###" in s][0]
            desc = u"".join(mf_list[:sep]).rstrip()
            f_list = [line.strip() for line in mf_list[sep + 2 :]]
            return (desc, f_list)
    except IOError as e:
        report_error(u"parse_manifest IOError", e)
        return (u"", [])
Example #11
def remove_from_pq(project):
    """Remove a tweet which is now private"""
    data = request.get_json()
    logger.debug('Incoming request with data {}'.format(data))
    if data is None or 'tweet_id' not in data:
        report_error(logger, msg='No tweet_id was passed when updating')
        return Response(None, status=400, mimetype='text/plain')
    logger.info(f"Removing {data['tweet_id']} for project {project}")
    tid = TweetIdQueue(project)
    tid.remove(data['tweet_id'])
    return Response('Successfully removed.', status=200, mimetype='text/plain')
Example #12
def add_to_pq(project):
    """Update priority score in queue and remember that a user has already classified a tweet"""
    data = request.get_json()
    logger.debug('Incoming request with data {}'.format(data))
    if data is None or 'user_id' not in data or 'tweet_id' not in data:
        report_error(logger, msg='No user_id or tweet_id was passed when updating')
        return Response(None, status=400, mimetype='text/plain')
    logger.info(f"Adding {data['tweet_id']} to project {project} for user {data['user_id']}")
    tid = TweetIdQueue(project)
    tid.update(data['tweet_id'], data['user_id'])
    return Response('Update successful.', status=200, mimetype='text/plain')
Example #13
 def test_connection(self):
     test = self._r.ping()
     if test:
         logger.info(
             f'Successfully connected to Redis host {self.host}:{self.port}'
         )
     else:
         report_error(
             logger,
             msg=f'FAILURE: Connection to Redis host {self.host}:{self.port} not successful'
         )
     return test
Example #14
 def test_connection(self):
     """test_connection"""
     test = self.es.ping()
     if test:
         logger.info(
             'Successfully connected to Elasticsearch host {}'.format(
                 self.config['ELASTICSEARCH_HOST']))
     else:
         report_error(
             logger,
             msg='Connection to Elasticsearch host {} not successful!'.
             format(self.config['ELASTICSEARCH_HOST']))
     return test
Example #15
 def upload_file(self, local_path, key, make_public=False):
     extra_args = None
     if make_public:
         extra_args = {'ACL': 'public-read'}
     try:
         self._s3_client.upload_file(local_path,
                                     self.bucket,
                                     key,
                                     ExtraArgs=extra_args)
     except Exception as e:
         report_error(logger, exception=True)
         return False
     else:
         return True
Example #16
 def put_template(self, template_path, template_name):
     """Put template to ES
     """
     # read template file
     if not os.path.exists(template_path):
         report_error(
             logger,
             msg='No template file found under {}'.format(template_path))
         return
     with open(template_path, 'r') as f:
         template = json.load(f)
     res = self.es.indices.put_template(template_name,
                                        body=template,
                                        include_type_name=True)
     logger.info("Template {} added to Elasticsearch".format(template_path))
Example #17
 def on_error(self, status_code):
     if status_code in ERROR_CODES:
         msg = 'Error {}: {} {}'.format(
             status_code, ERROR_CODES[status_code]['text'],
             ERROR_CODES[status_code]['description'])
         report_error(self.logger, msg=msg)
         if status_code == 420:
             self.logger.info('Waiting for a bit...')
             self.rate_error_count += 1
             # wait at least 15min, 15min more for each successive rate limit error
             time.sleep(self.rate_error_count * 15 * 60)
     else:
         report_error(
             self.logger,
             msg='Received unknown error code {}'.format(status_code))
     return True  # To continue listening
Example #18
 def bulk_actions_in_batches(self, actions, batch_size=1000):
     num_actions = len(actions)
     logger.info(f'Processing {num_actions:,} bulk actions...')
     for i in range(0, num_actions, batch_size):
         num_actions_in_batch = len(actions[i:(i + batch_size)])
         try:
             self.bulk_action(actions[i:(i + batch_size)])
         except Exception:
             logger.error(
                 f'Elasticsearch failed to process batch of {num_actions_in_batch:,} actions'
             )
             report_error(logger, exception=True)
             return False
         else:
             logger.info(
                 f'Successfully processed batch of {num_actions_in_batch:,} actions'
             )
     return True
Example #19
 def get(self, user_id=None):
     """Get new tweet ID to classify for user ID """
     # If no user is defined, simply pop the queue
     if user_id is None:
         tweet_id = self.pq.pop()
         if tweet_id is None:
             report_error(self.logger, msg='Queue is empty')
             return None
         else:
             return tweet_id
     else:
         tweet_id = self.retrieve_for_user(user_id)
         if tweet_id is None:
             report_error(
                 self.logger,
                 msg='No new tweet could be found for user_id {}'.format(
                     user_id))
         else:
             return tweet_id
Example #20
    def index_tweet(self, tweet, index_name):
        """Index new tweet in index name given by tweet['project']. Will not re-index already existing doc with same ID.

        :tweet: tweet to index
        """
        try:
            self.es.index(index=index_name,
                          id=tweet['id'],
                          doc_type='tweet',
                          body=tweet,
                          op_type='create')
        except elasticsearch.ConflictError as e:
            # This usually happens when a document with the same ID already exists.
            logger.warning('Conflict Error')
        except elasticsearch.TransportError as e:
            report_error(logger, exception=True)
        else:
            logger.debug('Tweet with id {} sent to index {}'.format(
                tweet['id'], index_name))
Example #21
def get_permissions():
    global installed
    try:
        permissions = []
        files = subprocess.check_output([u"ls", u"plugins"])
        files = files.decode(u'utf-8')  # to unicode string
        installed = [f for f in list(files.split(u"\n")) if re.match(r"[^_].+\.py$", f)]
        pm = installed.index(u"plugin_manager.py")
        del installed[pm]  #  Remove this plugin from list
        for p in installed:
            mod = subprocess.check_output([u"stat", u"-c %a", u"plugins/" + p])
            mod = mod.decode(u'utf-8')
            permissions.append(int(list(mod.strip())[1]) % 2)
        settings = dict(list(zip(installed, permissions)))
        return settings
    except IOError as e:
        report_error(u"get_permissions IOError", e)
        settings = {}
        return settings
Example #22
def main():
    """Here we instantiate the stream manager, listener and connect to the Twitter streaming API."""
    global stream
    # setting things up...
    logger = logging.getLogger('stream')
    listener = Listener()
    auth = get_auth()
    # wait for a bit before connecting, in case container will be paused
    logger.debug('Streaming container is ready, sleeping for a bit...')
    time.sleep(10)
    time_last_error = 0
    error_count_last_hour = 0
    while run:
        logger.debug('Trying to connect to Twitter API...')
        stream = StreamManager(auth, listener)
        try:
            stream.start()
        except KeyboardInterrupt:
            sys.exit()
        except IncompleteRead:
            # This error occurs sometimes under high volume; simply reconnect
            stream.stop()
        except (TweepError, ConnectionError, ConnectionResetError,
                ProtocolError) as e:
            stream.stop()
            report_error(logger, exception=True)
            error_count_last_hour = update_error_count(error_count_last_hour,
                                                       time_last_error)
            time_last_error = time.time()
        except Exception as e:
            stream.stop()
            report_error(logger,
                         msg='Uncaught stream exception.',
                         exception=True)
            error_count_last_hour = update_error_count(error_count_last_hour,
                                                       time_last_error)
            time_last_error = time.time()
        # if error_count_last_hour > 10:
        #     report_error(logger, msg='Failing to reconnect. Aborting.')
        #     sys.exit()
        wait_some_time(time_last_error, error_count_last_hour)
    logger.info('Shutting down...')
Example #23
 def parse_dates(self,
                 *dates,
                 input_format='%Y-%m-%d %H:%M:%S',
                 output_format='%a %b %d %H:%M:%S %z %Y'):
     """Used to parse for Twitter's unusual created_at date format"""
     res = []
     for d in dates:
         if isinstance(d, str) and 'now' in d:
             res.append(d)
             continue
         try:
             d_date = datetime.strptime(d, input_format)
         except (ValueError, TypeError):
             report_error(
                 logger,
                 msg='Date {} is not of format {}. Using "now" instead'.
                 format(d, input_format))
             res.append('now')
         else:
             d_date = d_date.replace(tzinfo=timezone.utc)
             res.append(d_date.strftime(output_format))
     return res
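
For reference, the round trip parse_dates performs on a single date looks like this using only the standard library (the date is illustrative):

from datetime import datetime, timezone

d = datetime.strptime('2020-05-17 09:30:00', '%Y-%m-%d %H:%M:%S')
d = d.replace(tzinfo=timezone.utc)
print(d.strftime('%a %b %d %H:%M:%S %z %Y'))  # Sun May 17 09:30:00 +0000 2020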
Example #24
 def sync(self):
     num_new_data = len(self)
     if num_new_data == 0:
         logger.info('No new data was collected. Aborting.')
         return
     logger.info(f'Writing {num_new_data:,} to file {self.data_dump_key}...')
     with open(self.local_file_tmp, 'w') as f:
         for chunk in self.pop_all_iter():
             chunk = list(set(chunk))
             if len(chunk) > 0:
                 f.write('\n'.join(chunk) + '\n')
     logger.info(f'Collected {num_new_data:,} ids')
     if self.s3_handler.file_exists(self.data_dump_key):
         success = self.download_existing_data_dump()
         if not success:
             logger.error('Something went wrong when trying to download the existing data. Aborting.')
             return
         # decompress data
         decompress(self.local_file_compr, self.local_file)
         os.remove(self.local_file_compr)
         # concatenating new data
         with open(self.local_file, 'a') as f_out, open(self.local_file_tmp, 'r') as f_in:
             shutil.copyfileobj(f_in, f_out)
     else:
         # There is no existing data, simply rename file
         os.rename(self.local_file_tmp, self.local_file)
     # reuploading file
     logger.info('Compressing file...')
     compress(self.local_file, self.local_file_compr)
     logger.info(f'Uploading file to S3 under key {self.data_dump_key}')
     success = self.s3_handler.upload_file(self.local_file_compr, self.data_dump_key, make_public=True)
     if not success:
         report_error(logger, msg='Uploading data dump Ids file to S3 unsuccessful.')
     # cleanup
     logger.info('Cleaning up temporary files...')
     for f in [self.local_file, self.local_file_tmp, self.local_file_compr]:
         if os.path.isfile(f):
             os.remove(f)
Example #25
 def get_random_document(self, index_name, doc_type='tweet'):
     body = {
         'query': {
             'function_score': {
                 'functions': [{
                     'random_score': {}
                 }]
             }
         }
     }
     res = self.es.search(index=index_name,
                          doc_type=doc_type,
                          body=body,
                          size=1,
                          filter_path=['hits.hits'])
     hits = res['hits']['hits']
     if len(hits) == 0:
         report_error(
             logger,
             msg='Could not find a random document in index {}'.format(
                 index_name))
         return None
     return hits[0]['_source']
Example #26
def get_new_tweet(project):
    """Get a new tweet from the priority queue"""
    user_id = request.args.get('user_id', None)
    fields = request.args.getlist('fields') or ['id', 'text']
    logger.info(f"Getting tweet for project {project} for user {user_id}")
    tid = TweetIdQueue(project)
    tweet = tid.get_tweet(user_id=user_id)
    if tweet is None:
        msg = 'Could not get tweet id from priority queue. Getting random tweet from ES instead.'
        report_error(logger, msg=msg)
        # get a random tweet instead
        tweet = es.get_random_document(project)
        if tweet is None:
            msg = 'Could not get random tweet from elasticsearch.'
            report_error(logger, msg=msg)
            return jsonify({'error': msg}), 400
    tweet = {k: tweet.get(k) for k in fields}
    if 'id' in tweet:
        # rename fields
        tweet['tweet_id'] = str(tweet.pop('id'))
        tweet['tweet_text'] = tweet.pop('text')
    logger.info(f"Retrieving tweet {tweet['tweet_id']} for user {user_id}")
    return jsonify(tweet)
Example #27
 def remove_lowest_priority(self, random_deletion=True):
     """Remove key with the lowest priority"""
     if not random_deletion:
         # Just delete the lowest-priority key (rank 0 in ascending score order)
         items = self._r.zrange(self.key, 0, 0, withscores=True)
         num_deleted = self._r.zremrangebyrank(self.key, 0, 0)
         if num_deleted == 0:
             report_error(
                 self.logger,
                 msg=
                 'Tried to remove lowest ranking element but queue is empty.',
                 level='warning')
         return items
     # Remove a random lowest priority key
     items = self._r.zrange(self.key, 0, 0, withscores=True)
     if len(items) == 0:
         report_error(
             self.logger,
             msg=
             'Tried to remove lowest ranking element but queue is empty.',
             level='warning')
         return
     lowest_score = items[0][1]
     num_elements = self._r.zcount(self.key, lowest_score, lowest_score)
     if num_elements == 0:
         msg = 'Element with score {} could not be found. Possibly it has been removed before. Aborting.'.format(
             lowest_score)
         self.logger.warning(msg)
         return
     elif num_elements == 1:
         items = self._r.zrange(self.key, 0, 0, withscores=True)
         self._r.zremrangebyrank(self.key, 0, 0)
     else:
         # multiple elements with the same lowest score, randomly remove one
         rand_index = random.randint(0, num_elements - 1)
         self.logger.debug(
             'Picked {} as randindex between {} and {}'.format(
                 rand_index, 0, num_elements - 1))
         items = self._r.zrange(self.key,
                                rand_index,
                                rand_index,
                                withscores=True)
         num_deleted = self._r.zremrangebyrank(self.key, rand_index,
                                               rand_index)
         if num_deleted != 1:
             report_error(
                 self.logger,
                 msg=
                 'Random key could not be deleted because it does not exist anymore'
             )
     return items
Example #28
 def remove(self, item):
     """Remove key by keyname"""
     if self._r.zrem(self.key, item) == 0:
         report_error(self.logger,
                      msg='Element {} could not be deleted'.format(
                          item.decode()))
Example #29
 def on_warning(self, notice):
     report_error(self.logger, msg=notice, level='warning')
Example #30
 def on_timeout(self):
     report_error(self.logger, msg='Stream listener has timed out')
     return True  # To continue listening