def count_reactions(client):
    # legacy_reactions = get_legacy_reactions()
    database = DataBase()
    for user in Session().query(SlackUser).all():
        reactions_response = client.reactions_list(user=user.id)
        items = reactions_response.data['items']
        for item in items:
            channel = item['channel']
            message = item['message']
            ts = message['ts']
            author = message['user']
            reactions = message['reactions']
            # if user.id in legacy_reactions.keys() and ts == legacy_reactions[user.id]:
            #     break
            for reaction in reactions:
                if user.id in reaction['users']:
                    reaction_code = reaction['name'].split(':')[0]
                    if database.select(Reaction, channel=channel, timestamp=ts,
                                       sender=user.id, receiver=author,
                                       name=reaction_code) is None:
                        database.insert(
                            Reaction(channel, ts, reaction_code, user.id, author))
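# Abbreviated, illustrative shape of a reactions.list item navigated above;
# the field values are made up. Reaction names may carry a skin-tone suffix
# such as 'thumbsup::skin-tone-2', which is why only the part before the first
# ':' is kept as the reaction code:
#
#   item = {
#       'type': 'message',
#       'channel': 'C0123456',
#       'message': {
#           'ts': '1610000000.000100',
#           'user': 'U0AUTHOR',  # message author -> Reaction receiver
#           'reactions': [
#               {'name': 'thumbsup::skin-tone-2', 'users': ['U0SENDER'], 'count': 1},
#           ],
#       },
#   }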
def start():
    database = DataBase()
    with open(current_path + '/course_list.csv', 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            course = Course(row['name'], row['tag'], row['url'], True)
            database.insert(course)
    # truncate the same file after importing so courses are not inserted twice
    with open(current_path + '/course_list.csv', 'w') as file:
        file.write('')
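# Illustrative course_list.csv contents; the header names match the DictReader
# keys used above, and the URL is scheme-less because the scraper prepends
# 'https://' when fetching:
#
#   name,tag,url
#   Skriptni jezici,#skriptni,www.fer.unizg.hr/predmet/skr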
def structure_message(notification):
    database = DataBase()
    author = database.select(Author, id=notification.author)
    output_string = '>*' + notification.title + '*\n>\n>'
    output_string += str(notification.text).replace('\n', '\n>') + '\n>\n>'
    output_string += '*' + author.first_name + ' ' + author.last_name + '*' + ' ' * 10 + parse_date(
        str(notification.publish_date)) + '\n\n'
    output_string += '[src: ' + notification.link + ']'
    return output_string
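# With illustrative field values, the assembled Slack mrkdwn message looks
# roughly like this (the exact date string depends on parse_date):
#
#   >*Exam schedule*
#   >
#   >The midterm starts at 12:00 in hall D1.
#   >
#   >*Jane Doe*          1.1.2021.
#
#   [src: https://www.fer.unizg.hr/...]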
def command_endpoint():
    database = DataBase()
    data, data_ok = SlackCommandUtils.read_data(request, text_tokens_length=1)
    if not data_ok:
        return 'Invalid request format', 400
    command = data['text']
    if command != 'start' and command != 'stop':
        return 'Invalid request format', 400
    response = requests.get(default_protocol + '://' + config.get('flask_address') + ':' +
                            config.get('flask_port') + '/ui/home/scraper/' + command)
    # a 200 from the scraper endpoint marks the command as successful
    success_flag = response.status_code == 200
    slack_command_log = SlackCommandUtils.create_slack_command_log(data, success_flag)
    database.insert(slack_command_log)
    if success_flag:
        return '', 200
    else:
        return 'Error has occurred while commanding scraper', 500
def get_reaction_table():
    database = DataBase()
    data, data_ok = SlackCommandUtils.read_data(request=request, text_tokens_length=2)
    if not data_ok:
        return 'Invalid request format', 400
    try:
        # second token is the emoji wrapped in colons, e.g. ':fire:' -> 'fire'
        reaction_name = data['text'][1][1:-1]
        if data['text'][0] == 'r':
            result = reaction_manager.get_top_receivers(reaction_name=reaction_name)
        elif data['text'][0] == 's':
            result = reaction_manager.get_top_senders(reaction_name=reaction_name)
        else:
            return 'Invalid request format', 400
        response = {'response_type': 'in_channel'}
        response_text = 'Here is the top chart for :' + reaction_name + ':\n\n'
        for i, info in enumerate(result):
            response_text += str(i + 1) + '. <@' + database.select(
                SlackUser, name=info[0]).id + '> with ' + str(info[1]) + ' total '
            if data['text'][0] == 'r':
                response_text += 'received'
            elif data['text'][0] == 's':
                response_text += 'sent'
            response_text += ' reactions\n'
        response['text'] = response_text
        slack_command_log = SlackCommandUtils.create_slack_command_log(data, True)
        database.insert(slack_command_log)
        return jsonify(response), 200
    except Exception as e:
        slack_command_log = SlackCommandUtils.create_slack_command_log(data, False)
        database.insert(slack_command_log)
        return '', 500
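# Illustrative slash-command text handled above; the first token selects the
# direction ('r' for receivers, 's' for senders) and the second is the emoji
# wrapped in colons (the command name itself is whatever Slack is configured with):
#
#   /top r :fire:   -> top receivers of the :fire: reaction
#   /top s :fire:   -> top senders of the :fire: reaction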
def archive_course():
    database = DataBase()
    data, data_ok = SlackCommandUtils.read_data(request=request, text_tokens_length=1)
    if not data_ok:
        return 'Invalid request format', 400
    # [1:] drops the first character of the tag token before passing it as tag=
    response = requests.post(default_protocol + '://' + config.get('flask_address') + ':' +
                             config.get('flask_port') + '/ui/channel/archive?tag=' +
                             data['text'][1:])
    success_flag = response.status_code == 200
    slack_command_log = SlackCommandUtils.create_slack_command_log(data, success_flag)
    database.insert(slack_command_log)
    if success_flag:
        return '', 200
    else:
        return 'Error has occurred while archiving the channel', 500
from flask import Blueprint, Response

from src.main import client
from src.main.objects.scanner import Scanner
from src.models.base import DataBase
from src import Logger

app_nav_bar = Blueprint('app_nav_bar', __name__, template_folder='templates')
scanner = Scanner(client, DataBase())
logger = Logger()


@app_nav_bar.route('/scan/reactions', methods=['GET'])
def scan_reactions():
    try:
        scanner.scan_reactions()
        return Response(status=200)
    except Exception as e:
        logger.error_log(e)
        return Response(status=500)


@app_nav_bar.route('/scan/users', methods=['GET'])
def scan_users():
    try:
        scanner.scan_users()
        return Response(status=200)
    except Exception as e:
        logger.error_log(e)
        return Response(status=500)
def main():
    logger.info_log('Started populating database.')
    database = DataBase()
    client = slack.WebClient(token=sys.argv[1])
    response = client.users_list()

    # insert users
    for user in response['members']:
        if database.select(SlackUser, id=user['id']) is None:
            database.insert(SlackUser(user['id'], user['name']))

    # insert channels
    response = client.conversations_list()
    for channel in response['channels']:
        date_created = datetime.fromtimestamp(channel['created'])
        author_id = channel['creator']
        if database.select(Channel, id=channel['id']) is None:
            database.insert(
                Channel(channel['id'], '#' + channel['name'], author_id, date_created))

    # insert courses
    courses = [('Skriptni jezici', '#skriptni'),
               ('Umjetna inteligencija', '#ai'),
               ('Statistička analiza podataka', '#sap'),
               ('Trgovačko pravo', '#pravo'),
               ('Interaktivna računalna grafika', '#irg'),
               ('Završni rad', '#završni'),
               ('Napredno korištenje operacijskog sustava Linux', '#linux')]
    for course in courses:
        if database.select(Course, name=course[0], channel_tag=course[1]) is None:
            database.insert(Course(*course))

    # insert reactions
    response = client.users_list()
    count_reactions(client)

    # if there is a legacy file with reactions, insert legacy reactions
    # (without channel and timestamp attributes)
    if len(sys.argv) == 3:
        count_legacy_reactions(response, database, sys.argv[2])
    logger.info_log('Ended populating database.')
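# The populate script reads the Slack token from its first argument and an
# optional legacy-reactions file from its second; the script and file names
# below are illustrative:
#
#   python populate_database.py xoxb-000000-secret legacy_reactions.csv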
from flask import Blueprint, render_template, Response, request, redirect, url_for

from src.main import client
from src.main.objects.reaction_scrapper import ReactionScrapper
from src.models.base import DataBase, Session
from src import Logger
from src.main.objects.reaction_manager import ReactionManager
import re

app_reaction = Blueprint('app_reaction', __name__, template_folder='templates')
logger = Logger()
reaction_scrapper = ReactionScrapper(client, DataBase(), logger)
reaction_manager = ReactionManager(logger)


@app_reaction.route('/ui/reaction/<name>', methods=['GET', 'POST'])
def get_reactions(name):
    senders, receivers, top_channels, latest_reactions = reaction_manager.get_top_all(reaction_name=name)
    return render_template('reaction.html', senders=senders, receivers=receivers,
                           top_channels=top_channels, name=name,
                           latest_reactions=latest_reactions,
                           alive=reaction_scrapper.is_alive()), 200


@app_reaction.route('/ui/reaction/scan', methods=['GET'])
def scan_reactions():
    try:
        reaction_scrapper.count()
        return Response(status=200)
    except Exception as e:
        logger.error_log(e)
        return Response(status=500)
from flask import Blueprint, render_template, request, Response

from src import Logger
from src.main import client
from src.main.objects.reminder_manager import ReminderManager
from src.models.base import DataBase
from src.web_app.services import reminder_service

app_reminder = Blueprint('app_reminder', __name__, template_folder='templates')
logger = Logger()
reminder_manager = ReminderManager(client, DataBase(), logger)  # used for all interactions with database


@app_reminder.route('/ui/reminder', methods=['GET'])
def get_reminders():
    """ default mapping with all reminders """
    reminders = reminder_manager.get_reminders()
    courses, authors = reminder_manager.get_filter_options()
    return render_template('reminder.html', courses=courses, authors=authors,
                           reminders=reminders), 200


@app_reminder.route('/ui/reminder/filter', methods=['POST'])
def filter_reminders():
    """ mapping for returning only filtered reminders
    params:
        name : name of the course
        author : first and last name of the author
        from : all dates after it
        to : all dates before it
    """
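    # The original body is truncated in the source. A minimal sketch, assuming
    # reminder_service exposes a filter helper taking the documented form
    # fields; the helper name and signature are hypothetical:
    filtered = reminder_service.filter_reminders(request.form.get('name'),
                                                 request.form.get('author'),
                                                 request.form.get('from'),
                                                 request.form.get('to'))
    courses, authors = reminder_manager.get_filter_options()
    return render_template('reminder.html', courses=courses, authors=authors,
                           reminders=filtered), 200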
def start_scraper_process():
    progress_queue.put((NONE_PROGRESS, 'Starting scraper'))
    database = DataBase()
    logger.info_log('Program started.')
    # refresh_active_courses.start()
    # count_reactions(client)
    timeout = 600
    try:
        loop_count = 0
        courses = database.select_many(Course, watch=True)
        while True:
            check_pins(client, logger)
            check_reminders(client, logger)
            notifications = scraper.start(courses)
            print('Scraping phase done.')
            # TODO catch exception do in main
            if notifications is None:
                progress_queue.put((INIT_PROGRESS + SCRAPE_PROGRESS, None, 'warning'))
                timeout *= 2
                time.sleep(min(timeout, 2400))
                notifications = []
            else:
                timeout = 600
            for i in range(len(notifications)):
                notification = notifications[i]
                result = database.select(Notification,
                                         title=notification.title,
                                         site=notification.site,
                                         author=notification.author,
                                         publish_date=notification.publish_date,
                                         text=notification.text,
                                         link=notification.link)
                if result is None:
                    fresh_notification = notification.publish_date + timedelta(hours=24) >= datetime.now()
                    course = database.select(Course, id=notification.site)
                    should_post = check_filters(notification) and fresh_notification
                    if should_post:
                        channel = database.select(Channel, tag=course.channel_tag)
                        # check if course.channel_tag is enough (in place of channel.id)
                        response = client.chat_postMessage(channel=channel.id,
                                                           text=structure_message(notification))
                    database.insert(notification)
                    if should_post:
                        for reminder in generate_reminders(notification):
                            database.insert(reminder)
                        # https://api.slack.com/methods/pins.add
                        client.pins_add(channel=response['channel'], timestamp=response['ts'])
                        database.insert(Pin(datetime.now(), timedelta(hours=24),
                                            response['channel'], response['ts']))
                progress_queue.put((INIT_PROGRESS + SCRAPE_PROGRESS +
                                    int(SAVE_PROGRESS / len(notifications)) * (i + 1), None))
            gc.collect()
            # every 10 iterations refresh reactions and the watched course list
            loop_count += 1
            if loop_count == 10:
                count_reactions(client)
                courses = database.select_many(Course, watch=True)
                loop_count = 0
            progress_queue.put((DONE_PROGRESS, 'Scraping done', 'sleep'))
            time.sleep(60)
            progress_queue.put((NONE_PROGRESS, 'Starting scraper'))
    except Exception as e:
        logger.error_log(e)
    finally:
        error_channel = '#random'
        client.chat_postMessage(channel=error_channel, text='I am dead.')
        logger.info_log('Program finished with exit code 1.')
        progress_queue.put((NONE_PROGRESS, 'Program finished with exit code 1', 'error'))
        exit(1)
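# Items placed on progress_queue above are tuples: (progress_value, message) or
# (progress_value, message, status), where status is one of 'warning', 'sleep',
# or 'error'. A hypothetical consumer might unpack them like this:
#
#   progress, message, *rest = progress_queue.get()
#   status = rest[0] if rest else None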
class WebScraper:
    """ A class that scrapes the FER web for notifications

    Attributes
    ----------
    database: DataBase
        an object that is responsible for communicating with the database
    progress_queue_manager: ProgressQueueManager
        an object that is responsible for keeping track of progress made while
        scraping notifications
    logger : Logger
        an object that saves scraper logs to a predefined file
    link : str
        a link to the FER web page from which data will be scraped
    payload: dict
        a dictionary that contains the payload (like username and password) for
        logging into the target website
    date_format: dict
        a dictionary that contains date formats found in web notifications
    headers: dict
        an optional dictionary that is sent as the HTTP header when scraping
    html_parser: str
        an optional parameter that defines which HTML parser to use when
        parsing scraped raw HTML

    Methods
    -------
    start(courses: list[Course]) -> list[Notification]
        starts the wrapped process of scraping notifications for the target courses
    generate_notifications(courses: list[Course]) -> list[Notification]
        scrapes notifications for the target courses
    _check_author(author_string: str) -> Author
        checks for the author and saves them to the database if they don't exist
    _check_date(date_elements: list)
        returns the most recent date object based on date_format
    _check_text(text: str) -> str
        parses notification text for the Slack message
    """

    def __init__(self, link: str, payload: dict, **kwargs):
        self.database = DataBase()
        self.progress_queue_manager = ProgressQueueManager()
        self.logger = Logger()
        self.link = link
        self.payload = payload
        self.date_format = {
            'small': '%Y-%m-%dT%H:%M',
            'medium': '%Y-%m-%dCET%H:%M',
            'large': '%Y-%m-%dCEST%H:%M'
        }
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.html_parser = 'html.parser'
        if kwargs and 'headers' in kwargs:
            self.headers = kwargs['headers']
        if kwargs and 'html_parser' in kwargs:
            self.html_parser = kwargs['html_parser']

    def start(self, courses):
        notifications = []
        try:
            notifications = self.generate_notifications(courses)
        except errors.LoginError as e:
            notifications = None
            self.logger.warning_log(e.text)
        except Exception as e:
            notifications = []
            self.logger.error_log(
                e, text='Error has occurred while scraping notifications ')
        finally:
            return notifications

    def generate_notifications(self, courses):
        notifications = []
        session = requests.Session()
        intranet = session.post(self.link + '/login',
                                headers=self.headers,
                                data=self.payload)
        soup = BeautifulSoup(intranet.text, self.html_parser)
        check_element = soup.find('li', {'class': 'home-page'})
        if 'Intranet' not in check_element.text:
            raise errors.LoginError
        self.progress_queue_manager.init_phase()
        for i, course in enumerate(courses):
            time.sleep(2)
            self.progress_queue_manager.scraping_course_start_info(course)
            link = course.url + '/obavijesti'
            raw_html = session.get('https://' + link,
                                   headers=self.headers,
                                   data=self.payload).text
            soup = BeautifulSoup(raw_html, self.html_parser)
            news_articles = soup.findAll('div', {'class': 'news_article'})
            for j, news_article in enumerate(news_articles):
                title_element = news_article.find('div', {'class': 'news_title'})
                notification = Notification()
                notification.link = self.link + title_element.a['href']
                notification.title = title_element.get_text().strip()
                notification.site = self.database.select(Course, name=course.name).id
                notification.author = self._check_author(
                    news_article.find('span', {
                        'class': 'author_name'
                    }).get_text().strip()).id
                notification.publish_date = self._check_date(
                    news_article.findAll('time'))
                notification.text = self._check_text(
                    str(news_article.find('div', {'class': 'news_lead'})).replace(
                        '<p>', '').replace('</p>', '\n'))
                notifications.append(notification)
                self.progress_queue_manager.scrape_phase(
                    course, i, j, len(courses), len(news_articles))
            self.progress_queue_manager.scraping_course_done_info(
                course, i, len(courses))
        session.get(self.link + '/login/Logout?logout=1')
        return notifications

    def _check_author(self, author_string: str) -> Author:
        author_name_list = author_string.split()
        author = self.database.select(Author,
                                      first_name=author_name_list[0],
                                      last_name=' '.join(author_name_list[1:]))
        if author is None:
            self.database.insert(
                Author(author_name_list[0], ' '.join(author_name_list[1:])))
            author = self.database.select(Author,
                                          first_name=author_name_list[0],
                                          last_name=' '.join(
                                              author_name_list[1:]))
        return author

    def _check_date(self, date_elements: list):
        date = None
        for date_element in date_elements:
            element = date_element['datetime']
            if len(element) == 16:
                iteration_date = datetime.strptime(element,
                                                   self.date_format['small'])
            elif len(element) == 18:
                iteration_date = datetime.strptime(element,
                                                   self.date_format['medium'])
            else:
                iteration_date = datetime.strptime(element,
                                                   self.date_format['large'])
            if date is None:
                date = iteration_date
            elif date < iteration_date:
                # if the date was edited, keep the most recent one
                date = iteration_date
        return date

    # https://api.slack.com/reference/surfaces/formatting
    def _check_text(self, text: str) -> str:
        # getting only text from html
        text = '\n'.join(text.split('\n')[1:-1]).strip()
        # replacing HTML spacing
        text = text.replace(u"\u00A0", " ")
        # closing right gaps
        text = re.sub(r'([0-9A-Za-z:;,"\').]) +(</)', r'\g<1>\g<2>', text)
        # closing left gaps
        text = re.sub(r'(<[^>]+>) +([^ ])', r'\g<1>\g<2>', text)
        # expanding right gaps
        text = re.sub(r'(</[^>]+>)([^ :;,"\').])', r'\g<1> \g<2>', text)
        # expanding left gaps
        text = re.sub(r'([^ :;"\'(.])(<[^/>]+>)', r'\g<1> \g<2>', text)
        # replacing bold and italic html tags with slack tags
        text = text.replace('<strong>', '*').replace('</strong>', '*').replace(
            '<em>', '_').replace('</em>', '_')
        # parsing links
        link_groups: list[LinkHelper] = LinkHelper.create_list(
            re.findall(r'(<a href=\"([^<> ]*)\"[^<>]*>([^<>]*)</a>)', text))
        for link_group in link_groups:
            if 'http' in link_group.target:
                text = re.sub(
                    re.escape(link_group.target),
                    '<' + link_group.link + '|' + link_group.text + '>', text)
            else:
                mail_elements = re.search(
                    r"javascript:cms_mail\(\'([^,]*)\',\'([^,]*)\'.*\)",
                    link_group.link)
                text = re.sub(
                    re.escape(link_group.target),
                    '<mailto:' + mail_elements.group(1) + '@' +
                    mail_elements.group(2) + '|' + link_group.text + '>', text)
        # workaround for Slack not parsing lists as expected
        text = text.replace('<li>', '<li>• ')
        # removing extra spaces
        text = re.sub(r' +', r' ', text)
        # parsing the rest of the text
        return BeautifulSoup(text, self.html_parser).get_text()
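# A hypothetical usage sketch; the link and credential payload are illustrative,
# and the keyword arguments mirror the optional overrides accepted by __init__:
#
#   scraper = WebScraper('https://www.fer.unizg.hr',
#                        {'username': 'user', 'password': 'pass'},
#                        headers={'User-Agent': 'Mozilla/5.0'},
#                        html_parser='html.parser')
#   notifications = scraper.start(courses)   # courses: Course rows with .name, .url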