class Producer:
    """Sends records to a Kafka topic and logs each delivery outcome to the database.

    *config* must provide an 'airflow' entry and a 'kafka' section with at least
    'topic', 'host' and 'protocol' keys (plus the truststore/keystore keys when
    the protocol is 'ssl').
    """

    def __init__(self, config):
        self._properties = self._get_properties(config)
        self._airflow = config['airflow']
        self._db = DatabaseConnector(config)

    def produce_to_kafka(self, data):
        """Asynchronously send *data* to the configured topic.

        Delivery results are reported through _on_success/_on_failure, which
        persist an INFO/ERROR row via the database connector.
        """
        producer_options = dict(self._properties['producer'])
        # BUG FIX: the destination topic is not a KafkaProducer constructor
        # option, so remove it from the kwargs before building the client.
        # (Previously the properties dict used the key 'topics' while this
        # method read 'topic' -> KeyError, and the key would also have been
        # rejected by KafkaProducer(**...).)
        topic = producer_options.pop('topic')
        producer = KafkaProducer(**producer_options)
        # todo: add "locally" logging -> "Sending data to kafka.."
        future = producer.send(topic=topic, value=data)
        future.add_callback(self._on_success, data=data)
        future.add_errback(self._on_failure, data=data)

    def run(self):
        pass

    def _on_success(self, metadata, data=None) -> None:
        """Delivery callback: record the successfully sent payload."""
        # todo: add "locally" logging -> f"Successfully sent message: {metadata}"
        self._db.log_to_db('INFO', data, '')

    def _on_failure(self, exception, data=None) -> None:
        """Delivery errback: record the payload together with the error."""
        # todo: add "locally" logging -> f"Error while attempting to send message to kafka: {ex}"
        self._db.log_to_db('ERROR', data, str(exception))

    @staticmethod
    def _get_properties(config):
        """Build the producer (and optional SSL) settings from *config*."""
        properties = {
            'producer': {
                # Destination topic; popped out before KafkaProducer(**...) is called.
                'topic': config['kafka']['topic'],
                'bootstrap_servers': config['kafka']['host'],
                'key_serializer': None,
                'value_serializer': None,
                'acks': 'all'
            },
        }
        if config['kafka']['protocol'].lower() == 'ssl':
            # BUG FIX: a stray trailing comma previously made this the tuple
            # ('SSL',) instead of the string 'SSL'.
            properties['common_client'] = 'SSL'
            # NOTE(review): 'common_client' and the 'ssl_*' option names below
            # do not match kafka-python's producer config (security_protocol,
            # ssl_cafile, ...) -- confirm how these sections are consumed
            # before relying on SSL support.
            properties['ssl'] = {
                'ssl_endpoint_identification_algorithm': '',
                'ssl_truststore_location': config['kafka']['truststoreLocation'],
                'ssl_keystore_location': config['kafka']['keystoreLocation'],
                'ssl_truststore_password': config['kafka']['truststorePassword'],
                'ssl_keystore_password': config['kafka']['keystorePassword'],
                'ssl_key_password': config['kafka']['keyPassword']
            }
        return properties
# Module header and shared state for the league bot / scraper script.
# NOTE(review): several imports on one line; kept byte-identical here.
import requests, time, os, math, threading
from datetime import datetime, timedelta, timezone, time as dt_time
from dateutil import parser
from dotenv import load_dotenv
from team import Team
from image import build_image
from twitter import TwitterClient
from sms import SMSClient
from db import DatabaseConnector
from prediction import Prediction
from utils import headers

# Module-level singletons shared by the functions below.
twitter_client = TwitterClient()
sms_client = SMSClient()
db_connector = DatabaseConnector()

# NOTE(review): load_dotenv() runs AFTER the three clients above have been
# constructed -- if any of their __init__ methods read environment variables,
# they will not see the .env values. Confirm intended ordering.
load_dotenv()

dirname = os.path.dirname(__file__)
LEAGUE_ID = os.getenv("LEAGUE_ID")
LEAGUE_ID_PREV = os.getenv("LEAGUE_ID_PREV")


def get_utc_timestamp():
    """Return the current UTC time as an integer Unix timestamp.

    NOTE(review): datetime.utcnow() is naive, and .timestamp() interprets a
    naive datetime in *local* time -- this is only correct on hosts whose
    local timezone is UTC. Confirm the deployment environment.
    """
    return int(datetime.utcnow().timestamp())


def current_utc_day():
    """Return a naive datetime for today's UTC midnight (time fields zeroed)."""
    return datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)


def seconds_to(time):
    """NOTE(review): body not visible in this chunk -- left as the bare header.

    The parameter name shadows the imported `time` module (which is why
    `datetime.time` is aliased to `dt_time` above).
    """
def __init__(self, person, init_page):
    """Store the search target, fetch the start page, and open a DB connection.

    person: display name of the commenter being searched for.
    init_page: URL of the page fetched and parsed first.
    """
    self.person = person
    # request_init_page fetches the URL and returns it parsed (soup).
    self.init_soup = self.request_init_page(init_page)
    self.db = DatabaseConnector(Config.DATABASE)
class PortalScraper():
    """Scrapes portal.fo comment sections for comments made by one person.

    Static pages are fetched with requests/BeautifulSoup; the Facebook
    comments plugin is JavaScript-rendered, so those pages go through
    Selenium (see get_comment_section). Matching comments are stored via
    the DatabaseConnector.
    """

    def __init__(self, person, init_page):
        # person: display name of the commenter being searched for.
        self.person = person
        # Parsed soup of the initial "latest comments" page.
        self.init_soup = self.request_init_page(init_page)
        self.db = DatabaseConnector(Config.DATABASE)

    def request_init_page(self, init_page):
        """Fetch *init_page* and return it parsed as a BeautifulSoup object."""
        # request portal.fo
        res = requests.get(init_page)
        # read text and parse with the bs4 html parser
        soup = BeautifulSoup(res.text, 'html.parser')
        return soup

    def get_person_article_links(self):
        """
        Reads through the init page (http://portal.fo/seinastu+vidmerkingarnar.html)
        and gets every comment of the search person, storing the link to the
        article in which the comment was given in a set.

        Returns:
            Set if the page has changed since the last run, None otherwise.
        """
        comment_items = self.init_soup.find_all("div", class_="comment_item")
        hash_checker = HashChecker()
        hash_checker.generate_hash(comment_items)
        # Will compare with the hash saved in prev_hash.txt
        page_has_changed = hash_checker.hash_compare()
        if page_has_changed:
            hash_checker.save_new_hash()
            search_person_article_links = set()
            for comment in comment_items:
                commenter_name = comment.find(
                    class_="comment_profilename").text
                if (commenter_name == self.person):
                    # The parent anchor's href points at the article page.
                    search_person_article_links.add(comment.parent.get('href'))
            return search_person_article_links
        else:
            # Unchanged page: nothing new to scrape.
            return None

    def scrape_articles(self, articles):
        """Scrape and persist this person's comments from each article URL."""
        # test = 0
        for article in articles:
            # test_file = "test-" + str(test) + ".txt"
            comment_section_soup = self.get_comment_section(article)
            self.extract_comment_data(comment_section_soup, article)
            # Disabled debug dump of the comment-section HTML:
            '''
            with open(test_file, "w", encoding="utf-8") as fo:
                fo.write(str(comment_section_soup.prettify()))
            test = test + 1
            '''

    def get_comment_section(self, article):
        """
        -- This method is only meant to be used in this file --
        The Facebook Comments Plugin is loaded with Javascript, so we can't use
        the request module to read the articles, because it only gets static
        server HTML. This method uses Selenium, so we can wait for the plugin
        to have been loaded.

        Returns:
            Soup for each article comment section (BeautifulSoup object),
            or None on timeout (the except branch falls through).
        """
        driver = Config.get_driver()
        driver.get(article)
        timeout = 10
        try:
            # First we have to wait until the page is fully loaded, using
            # selenium and WebDriverWait. The Facebook Comments plugin is
            # loaded via Javascript, so we can't simply read the page.
            element_present = EC.presence_of_element_located(
                (By.CLASS_NAME, 'fb_iframe_widget'))
            WebDriverWait(driver, timeout).until(element_present)
            # wait for fb_iframe_widget_loader to disappear
            self.wait_until_disappeared(driver, 'fb_iframe_widget_loader')
            # Now the Facebook plugin has been loaded.
            # Get innerHTML of the page and parse it with BeautifulSoup.
            innerHTML = driver.execute_script(
                "return document.body.innerHTML"
            )  # returns the inner HTML as a string
            soup_comments = BeautifulSoup(innerHTML, 'html.parser')
            # This is the Facebook comments plugin, which is an iframe.
            facebook_plugin_iframe = soup_comments.find('iframe',
                                                        class_="fb_ltr")
            frame_id = facebook_plugin_iframe.get('id')
            # Because we need to work with another iframe, change the frame:
            # reset to the default content, then switch to the iframe whose id
            # we got from the Facebook comments plugin, then grab its innerHTML.
            # NOTE(review): switch_to_default_content() and the
            # find_elements_by_xpath helpers used below are legacy Selenium
            # APIs (removed in Selenium 4) -- confirm the pinned version.
            driver.switch_to_default_content()
            driver.switch_to.frame(frame_id)
            self.press_load_more_comments_if_present(driver)
            self.press_open_replies_if_present(driver)
            iframe_innerhtml = driver.execute_script(
                "return document.body.innerHTML"
            )  # returns the inner HTML as a string
            iframe_soup = BeautifulSoup(iframe_innerhtml, 'html.parser')
            return iframe_soup
        except TimeoutException:
            print("Timed out waiting for page to load")

    def wait_until_disappeared(self, driver, element):
        """Wait up to 10s for the element with class *element* to become invisible."""
        timeout = 10
        try:
            # NOTE(review): the parameter `element` (a class name string) is
            # rebound here to the wait result and never used afterwards.
            element = WebDriverWait(driver, timeout).until(
                EC.invisibility_of_element_located((By.CLASS_NAME, element)))
        except TimeoutException:
            print("Timed out waiting for element to disappear")

    def press_load_more_comments_if_present(self, driver):
        """Click every 'more comments' control so all top-level comments load."""
        load_more_buttons = driver.find_elements_by_xpath(
            "//*[contains(text(), 'more comments')]")
        for load_button in load_more_buttons:
            # Scroll the button into view before clicking it.
            driver.execute_script("arguments[0].scrollIntoView();",
                                  load_button)
            load_button.click()

    def press_open_replies_if_present(self, driver):
        """
        -- This method is only meant to be used in this file --
        Expand every 'more replies in this thread' link, then wait for the
        plugin's loading spinners to disappear.
        """
        span_show_more_replies = driver.find_elements_by_xpath(
            "//*[contains(text(), 'more replies in this thread') or contains(text(), 'more reply in this thread')]"
        )
        for span_tag in span_show_more_replies:
            # Navigate one level up to the anchor tag
            anchor_clickable = span_tag.find_element_by_xpath('..')
            driver.execute_script("arguments[0].scrollIntoView();",
                                  anchor_clickable)
            anchor_clickable.click()
        # Wait until all loading spans are gone.
        # The presence of them means that the plugin is loading the comments.
        timeout = 10
        try:
            element = WebDriverWait(driver, timeout).until(
                EC.invisibility_of_element_located(
                    (By.XPATH, "//span[@aria-valuetext='Loading...']")))
        except TimeoutException:
            print("Timed out waiting for element to disappear")

    def extract_comment_data(self, comment_section_soup, article):
        """Find this person's comments in the soup and insert each into the DB.

        Walks the Facebook plugin DOM: the actor-name div's parent span has
        siblings holding the comment text and the likes/timestamp row.
        """
        comment_divs = comment_section_soup.find_all(
            class_='UFICommentActorName')
        for comment_div in comment_divs:
            # Get commenter name and compare it with the person we are
            # searching for.
            commenter_name = comment_div.text
            if (commenter_name == self.person):
                print('This is ', self.person)
                person_dict = {}
                # Traverse to the parent span so we can reach the sibling divs.
                parent_span = comment_div.parent
                # Next sibling of the parent span holds the comment text.
                comment_sibling_div = parent_span.find_next_sibling()
                # print(comment_sibling_div)
                comment_text = comment_sibling_div.text
                # Next sibling again: div with the comment link, likes and time.
                like_time_sibling_div = comment_sibling_div.find_next_sibling()
                # print('Hey', like_time_sibling_div.prettify())
                # If an <i> tag exists in a child, that child carries the likes.
                likes = ''
                for child in like_time_sibling_div.children:
                    itag = child.find('i')
                    if itag:
                        likes = child.text
                comment_utime = like_time_sibling_div.find(
                    "abbr", {
                        "class": "UFISutroCommentTimestamp"
                    }).get('data-utime')
                comment_timestamp = self.utime_to_timespamp(comment_utime)
                person_dict['name'] = commenter_name
                person_dict['text'] = comment_text
                person_dict['article'] = article
                person_dict['likes'] = likes
                person_dict['comment_timestamp'] = comment_timestamp
                self.db.insert_comment(person_dict)

    def utime_to_timespamp(self, utime):
        """Convert a Unix-time string to 'YYYY-MM-DD HH:MM:SS' (local time).

        NOTE(review): method name has a typo ('timespamp'); renaming would
        touch callers, so it is only flagged here. Also assumes the module
        was imported as `import datetime` (datetime.datetime.fromtimestamp).
        """
        return datetime.datetime.fromtimestamp(
            int(utime)).strftime('%Y-%m-%d %H:%M:%S')

    def __repr__(self):
        # NOTE(review): formats the init-page soup, not the person, despite
        # the label -- confirm whether self.person was intended here.
        return "Search person: %s" % (self.init_soup)
def __init__(self, config):
    """Cache the Airflow base URL, Kafka properties, and a DB connector.

    config: dict with (at least) 'airflow' and 'kafka' sections; the kafka
    section is translated by _get_properties.
    """
    self._airflow = config['airflow']
    self._properties = self._get_properties(config)
    self._db = DatabaseConnector(config)
class Consumer:
    """Consumes Kafka records and triggers the matching Airflow DAG for each one.

    *config* must provide an 'airflow' base URL and a 'kafka' section with at
    least 'topic', 'host', 'groupid' and 'protocol' keys (plus the
    truststore/keystore keys when the protocol is 'ssl').
    """

    def __init__(self, config):
        self._airflow = config['airflow']
        self._properties = self._get_properties(config)
        self._db = DatabaseConnector(config)

    def run(self) -> None:
        """Entry point: block forever consuming from Kafka."""
        self._consume_from_kafka()

    def _consume_from_kafka(self) -> None:
        """Poll the topic and forward each record's value to Airflow."""
        # todo: handle ssl parameters
        consumer_options = dict(self._properties['consumer'])
        # BUG FIX: the topic is a positional argument of KafkaConsumer, not a
        # constructor keyword -- the previous 'topics' entry would have been
        # rejected by KafkaConsumer(**...). It is popped out of the kwargs and
        # passed positionally instead.
        topic = consumer_options.pop('topic')
        consumer = KafkaConsumer(topic, **consumer_options)
        # todo: logging consumer initialization
        for record in consumer:
            print(f'Got next message: {record}')
            # todo: logging "locally record.value input"
            self._db.log_to_db('INFO', record.value, '')
            self.http_post(record.value)

    def http_post(self, data: str) -> None:
        """Trigger the Airflow DAG mapped to this message's type and log the result."""
        # todo: logging "locally" -> f'Triggering DAG with the following data: {data}'
        dag_name = self._db.get_config_from_db(self.get_message_type(data=data))
        airflow_url = f'{self._airflow}/api/experimental/dags/{dag_name}/dag_runs'
        # NOTE(review): an empty payload is posted even though the header says
        # JSON -- confirm whether the DAG run should receive *data* as conf.
        response = requests.post(
            url=airflow_url,
            data={},
            headers={
                'Content-Type': 'application/json',
                'Cache-Control': 'no-cache'
            })
        if response.ok:
            # todo: logging "locally" -> "Triggered Airflow DAG successfully"
            self._db.log_to_db('INFO', data, '')
        else:
            # todo: logging "locally" -> "Error while triggering Airflow DAG"
            self._db.log_to_db('ERROR', data,
                               'Error while triggering Airflow DAG')

    @staticmethod
    def get_message_type(data: str) -> str:
        """Extract the 'message_type' field from a JSON-encoded payload."""
        # BUG FIX: json.loads() was previously called without the payload
        # argument, which raised TypeError on every message.
        message_type = json.loads(data)['message_type']
        # todo: add "locally" logging -> f"Got message_type from kafka: {message_type}"
        return message_type

    @staticmethod
    def _get_properties(config):
        """Build the consumer (and optional SSL) settings from *config*."""
        properties = {
            'consumer': {
                # Topic to subscribe to; popped out before KafkaConsumer(...) is built.
                'topic': config['kafka']['topic'],
                'bootstrap_servers': config['kafka']['host'],
                'group_id': config['kafka']['groupid'],
                'key_deserializer': None,
                'value_deserializer': None,
                'enable_auto_commit': True,
                'auto_offset_reset': 'latest'
            },
        }
        if config['kafka']['protocol'].lower() == 'ssl':
            # BUG FIX: a stray trailing comma previously made this the tuple
            # ('SSL',) instead of the string 'SSL'.
            properties['common_client'] = 'SSL'
            # NOTE(review): these 'ssl_*' names do not match kafka-python's
            # consumer config options -- confirm how this section is consumed.
            properties['ssl'] = {
                'ssl_endpoint_identification_algorithm': '',
                'ssl_truststore_location': config['kafka']['truststoreLocation'],
                'ssl_keystore_location': config['kafka']['keystoreLocation'],
                'ssl_truststore_password': config['kafka']['truststorePassword'],
                'ssl_keystore_password': config['kafka']['keystorePassword'],
                'ssl_key_password': config['kafka']['keyPassword']
            }
        return properties
    # Tail of the JSON-schema dict opened earlier in the file (its start is
    # not visible in this chunk): remaining "properties" entries plus the
    # schema-level constraints.
    },
    "password": {
        "type": "string",
    },
    "spotifyUsername": {
        "type": "string",
    },
    },
    "required": ["email", "password"],
    "additionalProperties": False
}

# Create instance of DatabaseConnector
databaseConnection = DatabaseConnector.DatabaseConnector()

# Create the Flask application and tell it where to look to serve HTML files
application = Flask(__name__,
                    template_folder='react-frontend/templates',
                    static_folder='react-frontend/static')

# Prepare the mongo instance (URI comes from the database connector)
application.config["MONGO_URI"] = databaseConnection.getURI()
application.config['JWT_ACCESS_TOKEN_EXPIRES'] = datetime.timedelta(days=1)
application.config['PROPAGATE_EXCEPTIONS'] = True
# NOTE(review): hard-coded SECRET_KEY checked into source control -- move it
# to an environment variable / external config.
application.config['SECRET_KEY'] = "'\xe9\xa5'"

# Create the Mongo object with our Flask application
mongo = PyMongo(application)
flask_bcrypt = Bcrypt(application)
jwt = JWTManager(application)