def bind_news(self, url: str):
    """Bind the news row stored under *url* to this search.

    Looks up the news id by URL, checks that the (search id, news id) pair
    is not already present in ``search_has_news``, and inserts it.

    Args:
        url: URL used to look up the row in the ``news`` table.

    Raises:
        exceptions.UnexpectedBehavior: no id could be read from the lookup
            result for *url*.
        exceptions.InfoAlreadyBinded: the pair already exists in
            ``search_has_news``.
    """
    # NOTE(review): url is interpolated straight into the SQL text. If
    # dbconnection supports bound parameters, switch to them -- confirm
    # against the dbconnection API.
    query = (
        "SELECT id FROM news "
        "WHERE url = '{url}'"
    ).format(url=url)
    result = dbconnection.select(query)
    try:
        news_id = result[0][0]
    except (IndexError, KeyError, TypeError) as err:
        # Only lookup failures (empty, missing or non-indexable result) mean
        # "news not found"; KeyboardInterrupt/SystemExit must propagate.
        raise exceptions.UnexpectedBehavior() from err

    # Refuse to bind the same news to this search twice.
    query = (
        "SELECT * FROM search_has_news "
        "WHERE search_id = {search_id} "
        "AND news_id = {news_id}"
    ).format(search_id=self.id, news_id=news_id)
    if dbconnection.select(query):
        raise exceptions.InfoAlreadyBinded()

    # Record the binding.
    query = (
        "INSERT INTO search_has_news "
        "VALUES ({search_id}, {news_id})"
    ).format(search_id=self.id, news_id=news_id)
    dbconnection.modify(query)
def __add_base_html(self, source: str) -> str:
    """Wrap *source* in a styled ``<head>`` and a ``<body>`` element.

    The charset declared in the head is the ``domain.encoding`` value
    selected through the section/structure joins for ``self.id``.

    Args:
        source: Markup placed between ``<body>`` and ``</body>``.

    Returns:
        The head, opening body tag, *source* and closing body tag
        concatenated in that order.

    Raises:
        exceptions.IncorrectQuery: the encoding query returned nothing.
    """
    # Query for the encoding declared for this structure's domain.
    query = (
        "SELECT domain.encoding from domain "
        "JOIN section ON domain.url = section.domain_url "
        "JOIN structure ON section.url = structure.section_url "
        "WHERE structure.id = {id} "
        "LIMIT 1"
    ).format(id=self.id)
    result = dbconnection.select(query)
    if not result:
        raise exceptions.IncorrectQuery()
    encoding = result[0][0]

    # Doubled braces are literal CSS braces; only {encoding} is substituted.
    # The final tag is "</head>": the head element must be closed before the
    # body starts (it was previously a second opening "<head>").
    head = (
        "<head>\n"
        "<style>\n"
        ".main-content {{text-align: justify; text-indent: 50px;}}\n"
        ".caption {{text-align: center;}}\n"
        "img {{display: block; margin: 0 auto; width: 400px;}}\n"
        "</style>\n"
        "<meta charset='{encoding}'>\n"
        "</head>\n"
    ).format(encoding=encoding)

    body_begin = "<body>\n"
    body_end = "</body>\n"

    # Merge head, body and source.
    return head + body_begin + source + body_end
def __find_section_url(self, url: str, domain_url: str) -> str:
    """Return the first matching ``section.url``, ordered by importance.

    Up to two candidates are extracted from *url* and, together with
    *domain_url*, used in an ``IN (...)`` filter on the ``section`` table.
    Rows are ordered by ``importance DESC`` and the first row's URL is
    returned.

    Args:
        url: URL to extract the candidates from.
        domain_url: Value that is always included in the ``IN`` list.

    Raises:
        exceptions.UnsupportedURL: the query returned no rows.
    """
    # Text between the first "//" and the second "/" that follows it.
    long_match = re.search("//(.*?/.*?)/", url)
    long_candidate = long_match.group(1) if long_match else None

    # Text between the first "//" and the next "/"; the pattern also
    # requires one further "/" later in the URL.
    short_match = re.search("//(.*?)/(.*?)/", url)
    short_candidate = short_match.group(1) if short_match else None

    # Assemble the query; missing or empty candidates are left out.
    parts = ["SELECT url FROM section WHERE url IN ("]
    for candidate in (long_candidate, short_candidate):
        if candidate:
            parts.append("\"" + candidate + "\", ")
    parts.append("\"" + domain_url + "\") ")
    parts.append("ORDER BY importance DESC")

    rows = dbconnection.select("".join(parts))
    if not rows:
        raise exceptions.UnsupportedURL()
    return rows[0][0]
def load_domain(self):
    """Instantiate the domain handler for this section into ``self.domain``.

    Selects the ``domain`` row joined to the section whose URL equals
    ``self.url`` and chooses the handler class from the row's second column.

    Raises:
        exceptions.UnsupportedDomain: the row's second column is not one of
            "Elmundo", "Uol" or "Globo".
        exceptions.UnsupportedSection: the query returned no rows.
    """
    query = (
        "SELECT * FROM domain "
        "JOIN section "
        "ON domain.url = section.domain_url "
        "WHERE section.url = '{url}' "
        "LIMIT 1;"
    ).format(url=self.url)
    result = dbconnection.select(query)

    # Start from None, not the Domain class: a class object is always truthy,
    # which made the "no rows" check below unreachable and left the bare
    # class stored in self.domain.
    domain = None
    for row in result:
        name = row[1]
        if name == "Elmundo":
            domain = DomainElmundo()
        elif name == "Uol":
            domain = DomainUol()
        elif name == "Globo":
            domain = DomainGlobo()
        else:
            raise exceptions.UnsupportedDomain()

    if domain is None:
        raise exceptions.UnsupportedSection()
    self.domain = domain
def news_in_database(self, url: str) -> bool:
    """Report whether the ``news`` table has a row for *url*.

    Args:
        url: URL to look up in the ``news`` table.

    Returns:
        True if the select result is truthy, False otherwise.
    """
    # NOTE(review): url is interpolated straight into the SQL text. If
    # dbconnection supports bound parameters, switch to them -- confirm
    # against the dbconnection API.
    query = (
        "SELECT id FROM news "
        "WHERE url = '{url}'"
    ).format(url=url)
    return bool(dbconnection.select(query))
def load_structures(self):
    """Append a ``Structure`` to ``self.structures`` for each selected row.

    Rows come from ``structure`` joined with ``section``, filtered to the
    section whose URL equals ``self.url``.
    """
    select_prefix = (
        "SELECT * FROM structure "
        "JOIN section "
        "ON section.url = structure.section_url "
        "WHERE section.url = '"
    )
    rows = dbconnection.select(select_prefix + self.url + "';")

    # One Structure per row, appended in result order.
    for record in rows:
        self.structures.append(Structure(row=record))
def __init__(self):
    """Initialise the base class, then copy the 'Elmundo' row onto self.

    The first seven columns of each row returned for
    ``name = 'Elmundo'`` are assigned, in order, to ``url``, ``name``,
    ``connection_timeout``, ``connection_wait``, ``connection_attempts``,
    ``connection_agent`` and ``encoding``.
    """
    super(DomainElmundo, self).__init__()

    rows = dbconnection.select("SELECT * FROM domain WHERE name = 'Elmundo'")

    # Attribute names listed in the column order they are read from.
    attributes = (
        "url",
        "name",
        "connection_timeout",
        "connection_wait",
        "connection_attempts",
        "connection_agent",
        "encoding",
    )
    for row in rows:
        for position, attribute in enumerate(attributes):
            setattr(self, attribute, row[position])
def __storage(self):
    """Move the temporary files into a numbered folder under ``news_dir``.

    The folder number is the highest ``news.id`` plus one, or 1 when the
    table yields no rows. The folder is created if missing and cleared
    otherwise. ``self.dir_html`` is set to the "news\\<number>\\" path.
    """
    # Highest news id currently stored, if any.
    latest = dbconnection.select("SELECT id FROM news ORDER BY id DESC LIMIT 1")
    num_folder = (int(latest[0][0]) + 1) if latest else 1
    folder_name = str(num_folder)

    destiny_folder = news_dir + folder_name + "\\"
    print(destiny_folder)

    # Reuse an existing folder after clearing it; otherwise create it.
    if os.path.exists(destiny_folder):
        utils.clear_folder(destiny_folder)
    else:
        os.makedirs(destiny_folder)

    # Move temporary files to the new folder.
    utils.move_all_folder(temp_dir, destiny_folder)
    self.dir_html = "news\\" + folder_name + "\\"