def main(): """Manages overall detection process, from beginning to end. This includes: - Read settings - Initialize processing - Error handling""" # read settings. No file passed implies default settings settings = SettingReader.SettingReader(None) # print settings.values['Global']['startingpoint'] # initialize processing current_position = initialize(settings) # deduce steps that will have to be executed steps_to_execute = structure[structure.index(current_position):] # execute steps for step in steps_to_execute: helpers.write_to_log(settings=settings, line=step) print '### ' + step r = globals()[step[3:]](settings, structure, debug) if r != 0: break return 0
def get_html_content_by_link(link):
    print("HERE", link)  # debug output
    try:
        response = urllib.request.urlopen(link)
        return response.read()
    except Exception as e:
        helpers.write_to_log("Catch from parser.get_html_content_by_link()\n" + str(e))
def get_json_content(html, article):
    try:
        soup = bs4.BeautifulSoup(html, features='lxml')
        all_jsons = soup.find_all('script', type='application/ld+json')
        for item in all_jsons:
            # Convert the block from a string to a JSON object.
            item = json.loads(item.text)
            # A NewsArticle block contains the title, description,
            # keywords, publisher and image list.
            if item['@type'] == "NewsArticle":
                article['title'] = item['headline']
                article['description'] = item['description']
                article['keywords'] = item['keywords']
                article['publisher'] = item['publisher']['name']
                article['original_link'] = item['mainEntityOfPage']['@id']
                article['images'] = item['image'].copy()
            elif item['@type'] == "VideoObject":
                if article['video'] is None:
                    article['video'] = []
                if item['uploadDate'] is not None and item['uploadDate'] != "":
                    article['video'].append({
                        'url': get_video_link(item['contentUrl'], item['embedUrl']),
                        'upload_date': datetime.datetime.strptime(
                            item['uploadDate'], '%Y-%m-%dT%H:%M:%SZ')
                    })
                else:
                    article['video'].append({
                        'url': get_video_link(item['contentUrl'], item['embedUrl']),
                        'upload_date': datetime.datetime.today()
                    })
            else:
                continue
        # If there is more than one video, order them by upload date
        # (oldest first) so they are embedded in the right sequence.
        if len(article['video']) > 1:
            article['video'].sort(key=lambda k: k['upload_date'])
    except Exception as e:
        helpers.write_to_log("Catch from parser.get_json_content()\n" + str(e))
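get_json_content() reads every application/ld+json block on the page. A quick way to see which fields it pulls from a NewsArticle block; the payload values here are illustrative, not real site data:

import json

sample = {
    '@type': 'NewsArticle',
    'headline': 'Example headline',
    'description': 'Example description',
    'keywords': 'news, example',
    'publisher': {'name': 'Example Publisher'},
    'mainEntityOfPage': {'@id': 'https://example.com/article'},
    'image': ['https://example.com/img1.jpg'],
}
html = '<script type="application/ld+json">' + json.dumps(sample) + '</script>'
article = {'title': '', 'description': '', 'keywords': '', 'publisher': '',
           'original_link': '', 'images': [], 'video': []}
get_json_content(html, article)
print(article['title'])   # Example headline
print(article['images'])  # ['https://example.com/img1.jpg']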
def add_link_to_db(link):
    global connection
    is_link_added = check_link(link)
    try:
        cursor = connection.cursor()
        # If the link does not exist yet, add it to the db.
        # A parameterized query avoids SQL injection via the link text.
        if is_link_added is False:
            cursor.execute("INSERT INTO links VALUES(?)", (link,))
            connection.commit()
    except Exception as e:
        helpers.write_to_log("Catch from db.add_link_to_db()\n" + str(e))
        connection.close()
def get_links(xml):
    try:
        soup = bs4.BeautifulSoup(xml, features='xml')
        links = []
        for item in soup.find_all('item'):
            pub_date = datetime.datetime.strptime(item.pubDate.text,
                                                  '%a, %d %b %Y %H:%M:%S %z')
            links.append({"link": item.link.text, "pub_date": pub_date})
        # Oldest first, so articles are posted in publication order.
        links = sorted(links, key=lambda k: k['pub_date'])
        return [link['link'] for link in links if 'link' in link]
    except Exception as e:
        helpers.write_to_log("Catch from parser.get_links()\n" + str(e))
def check_link(link):
    global connection
    is_link_exists = True
    try:
        cursor = connection.cursor()
        # A parameterized query avoids SQL injection via the link text.
        cursor.execute("SELECT link FROM links WHERE link=?", (link,))
        # No row means the link has not been stored yet.
        if cursor.fetchone() is None:
            is_link_exists = False
        return is_link_exists
    except Exception as e:
        helpers.write_to_log("Catch from db.check_link()\n" + str(e))
        connection.close()
def create_connection(db_file):
    global connection
    if connection is None:
        try:
            connection = sqlite3.connect(db_file)
            # Check whether the links table already exists.
            cursor = connection.cursor()
            cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='links';"
            )
            # If the links table does not exist, create it.
            if cursor.fetchone() is None:
                cursor.execute("CREATE TABLE links (link text UNIQUE);")
                connection.commit()
        except Exception as e:
            helpers.write_to_log("Catch from db.create_connection()\n" + str(e))
            connection.close()
            # Reset the global so a later call can reconnect.
            connection = None
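Together, create_connection(), check_link(), and add_link_to_db() form a small deduplication store keyed by link. A usage sketch mirroring the main loop below (the file name and URL are illustrative):

db.create_connection('./vesti_links.db')  # connects and creates the table on first run
link = 'https://example.com/article-1'    # illustrative URL
if db.check_link(link) is False:          # link not seen before
    db.add_link_to_db(link)               # UNIQUE constraint guards against duplicates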
def run_parse(link):
    try:
        article = {
            'title': '',
            'description': '',
            'body': '',
            'keywords': '',
            'video': [],
            'publisher': '',
            'original_link': '',
            'images': []
        }
        # Fetch the page once and feed it to both parsing passes.
        html = get_html_content_by_link(link)
        get_json_content(html, article)
        is_correct_structure = parse_html_content(html, article)
        if is_correct_structure is False:
            return False
        return article
    except Exception as e:
        helpers.write_to_log("Catch from parser.run_parse()\n" + str(e))
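run_parse() is the parser's entry point: it returns the populated article dict on success and False when the page lacks the expected structure. The expected call pattern (the URL is illustrative):

result = parser.run_parse('https://example.com/some-article')
if result is False:
    print('page does not match the expected structure')
else:
    print(result['title'], '-', result['original_link'])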
def parse_html_content(html, article):
    try:
        first_soup = bs4.BeautifulSoup(html, features='lxml')
        soup = bs4.BeautifulSoup(str(first_soup), 'html.parser')
        # If the article container is missing, the page does not have the
        # expected structure.
        div = soup.find('div', class_="text14")
        if div is None:
            return False
        span = div.find('span')
        # Remove all comments from the span.
        for element in span(text=lambda text: isinstance(text, bs4.Comment)):
            element.extract()
        # Wrap naked text nodes in a temporary <time> tag so they survive
        # the tag-based walk below.
        for node in span:
            if isinstance(node, bs4.element.NavigableString):
                tag = soup.new_tag('time')
                tag.append(str(node))
                node.replace_with(tag)
        # Walk every tag carrying text or images.
        all_nodes = span.find_all()
        article['body'] += '<p>' + article["keywords"] + '</p>'
        article['body'] += '<p>' + article["description"] + '</p>'
        for node in all_nodes:
            # Drop unused tags, keeping only images that were listed in
            # the article's JSON-LD image list.
            for not_use_tag in node.find_all(
                    ['script', 'link', 'form', 'label', 'input', 'iframe', 'img']):
                if not_use_tag.name == 'img':
                    if not_use_tag['src'] not in article['images']:
                        not_use_tag.extract()
                else:
                    not_use_tag.extract()
            # Paragraphs.
            if node.name == 'p':
                if node.a:
                    article['body'] += str(node)
                    # Remove the links from the tree so they are not
                    # processed again below.
                    children = node.findChildren('a')
                    for child in children:
                        child.decompose()
                elif node.font:
                    article['body'] += '<b>' + node.font.text + '</b>'
                else:
                    # Skip the site's own channel plug ("Subscribe to the
                    # Vesti Telegram channel").
                    if str(node) != '<p>Подключайтесь к Telegram-каналу "Вестей"</p>':
                        article['body'] += str(node)
            # Lists.
            if node.name == 'ul':
                article['body'] += str(node)
            # Headers.
            elif node.name in ('h1', 'h2', 'h3'):
                if check_title_classes('sf_', node.get_attribute_list(
                        'class')) is False and len(node.findChildren()) == 0:
                    article['body'] += '<h3>' + node.text + '</h3>'
            elif node.name in ('h4', 'h5', 'h6'):
                if check_title_classes('sf_', node.get_attribute_list(
                        'class')) is False and len(node.findChildren()) == 0:
                    article['body'] += '<h4>' + node.text + '</h4>'
            # Images with a full link and a title.
            # (A stricter check would also require node['src'] to be in
            # article['images'].)
            elif (node.name == 'img' and 'https://' in node['src']
                    and node.has_attr('title')):
                img = ('<img style="display: block; margin: 10px 0 0 0" src="'
                       + node["src"] + '" title="' + node['title']
                       + '" width="450"/><br>')
                article['body'] += img
                article['body'] += ('<b style="display: block; margin: 0 0 10px 0">'
                                    + node["title"] + '</b><br>')
            # <time> is the temporary wrapper for naked text from the
            # original html.
            elif node.name == 'time':
                if node.string != u'\xa0':
                    article['body'] += node.text
            # Links.
            elif node.name == 'a':
                if 'bluelink' in node.get_attribute_list('class'):
                    if 'https://t.me/vestyisrael' not in node.get_attribute_list('href'):
                        article['body'] += str(node)
            # Video containers.
            elif node.name == 'div':
                if 'art_video' in node.get_attribute_list('class'):
                    # Embed the next video from the sorted list.
                    if article['video'] is not None:
                        content = article['video'].pop()
                        if 'youtube' in content['url']:
                            article['body'] += ('<iframe width="450" height="450" '
                                                'frameborder="0" src="'
                                                + content['url'] + '"></iframe>')
                        else:
                            article['body'] += ('<video width="450" height="450" '
                                                'frameborder="0" controls>')
                            article['body'] += ('<source src="' + content['url']
                                                + '" type="video/mp4"></video>')
                    else:
                        continue
        # Bottom link to the original source ("Link to the source" in
        # Russian).
        link = 'Ссылка на источник '
        article['body'] += ('<a href="' + article["original_link"] + '">'
                            + link + article['publisher'] + '</a>')
        return True
    except Exception as e:
        helpers.write_to_log(
            "Catch from parser.parse_html_content()\n" + str(e), False)
def get_rss_xml(url):
    try:
        response = urllib.request.urlopen(url)
        return response.read()
    except Exception as e:
        helpers.write_to_log("Catch from parser.get_rss_xml()\n" + str(e))
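get_rss_xml() and get_links() together turn a feed URL into article links ordered oldest-first, which is exactly the news_links list the main loop below iterates over. A sketch with a hypothetical feed URL:

xml = parser.get_rss_xml('https://example.com/rss')  # hypothetical feed URL
news_links = parser.get_links(xml)                   # oldest first, ready for the loop below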
db.create_connection('./vesti_links.db')  # create the connection to the db

for index, link in enumerate(news_links):
    if db.check_link(link) is False:
        # Parse the page once; if the structure is invalid, store the
        # link so it is never retried.
        news_article = parser.run_parse(link)
        if news_article is False:
            db.add_link_to_db(link)
            continue
        current_token = telebot.telegraph_get_token()
        try:
            telebot.init_telegraph(current_token)
            telegraph = telebot.telegraph_create_page({
                'title': news_article['title'],
                'content': news_article['body']
            })
            telebot.send_2_channel(telegraph['url'])
            db.add_link_to_db(link)
            news_article.clear()
            time.sleep(int(os.getenv('DELAY_TIME')))
        except Exception as e:
            helpers.write_to_log("Catch from main\n" + str(e))
    else:
        continue

helpers.write_to_log("Bot finish", send_to_montitor_channel=False)
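The helpers module itself is not part of this excerpt. The parser, db, and bot call sites invoke write_to_log() with a message, an optional second positional flag, and the send_to_montitor_channel keyword (spelling as in the call sites), so a compatible signature might look like this sketch; main() in the first function uses a different, settings-based logger:

def write_to_log(message, send_to_montitor_channel=True):
    # Sketch only: append the message to a local log file and, unless
    # disabled, forward it to the monitoring channel (implementation
    # assumed, not shown in this excerpt).
    with open('bot.log', 'a', encoding='utf-8') as log_file:
        log_file.write(message + '\n')
    if send_to_montitor_channel:
        pass  # e.g. telebot.send_2_channel(message) in the real module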