Example #1
def main():
    """Manage the overall detection process from beginning to end:

    - read the settings
    - initialize processing
    - execute the remaining steps, with error handling
    """

    # Read the settings; passing None selects the defaults.
    settings = SettingReader.SettingReader(None)
    # print(settings.values['Global']['startingpoint'])

    # Initialize processing and find out where to resume from.
    current_position = initialize(settings)

    # Deduce the steps that still have to be executed; 'structure' is
    # presumably the module-level list of step names.
    steps_to_execute = structure[structure.index(current_position):]

    # Execute the steps, stopping at the first one that fails.
    for step in steps_to_execute:
        helpers.write_to_log(settings=settings, line=step)
        print('### ' + step)
        # The 3-character prefix is stripped to look the step's function
        # up by name in the module globals.
        r = globals()[step[3:]](settings, structure, debug)
        if r != 0:
            break

    return 0
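
The globals() lookup above dispatches each step to a function of the same name once the numeric prefix is stripped. A self-contained sketch of that pattern, with made-up step names and a dummy signature, assuming nothing from the original project:

structure = ['01_prepare', '02_detect']  # hypothetical step names

def prepare(settings, structure, debug):
    print('preparing')
    return 0

def detect(settings, structure, debug):
    print('detecting')
    return 0

for step in structure:
    # '01_' is three characters, so step[3:] is the function name.
    r = globals()[step[3:]](None, structure, False)
    if r != 0:
        break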
Example #2
def get_html_content_by_link(link):
    print("HERE", link)
    try:
        response = urllib.request.urlopen(link)
        return response.read()
    except Exception as e:
        helpers.write_to_log("Catch from parser.get_html_content_by_link()\n" +
                             str(e))
Example #3
def get_json_content(html, article):
    try:
        soup = bs4.BeautifulSoup(html, features='lxml')
        all_jsons = soup.find_all('script', type='application/ld+json')

        for item in all_jsons:
            # Parse the JSON-LD payload from text; .get() below avoids a
            # KeyError when a block has no @type.
            item = json.loads(item.text)

            # A NewsArticle block carries the title, description, keywords,
            # publisher, canonical link, and image list.
            if item.get('@type') == "NewsArticle":
                article['title'] = item['headline']
                article['description'] = item['description']
                article['keywords'] = item['keywords']
                article['publisher'] = item['publisher']['name']
                article['original_link'] = item['mainEntityOfPage']['@id']
                article['images'] = item['image'].copy()

            elif item.get('@type') == "VideoObject":
                if article['video'] is None:
                    article['video'] = []

                # Fall back to today's date when uploadDate is missing or
                # empty.
                if item.get('uploadDate'):
                    upload_date = datetime.datetime.strptime(
                        item['uploadDate'], '%Y-%m-%dT%H:%M:%SZ')
                else:
                    upload_date = datetime.datetime.today()

                article['video'].append({
                    'url': get_video_link(item['contentUrl'],
                                          item['embedUrl']),
                    'upload_date': upload_date
                })
            else:
                continue

        # With more than one video, order them by upload date (oldest
        # first).
        if len(article['video']) > 1:
            article['video'].sort(key=lambda k: k['upload_date'])
    except Exception as e:
        helpers.write_to_log("Catch from parser.get_json_content()\n" + str(e))
Example #4
def add_link_to_db(link):
    global connection
    is_link_added = check_link(link)
    try:
        cursor = connection.cursor()

        # If the link is not in the db yet, insert it; the parameterized
        # query avoids SQL injection from the link text.
        if is_link_added is False:
            cursor.execute("INSERT INTO links VALUES (?)", (link,))
            connection.commit()
    except Exception as e:
        helpers.write_to_log("Catch from db.add_link_to_db()\n" + str(e))
        connection.close()
Example #5
def get_links(xml):
    try:
        soup = bs4.BeautifulSoup(xml, features='xml')
        links = []
        for item in soup.find_all('item'):
            # RSS dates look like "Mon, 01 Jan 2024 10:00:00 +0000".
            pub_date = datetime.datetime.strptime(item.pubDate.text,
                                                  '%a, %d %b %Y %H:%M:%S %z')
            links.append({"link": item.link.text, "pub_date": pub_date})

        # Oldest first, so articles get posted in publication order.
        links = sorted(links, key=lambda k: k['pub_date'])

        return [link['link'] for link in links]
    except Exception as e:
        helpers.write_to_log("Catch from parser.get_links()\n" + str(e))
Example #6
def check_link(link):
    global connection
    is_link_exists = True
    try:
        cursor = connection.cursor()
        # Parameterized query avoids SQL injection from the link text.
        cursor.execute("SELECT link FROM links WHERE link=?", (link,))

        # No row means the link has not been stored yet.
        if cursor.fetchone() is None:
            is_link_exists = False
        return is_link_exists

    except Exception as e:
        helpers.write_to_log("Catch from db.check_link()\n" + str(e))
        connection.close()
Example #7
def create_connection(db_file):
    global connection
    if connection is None:
        try:
            connection = sqlite3.connect(db_file)

            # Check whether the links table already exists.
            cursor = connection.cursor()
            cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='links';"
            )

            # Create it on first run.
            if cursor.fetchone() is None:
                cursor.execute("CREATE TABLE links (link text UNIQUE);")
                connection.commit()

        except Exception as e:
            helpers.write_to_log("Catch from db.create_connection()\n" +
                                 str(e))
            connection.close()
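
SQLite can perform the table-existence check itself, so an equivalent but shorter variant (a sketch, not the project's code) would replace the sqlite_master lookup with CREATE TABLE IF NOT EXISTS:

import sqlite3

def create_connection_simple(db_file):
    conn = sqlite3.connect(db_file)
    # IF NOT EXISTS makes the manual sqlite_master check unnecessary.
    conn.execute("CREATE TABLE IF NOT EXISTS links (link TEXT UNIQUE);")
    conn.commit()
    return conn

With the original helpers and their module-level connection = None global in place, the whole trio can also be exercised against a throwaway in-memory database:

create_connection(':memory:')
add_link_to_db('https://example.com/a')
print(check_link('https://example.com/a'))  # True
print(check_link('https://example.com/b'))  # False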
Example #8
def run_parse(link):
    try:
        article = {
            'title': '',
            'description': '',
            'body': '',
            'keywords': '',
            'video': [],
            'publisher': '',
            'original_link': '',
            'images': []
        }
        # Fetch the page once and reuse it for both passes instead of
        # downloading it twice.
        html = get_html_content_by_link(link)
        get_json_content(html, article)
        is_correct_structure = parse_html_content(html, article)

        if is_correct_structure is False:
            return False
        else:
            return article
    except Exception as e:
        helpers.write_to_log("Catch from parser.run_parse()\n" + str(e))
Example #9
def parse_html_content(html, article):
    try:
        # Parse with lxml first to normalize the markup, then re-parse
        # the result with html.parser.
        first_soup = bs4.BeautifulSoup(html, features='lxml')
        soup = bs4.BeautifulSoup(str(first_soup), 'html.parser')

        # The article body lives in a div with class "text14"; bail out
        # if the page does not have the expected structure.
        div = soup.find('div', class_="text14")
        if div is None:
            return False

        span = div.find('span')

        # Remove all HTML comments from the span.
        for element in span(text=lambda text: isinstance(text, bs4.Comment)):
            element.extract()

        # Wrap naked text nodes in a temporary <time> tag so they survive
        # the tag-based walk below.
        for node in span:
            if isinstance(node, bs4.element.NavigableString):
                tag = soup.new_tag('time')
                tag.append(str(node))
                node.replace_with(tag)

        # Collect the tags carrying text and images.
        all_nodes = span.find_all()

        article['body'] += '<p>' + article["keywords"] + '</p>'
        article['body'] += '<p>' + article["description"] + '</p>'

        for node in all_nodes:
            # Drop embedded tags we do not use; images are kept only when
            # they were already collected from the JSON-LD block.
            for not_use_tag in node.find_all(
                ['script', 'link', 'form', 'label', 'input', 'iframe', 'img']):
                if not_use_tag.name == 'img':
                    if not_use_tag.get('src') not in article['images']:
                        not_use_tag.extract()
                else:
                    not_use_tag.extract()

            # Paragraphs: append first, then strip the links so they are
            # not appended again as separate <a> nodes.
            if node.name == 'p':
                if node.a:
                    article['body'] += str(node)
                    for child in node.findChildren('a'):
                        child.decompose()
                elif node.font:
                    article['body'] += '<b>' + node.font.text + '</b>'
                # Skip the 'Join the "Vesti" Telegram channel' promo line.
                elif str(node) != '<p>Подключайтесь к Telegram-каналу "Вестей"</p>':
                    article['body'] += str(node)

            elif node.name == 'ul':
                article['body'] += str(node)

            # Headings: skip site furniture (classes prefixed 'sf_') and
            # headings that contain child tags.
            elif node.name in ('h1', 'h2', 'h3'):
                if check_title_classes('sf_', node.get_attribute_list(
                        'class')) is False and len(node.findChildren()) == 0:
                    article['body'] += '<h3>' + node.text + '</h3>'

            elif node.name in ('h4', 'h5', 'h6'):
                if check_title_classes('sf_', node.get_attribute_list(
                        'class')) is False and len(node.findChildren()) == 0:
                    article['body'] += '<h4>' + node.text + '</h4>'

            # Images with an absolute src and a title are re-embedded with
            # the title as a caption.
            elif (node.name == 'img' and 'https://' in node.get('src', '')
                    and node.has_attr('title')):
                article['body'] += ('<img style="display: block; margin: 10px 0 0 0" src="'
                                    + node['src'] + '" title="' + node['title']
                                    + '" width="450"/><br>')
                article['body'] += ('<b style="display: block; margin: 0 0 10px 0">'
                                    + node['title'] + '</b><br>')

            # <time> is the temporary wrapper for naked text; skip
            # non-breaking-space placeholders.
            elif node.name == 'time':
                if node.string != u'\xa0':
                    article['body'] += node.text

            # Anchors: keep only 'bluelink' ones, except the site's own
            # Telegram promo link.
            elif node.name == 'a':
                if ('bluelink' in node.get_attribute_list('class')
                        and 'https://t.me/vestyisrael'
                        not in node.get_attribute_list('href')):
                    article['body'] += str(node)

            # Video containers: embed the next collected video as a
            # YouTube iframe or a plain <video> tag.
            elif node.name == 'div':
                if 'art_video' in node.get_attribute_list('class'):
                    # Guard against an empty list; pop() on [] would raise
                    # and abort the whole parse.
                    if article['video']:
                        content = article['video'].pop()
                        if 'youtube' in content['url']:
                            article['body'] += ('<iframe width="450" height="450" frameborder="0" src="'
                                                + content['url'] + '"></iframe>')
                        else:
                            article['body'] += '<video width="450" height="450" frameborder="0" controls>'
                            article['body'] += ('<source src="' + content['url']
                                                + '" type="video/mp4"></video>')

        # Bottom link to the original source; the Russian text reads
        # "Link to the source".
        link = 'Ссылка на источник '
        article['body'] += ('<a href="' + article['original_link'] + '">'
                            + link + article['publisher'] + '</a>')

        # Explicit success marker; run_parse() checks for False.
        return True

    except Exception as e:
        helpers.write_to_log(
            "Catch from parser.parse_html_content()\n" + str(e), False)
Example #10
def get_rss_xml(url):
    try:
        response = urllib.request.urlopen(url)
        return response.read()
    except Exception as e:
        helpers.write_to_log("Catch from parser.get_rss_xml()\n" + str(e))
Example #11
db.create_connection('./vesti_links.db')  # create connection to db
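
# news_links is used below but never defined in this excerpt; given
# get_rss_xml() and get_links() above, it is presumably built along
# these lines (the feed URL is elided in the original):
# news_links = parser.get_links(parser.get_rss_xml(rss_feed_url))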

for link in news_links:
    if db.check_link(link) is False:
        # Parse each link exactly once instead of calling run_parse()
        # twice per link.
        parsed = parser.run_parse(link)

        if parsed is False:
            # Invalid structure: store the link so it is never retried.
            db.add_link_to_db(link)
            continue
        if parsed is None:
            # run_parse() hit an exception; skip without storing so the
            # link can be retried on the next run.
            continue

        news_article = parsed.copy()
        current_token = telebot.telegraph_get_token()
        try:
            telebot.init_telegraph(current_token)
            telegraph = telebot.telegraph_create_page({
                'title': news_article['title'],
                'content': news_article['body']
            })
            telebot.send_2_channel(telegraph['url'])
            db.add_link_to_db(link)
            news_article.clear()
            # Throttle posting; DELAY_TIME comes from the environment.
            time.sleep(int(os.getenv('DELAY_TIME')))
        except Exception as e:
            helpers.write_to_log("Catch from main\n" + str(e))

helpers.write_to_log("Bot finish", send_to_montitor_channel=False)