def _fetch_article(self, url):
        """
        Fetch the article at ``url``, clean it and store it.

        The article is parsed with ``self._parse_article``. When parsing
        yields no text, ``eatiht.extract`` is tried as a fallback. The article
        is dropped (logged at INFO level, nothing stored) when the fallback
        fails or when it ends up with neither a title nor a text.

        Stored articles are cleaned with ``self._clean_dict``, tagged with
        ``src`` and ``type`` and inserted into both the ``articles`` and the
        ``timeline_items`` collections.

        Args:
            url (str): url of the article

        Returns:
            None
        """
        article = self._parse_article(url)

        if not article['text']:
            try:
                article['text'] = eatiht.extract(url)
            except Exception:
                # Best-effort fallback: any extraction error drops the
                # article. ``Exception`` (rather than a bare ``except``)
                # lets KeyboardInterrupt / SystemExit propagate.
                logger.info('%s dropped', url)
                return

        # Only drop when BOTH title and text are missing.
        if not (article['title'] or article['text']):
            logger.info('%s dropped', url)
            return

        article = self._clean_dict(article)

        article['src'] = self._src
        article['type'] = 'article'
        self._db_interface.insert_one('articles', article)
        self._db_interface.insert_one('timeline_items', article)
Exemple #2
0
def extract_by_eatiht(html):
    """Extract the main text of ``html`` using eatiht.

    The markup is UTF-8 encoded and passed to ``eatiht.extract`` as an
    in-memory byte stream; the result goes through ``ensure_unicode``.
    When an ``IndexError`` is raised, the placeholder ``'!!ERROR!!'`` is
    used as the body instead.

    Args:
        html: markup to extract from (must support ``.encode('utf-8')``).

    Returns:
        dict: single-key mapping ``{'body': <text or '!!ERROR!!'>}``.
    """
    stream = BytesIO(html.encode('utf-8'))
    try:
        extracted = eatiht.extract(stream)
        text = ensure_unicode(extracted)
    except IndexError:
        text = '!!ERROR!!'

    result = {'body': text}
    return result
Exemple #3
0
def alt_extract(url):
    """Return the text of ``url`` with whitespace normalised.

    The text is requested through ``AlchemyAPI`` first; when the response
    status is ``'ERROR'`` the text comes from ``eatiht.extract`` instead.
    Each line of the result is stripped, runs of whitespace are collapsed
    to a single space, and lines are joined with a blank line between them.

    Args:
        url: address of the page to extract.

    Returns:
        str: the normalised text.
    """
    api = AlchemyAPI(Client(''))
    response = api.text('url', url)

    if response['status'] == 'ERROR':
        raw_text = eatiht.extract(url)
    else:
        raw_text = response.get('text')

    normalised = []
    for line in raw_text.split('\n'):
        normalised.append(re.sub(r'\s+', ' ', line.strip()))
    return '\n\n'.join(normalised)
Exemple #4
0
def alt_extract(url):
    """Extract the text of ``url`` and tidy up its whitespace.

    ``AlchemyAPI`` is asked for the text; if its response carries the
    status ``'ERROR'``, ``eatiht.extract`` is used as the fallback source.
    Every line is stripped and has internal whitespace runs squeezed to one
    space; the lines are then joined with ``'\\n\\n'``.

    Args:
        url: address of the page to extract.

    Returns:
        str: the cleaned-up text.
    """
    alchemy = AlchemyAPI(Client(''))
    reply = alchemy.text('url', url)
    failed = reply['status'] == 'ERROR'
    body = eatiht.extract(url) if failed else reply.get('text')

    def _squeeze(line):
        # Strip the line, then collapse each whitespace run to one space.
        return re.sub(r'\s+', ' ', line.strip())

    return '\n\n'.join(_squeeze(piece) for piece in body.split('\n'))
Exemple #5
0
def try_eatiht():
    """Print what each eatiht extractor variant produces for ``ARTICLE``.

    Three sections are printed in order: the ``etv2`` tree (after
    ``bootstrapify()``) as an HTML string, the ``v2`` extraction, and the
    original ``eatiht`` extraction.
    """
    print("===EATIHT V2===")
    html_tree = etv2.extract(ARTICLE)
    html_tree.bootstrapify()
    rendered = html_tree.get_html_string()
    print(rendered)

    print("===V2===")
    v2_result = v2.extract(ARTICLE)
    print(v2_result)

    print("===V1===")
    v1_result = eatiht.extract(ARTICLE)
    print(v1_result)
def readHTML(url,db):
    """Extract and clean the text of the page at ``url``.

    The text is obtained with ``eatiht.extract`` and passed through
    ``clean_html``. On any extraction/cleaning error the failure is printed
    and the matching document in ``db.newswebdump`` (looked up by ``link``)
    is flagged with ``crawled=False`` and ``meta="ignore"``.

    Args:
        url: address of the page to read.
        db: database handle exposing a ``newswebdump`` collection with an
            ``update(filter, update)`` method.

    Returns:
        tuple: ``(text, True)`` on success, ``("", False)`` on failure.
    """
    try:
        text = clean_html(eatiht.extract(url))
        return (text,True)
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed while crawling.
        print("problem with loading ",url)
        db.newswebdump.update({"link": url}, {"$set": {"crawled": False, "meta": "ignore"}})

    return ("",False)
Exemple #7
0
def bestMethod(url, req):
    """Extract page text with several methods and keep the longest result.

    Three extractions are attempted: ``eatiht`` on ``url``, the
    ``ArticleExtractor`` of ``Extractor`` on ``url``, and ``tryContent`` on
    ``req``. The first two are best-effort (a failure counts as empty text);
    ``tryContent`` is not guarded, so its errors propagate to the caller.

    Args:
        url: address of the page; URLs ending in ``.jpg``, ``.pdf``,
            ``.mp3`` or ``.mp4`` are skipped.
        req: value handed to ``tryContent`` for the manual extraction.

    Returns:
        tuple: ``(text, method)`` where ``method`` is ``"eat"``, ``"pipe"``
        or ``"manual"`` for the longest text (the earliest wins on ties),
        or ``("", "format")`` for a skipped file format.
    """
    if url.endswith((".jpg", ".pdf", ".mp3", ".mp4")):
        # Log the URL that was actually checked; the previous code referenced
        # ``claim_source_url``, which is not defined in this function.
        logError("FORMAT: " + url + "\n")
        return ("", "format")

    eat = ''
    pipe = ''
    try:
        eat = eatiht.extract(url).replace("\n", "")
    except Exception:
        # Best effort: a failing extractor simply contributes no text.
        pass
    try:
        pipe = Extractor(extractor='ArticleExtractor',
                         url=url).getText().replace("\n", "")
    except Exception:
        # Best effort: a failing extractor simply contributes no text.
        pass
    manual = tryContent(req)

    return max([(eat, "eat"), (pipe, "pipe"), (manual, "manual")],
               key=lambda x: len(x[0]))
def handle(msg):
    """
    Just do the actions for each command listed in the conditions.
    We have to refactor it puting all commands at a tuple or list, etc.
    The bot username will be dinamically alterable, too.
    """
    global bot_names
    command = ''
    print msg
    content_type, chat_type, chat_id = telepot.glance2(msg)
    print content_type, chat_type, chat_id
    if content_type == 'text':
        command = utf8_encode(msg['text'].lower())

    name = utf8_encode(msg['from']['first_name'])

    print 'Got command: %s' % command

    names_to_check = verify_text(bot_names, command)
    print names_to_check

    if names_to_check or chat_type == 'private':
        command = remove_bot_name(names_to_check, command)
        print command

        if command.lower() in morning_words:
            bot.sendMessage(chat_id, "Good morning, {}!".format(name))
        elif command.lower() in night_words:
            bot.sendMessage(chat_id, "Good night, {}!".format(name))
        elif command == 'que dia é hoje?':
            day = datetime.datetime.now().day
            month = months.get(calendar.month_name[datetime.datetime.now().month], '')
            bot.sendMessage(
                chat_id, "É dia de você calar essa boca. \n\nBrincadeira, hoje é dia {day} de {month} \U0001f605".format(day=day, month=month))
        elif command == 'bem vindo!':
            if welcome_count < 1:
                msg_wel = "Eu sempre estive aqui, idiota! {}".format(Emoji.NEUTRAL_FACE)
            elif welcome_count == 1:
                msg_wel = "Não repito. {}".format(Emoji.NEUTRAL_FACE)
            else:
                msg_wel = "{}".format(Emoji.NEUTRAL_FACE)
            bot.sendMessage(chat_id, msg_wel)
            globals()['welcome_count'] += 1
            print globals()['welcome_count']
        elif command.lower() in love_words:
            msgs = [
                "Eu tambem amo vc, {}! {}{}".format(name,
                                                    Emoji.BLACK_HEART_SUIT,
                                                    Emoji.BLACK_HEART_SUIT),
                "Legal.",
            ]
            msg = random.choice(msgs)
            bot.sendMessage(chat_id, msg)
            if msg == "Legal.":
                bot.sendChatAction(chat_id, 'upload_document')
                bot.sendDocument(chat_id, "BQADBAADdwMAAgMdZAdPtWmOPGN1IQI")
        elif command.lower() == 'que horas são?':
            msg = "É muita hipocrisia da sua parte me perguntar isso {}... "\
                  "Você pode vizualisar facilmente as horas olhando para parte "\
                  "inferior direita do seu comentário."
            bot.sendMessage(chat_id, msg.format(name))
            bot.sendChatAction(chat_id, 'upload_document')
            bot.sendDocument(chat_id, "BQADAQADEwADnqxzCGp0fqkzsPC6Ag")
        elif command.lower() == 'nós te amamos!':
            msg = "Ah é?! Foda-se."
            bot.sendMessage(chat_id, msg)
            bot.sendChatAction(chat_id, 'upload_document')
            bot.sendDocument(chat_id, "BQADBAADYwMAAiUcZAe1DjlP-IMGhQI")
        elif command.lower() == 'é bininu binina ou binunu binino?'.lower():
            msg = "bininu."
            bot.sendMessage(chat_id, msg)
        elif command.lower() == 'qual sua idade?':
            msg = "Você sabe a idade de Deus, seu criador? Pois é, sou 1 ano mais novo que Ele."
            bot.sendMessage(chat_id, msg)
        elif command.lower() == '/emojis':
            msg1, msg2, msg3, msg4, msg5 = get_all_emojis()
            bot.sendMessage(chat_id, msg1)
            bot.sendMessage(chat_id, msg2)
            bot.sendMessage(chat_id, msg3)
            bot.sendMessage(chat_id, msg4)
            bot.sendMessage(chat_id, msg5)
        elif command.lower() in fuck_words:
            msg = [
                "Querido, por favor! Tenha boas maneiras! Você tem que me convidar pra jantar primeiro.",
                "Entre na fila.",
                "Sonhando novamente, querido?",
                "Se sentindo sozinho de novo, ha?",
                "É só eu ou você diz isso para todos?",
                "Sério? Agora?",
                "Não obrigado. Eu passo.",
            ]
            bot.sendMessage(chat_id, random.choice(msg))
            # bot.sendChatAction(chat_id, 'upload_document')
            # bot.sendDocument(chat_id, "BQADBAADdwMAAgMdZAdPtWmOPGN1IQI")
        elif verify_text(command.lower().split(), 'trans'):
            msg = command.lower().replace('trans ', '')
            if msg.split()[0] == 'pt':
                msg = msg.replace("pt ", "", 1)
                print msg
                os.system('echo "{}" | trans -b -o ~/output.txt :pt'.format(msg))
            else:
                os.system('echo "{}" | trans -b -o ~/output.txt :en'.format(msg))
            with open(os.environ['HOME'] + '/output.txt', 'r') as content_file:
                content = content_file.read()
            bot.sendMessage(chat_id, content)
        elif equals_text(quote_words, command):
            msg = get_quotes(db, bot, chat_id)
            bot.sendMessage(chat_id, re.sub(' +', ' ', msg.replace('.', ',')))
        # jokes
        elif equals_text(joke_words, command):
            jokes = my_shuffle(Joke(db).get_jokes())
            bot.sendMessage(chat_id, random.choice(jokes))
        elif verify_text(lyrics_words, command):
            lyrics, status = get_lyrics(command)
            bot.sendMessage(chat_id, lyrics, parse_mode='Markdown')
        elif verify_text(extractor_words, command):
            url = command.split()[1]
            msg = v2.extract(url)
            try:
                bot.sendMessage(chat_id, msg)
            except BadHTTPResponse:
                msg = ("*Infelizmente o texto é muito grande e excedeu nosso limite."
                       " Por favor tente extrair textos um pouco menores.*")
                bot.sendMessage(chat_id, msg, parse_mode='Markdown')
        else:
            cnt_ed = count_ed_mgs(db)
            cnt_simsimi = count_simsimi_msg(db)
            on_the_music_group_id = -82861655
            los_primos_group_id = -16994629
            shit_group = -78912892
            fucked_list_group = [los_primos_group_id, on_the_music_group_id, shit_group]
            limit_ed = 5 if chat_id == sminino_group_id else 1 if chat_id in fuck_list_group else 3
            limit_simsimi = 2 if chat_id == sminino_group_id else 8 if chat_id in fuck_list_group else 3
            print 'limit_ed', limit_ed
            print 'limit_simsimi', limit_simsimi
            if cnt_ed < limit_ed:
                response, sim_status, robot_name = get_ed_reply(command)
            elif cnt_simsimi < limit_simsimi:
                response, sim_status, robot_name = get_simsimi_reply(command)
            else:
                response, ed_status, robot_name = get_ed_reply(command)
                if ed_status != 200:
                    response, sim_status, robot_name = get_simsimi_reply(command)
                q = {'qty_answed_message': 0}
                db.set('ed_info', q)
                db.set('simsimi_info', q)
                db.dump()

            if verify_text(['Fui criado e program', 'O meu inventor'], response):
                developed_by_texts = db.get('developed_by')
                olds = [utf8_encode(text) for text in developed_by_texts['old']]
                news = [utf8_encode(text) for text in developed_by_texts['new']]
                if 'Fui criado e program' in response:
                    response = response.replace(
                            olds[0],
                            news[0])
                    response += ' {}'.format(Emoji.GRINNING_FACE)
                if 'O meu inventor' in response:
                    response = response.replace(
                            olds[1],
                            news[1])
            info_sent = bot.sendMessage(chat_id, response)
            if info_sent:
                print robot_name
                try:
                    count_msg = db.get('{}_info'.format(robot_name))['qty_answed_message']
                    count_msg += 1
                except:
                    count_msg = 0
                q = {'qty_answed_message': count_msg}
                db.set('{}_info'.format(robot_name), q)
                db.dump()
    elif verify_text(command.lower().split(), 'kkk'*15):
            msgs = [
                'hahaha',
                'kkkk',
            ]
            bot.sendMessage(chat_id, random.choice(msgs))
def handle(msg):
    """
    Just do the actions for each command listed in the conditions.
    We have to refactor it puting all commands at a tuple or list, etc.
    The bot username will be dinamically alterable, too.
    """
    global bot_names
    command = ''
    print msg
    content_type, chat_type, chat_id = telepot.glance2(msg)
    print content_type, chat_type, chat_id
    if content_type == 'text':
        command = utf8_encode(msg['text'].lower())

    name = utf8_encode(msg['from']['first_name'])

    print 'Got command: %s' % command

    names_to_check = verify_text(bot_names, command)
    print names_to_check

    if names_to_check or chat_type == 'private':
        command = remove_bot_name(names_to_check, command)
        print command

        if command.lower() in morning_words:
            bot.sendMessage(chat_id, "Good morning, {}!".format(name))
        elif command.lower() in night_words:
            bot.sendMessage(chat_id, "Good night, {}!".format(name))
        elif command == 'que dia é hoje?':
            day = datetime.datetime.now().day
            month = months.get(calendar.month_name[datetime.datetime.now().month], '')
            bot.sendMessage(
                chat_id, "É dia de você calar essa boca. \n\nBrincadeira, hoje é dia {day} de {month} \U0001f605".format(day=day, month=month))
        elif command == 'bem vindo!':
            if welcome_count < 1:
                msg_wel = "Eu sempre estive aqui, idiota! {}".format(Emoji.NEUTRAL_FACE)
            elif welcome_count == 1:
                msg_wel = "Não repito. {}".format(Emoji.NEUTRAL_FACE)
            else:
                msg_wel = "{}".format(Emoji.NEUTRAL_FACE)
            bot.sendMessage(chat_id, msg_wel)
            globals()['welcome_count'] += 1
            print globals()['welcome_count']
        elif command.lower() in love_words:
            msgs = [
                "Eu tambem amo vc, {}! {}{}".format(name,
                                                    Emoji.BLACK_HEART_SUIT,
                                                    Emoji.BLACK_HEART_SUIT),
                "Legal.",
            ]
            msg = random.choice(msgs)
            bot.sendMessage(chat_id, msg)
            if msg == "Legal.":
                bot.sendChatAction(chat_id, 'upload_document')
                bot.sendDocument(chat_id, "BQADBAADdwMAAgMdZAdPtWmOPGN1IQI")
        elif command.lower() == 'que horas são?':
            msg = "É muita hipocrisia da sua parte me perguntar isso {}... "\
                  "Você pode vizualisar facilmente as horas olhando para parte "\
                  "inferior direita do seu comentário."
            bot.sendMessage(chat_id, msg.format(name))
            bot.sendChatAction(chat_id, 'upload_document')
            bot.sendDocument(chat_id, "BQADAQADEwADnqxzCGp0fqkzsPC6Ag")
        elif command.lower() == 'nós te amamos!':
            msg = "Ah é?! Foda-se."
            bot.sendMessage(chat_id, msg)
            bot.sendChatAction(chat_id, 'upload_document')
            bot.sendDocument(chat_id, "BQADBAADYwMAAiUcZAe1DjlP-IMGhQI")
        elif command.lower() == 'é bininu binina ou binunu binino?'.lower():
            msg = "bininu."
            bot.sendMessage(chat_id, msg)
        elif command.lower() == 'qual sua idade?':
            msg = "Você sabe a idade de Deus, seu criador? Pois é, sou 1 ano mais novo que Ele."
            bot.sendMessage(chat_id, msg)
        elif command.lower() == '/emojis':
            msg1, msg2, msg3, msg4, msg5 = get_all_emojis()
            bot.sendMessage(chat_id, msg1)
            bot.sendMessage(chat_id, msg2)
            bot.sendMessage(chat_id, msg3)
            bot.sendMessage(chat_id, msg4)
            bot.sendMessage(chat_id, msg5)
        elif command.lower() in fuck_words:
            msg = [
                "Querido, por favor! Tenha boas maneiras! Você tem que me convidar pra jantar primeiro.",
                "Entre na fila.",
                "Sonhando novamente, querido?",
                "Se sentindo sozinho de novo, ha?",
                "É só eu ou você diz isso para todos?",
                "Sério? Agora?",
                "Não obrigado. Eu passo.",
            ]
            bot.sendMessage(chat_id, random.choice(msg))
            # bot.sendChatAction(chat_id, 'upload_document')
            # bot.sendDocument(chat_id, "BQADBAADdwMAAgMdZAdPtWmOPGN1IQI")
        elif verify_text(command.lower().split(), 'trans'):
            msg = command.lower().replace('trans ', '')
            if msg.split()[0] == 'pt':
                msg = msg.replace("pt ", "", 1)
                print msg
                os.system('echo "{}" | trans -b -o ~/output.txt :pt'.format(msg))
            else:
                os.system('echo "{}" | trans -b -o ~/output.txt :en'.format(msg))
            with open(os.environ['HOME'] + '/output.txt', 'r') as content_file:
                content = content_file.read()
            bot.sendMessage(chat_id, content)
        elif equals_text(quote_words, command):
            msg = get_quotes(db, bot, chat_id)
            bot.sendMessage(chat_id, re.sub(' +', ' ', msg.replace('.', ',')))
        # jokes
        elif equals_text(joke_words, command):
            jokes = my_shuffle(Joke(db).get_jokes())
            bot.sendMessage(chat_id, random.choice(jokes))
        elif verify_text(lyrics_words, command):
            lyrics, status = get_lyrics(command)
            bot.sendMessage(chat_id, lyrics, parse_mode='Markdown')
        elif verify_text(extractor_words, command):
            url = command.split()[1]
            msg = v2.extract(url)
            try:
                bot.sendMessage(chat_id, msg)
            except BadHTTPResponse:
                msg = ("*Infelizmente o texto é muito grande e excedeu nosso limite."
                       " Por favor tente extrair textos um pouco menores.*")
                bot.sendMessage(chat_id, msg, parse_mode='Markdown')
        else:
            cnt_ed = count_ed_mgs(db)
            cnt_simsimi = count_simsimi_msg(db)
            on_the_music_group_id = -82861655
            los_primos_group_id = -16994629
            shit_group = -78912892
            fucked_list_group = [los_primos_group_id, on_the_music_group_id, shit_group]
            limit_ed = 5 if chat_id == sminino_group_id else 1 if chat_id in fuck_list_group else 3
            limit_simsimi = 2 if chat_id == sminino_group_id else 8 if chat_id in fuck_list_group else 3
            print 'limit_ed', limit_ed
            print 'limit_simsimi', limit_simsimi
            if cnt_ed < limit_ed:
                response, sim_status, robot_name = get_ed_reply(command)
            elif cnt_simsimi < limit_simsimi:
                response, sim_status, robot_name = get_simsimi_reply(command)
            else:
                response, ed_status, robot_name = get_ed_reply(command)
                if ed_status != 200:
                    response, sim_status, robot_name = get_simsimi_reply(command)
                q = {'qty_answed_message': 0}
                db.set('ed_info', q)
                db.set('simsimi_info', q)
                db.dump()

            if verify_text(['Fui criado e program', 'O meu inventor'], response):
                developed_by_texts = db.get('developed_by')
                olds = [utf8_encode(text) for text in developed_by_texts['old']]
                news = [utf8_encode(text) for text in developed_by_texts['new']]
                if 'Fui criado e program' in response:
                    response = response.replace(
                            olds[0],
                            news[0])
                    response += ' {}'.format(Emoji.GRINNING_FACE)
                if 'O meu inventor' in response:
                    response = response.replace(
                            olds[1],
                            news[1])
            info_sent = bot.sendMessage(chat_id, response)
            if info_sent:
                print robot_name
                try:
                    count_msg = db.get('{}_info'.format(robot_name))['qty_answed_message']
                    count_msg += 1
                except:
                    count_msg = 0
                q = {'qty_answed_message': count_msg}
                db.set('{}_info'.format(robot_name), q)
                db.dump()
    elif verify_text(command.lower().split(), 'kkk'*15):
            msgs = [
                'hahaha',
                'kkkk',
            ]
            bot.sendMessage(chat_id, random.choice(msgs))
Exemple #10
0
 def scrape(self):
     """Print ``self.url`` and store eatiht's extraction of it in ``self.text``."""
     target = self.url
     print(target)
     extracted = eatiht.extract(target)
     self.text = extracted
def extract_by_eatiht(html):
    """Extract the main text of ``html`` using eatiht.

    The markup is UTF-8 encoded, wrapped in an in-memory byte stream and
    passed to ``eatiht.extract``; the result goes through ``ensure_unicode``.
    Errors raised by the extractor are not handled here.

    Args:
        html: markup to extract from (must support ``.encode('utf-8')``).

    Returns:
        dict: single-key mapping ``{'body': <extracted text>}``.
    """
    encoded = html.encode('utf-8')
    stream = BytesIO(encoded)
    extracted = eatiht.extract(stream)
    body = ensure_unicode(extracted)
    return {'body': body}