Example no. 1
0
def webhook_handler():
    """Process one Telegram webhook request.

    Decodes the incoming JSON payload into a ``telegram.Update``, extracts
    the relevant fields from either a regular message or a callback query,
    makes sure the sender has a username, creates or refreshes the matching
    ``User`` entity, and finally hands the text off to the command
    dispatcher.  Non-POST requests are ignored.
    """
    if flask.request.method != "POST":
        # Mirrors the original behavior: anything but POST yields None.
        return None

    # Turn the raw JSON body into a Telegram Update object.
    update = telegram.Update.de_json(flask.request.get_json(force=True), bot)

    if update.message:
        # Regular chat message.
        msg = update.message
        text = msg.text
        user_id = msg.from_user.id
        chat_id = msg.chat_id
        username = msg.from_user.username
        message_id = None
    elif update.callback_query:
        # Inline-keyboard callback.
        query = update.callback_query
        text = query.data
        user_id = query.from_user.id
        chat_id = query.message.chat_id
        username = query.from_user.username
        message_id = query.message.message_id
    else:
        logging.error("Received unknown update!")
        return constants.RESPONSE_OK

    # Interacting with the bot requires a Telegram username.
    if not username:
        bot.sendMessage(chat_id, constants.ERROR_NO_USERNAME)
        return constants.RESPONSE_OK

    # Look the sender up, creating a record on first contact.
    user = User.get_by_id(user_id)
    if not user:
        logging.info("User %s not found! Creating new user...", user_id)
        user = User(id=user_id, chat_id=chat_id, username=username)
        user.put()
    else:
        # Known user: refresh activity timestamp, track username changes.
        user.last_activity_date = datetime.now()
        if username != user.username:
            logging.debug("User %s has changed username from %s to %s",
                          user_id, user.username, username)
            user.username = username
        user.put()

    commands.handle_input(user, text, message_id)

    return constants.RESPONSE_OK
Example no. 2
0
def crawler_comment_thread(requester, queue, index):
    """Worker loop: consume comment-page URLs from *queue*, scrape every
    comment block on each page into Comment/User records and save them,
    then enqueue the next results page when one exists.

    Python 2 code (``file()`` builtin, ``except X, e`` syntax).

    Args:
        requester: object with a ``get(url)`` method returning page HTML.
        queue: Queue of ``(url, retry_count)`` tuples shared by workers.
        index: numeric id of this worker; used in the per-thread log files.
    """
    # Per-thread progress and error logs.
    # NOTE(review): these handles are never closed explicitly -- presumably
    # flushed on process exit; confirm that is acceptable.
    fout = file('./tmp/comment.part.%d' % index, 'w')
    ferr = file('./tmp/comment.err.part.%d' % index, 'w')
    failure_count = 1
    while not queue.empty():
        try:
            # Block up to 30 seconds for the next (url, retry_count) item.
            url, count = queue.get(True, 30)
            if count == 10:
                # Retried too many times: record the URL and drop it.
                log('10:' + url + '\n', ferr)
                continue
            html = requester.get(url)
            log('%d:%d:%s' % (queue.qsize(), index, url), fout)
            page = H.document_fromstring(html)
            # The shop id is embedded in the URL itself.
            shop_id = shop_id_pattern.search(url).group()
            comment_list_node = page.xpath(COMMENT_LIST_XPATH)
            for comment_block in comment_list_node:
                comment = Comment()
                user = User()
                comment.comment_id = comment_block.xpath(COMMENT_ID_XPATH)[0]
                comment.shop_id = shop_id
                user.user_id = comment_block.xpath(USER_ID_XPATH)[0]
                comment.user_id = user.user_id
                username_node = comment_block.xpath(USER_NAME_XPATH)
                if len(username_node) < 1:
                    # Anonymous or malformed block: skip this comment.
                    log('no username:%s' % url, ferr)
                    continue
                user.username = comment_block.xpath(
                    USER_NAME_XPATH)[0].text_content()
                score_node = comment_block.xpath(SCORE_XPATH)
                # Star rating: take the number out of the second space-
                # separated token and divide by 10; 0 when no score node.
                # NOTE(review): the .split(' ') implies SCORE_XPATH yields
                # strings (attribute values), not elements -- confirm.
                comment.star = int(
                    number_pattern.search(
                        comment_block.xpath(SCORE_XPATH)[0].split(' ')
                        [1]).group()) / 10 if len(score_node) > 0 else 0
                average_node = comment_block.xpath(AVERAGE_PER_XPATH)
                # Average spend per person; 0 when absent.
                comment.average = number_pattern.search(
                    average_node[0].text_content()).group(
                    ) if len(average_node) > 0 else 0
                # Date text is split on the UTF-8 non-breaking space
                # (b'\xc2\xa0'); only the part before it is parsed.
                comment.date = fill_date(
                    str(
                        comment_block.xpath(DATE_XPATH)[0].text_content().
                        encode('utf-8')).split('\xc2\xa0')[0], DATE_FORMAT)
                content_extra_node = comment_block.xpath(CONTENT_EXTRA_XPATH)
                # Prefer the expanded ("extra") content when it exists.
                if len(content_extra_node) > 0:
                    comment.content = content_extra_node[0].text_content(
                    ).strip()
                else:
                    comment.content = comment_block.xpath(
                        CONTENT_XPATH)[0].text_content().strip()
                other_score_node = comment_block.xpath(OTHER_SCORE_XPATH)
                # Sub-scores default to 0 unless the matching tag appears.
                comment.taste_score = 0
                comment.envir_score = 0
                comment.service_score = 0
                for each_node in other_score_node:
                    # The character at index 2 of the node text is taken as
                    # the score -- assumes a single-digit value after a
                    # two-character tag; TODO confirm against the markup.
                    if TASTE_TAG in each_node.text_content():
                        comment.taste_score = each_node.text_content()[2]
                    elif ENVIR_TAG in each_node.text_content():
                        comment.envir_score = each_node.text_content()[2]
                    elif SERVICE_TAG in each_node.text_content():
                        comment.service_score = each_node.text_content()[2]
                #has_other_score = len(other_score_node) > 0
                #comment.taste_score = number_pattern.search(other_score_node[0].text_content()).group() if has_other_score else 0
                #comment.envir_score = number_pattern.search(other_score_node[1].text_content()).group() if has_other_score else 0
                #comment.service_score = number_pattern.search(other_score_node[2].text_content()).group() if has_other_score else 0
                user.save()
                comment.save()
            next_page_node = page.xpath(COMMENT_NEXT_XPATH)
            if len(next_page_node) > 0:
                # Enqueue the next results page with retry count reset to 1.
                # NOTE(review): next_page_node[0] is concatenated to a
                # string below, so COMMENT_NEXT_XPATH presumably yields a
                # string page number, not an element -- confirm.
                pageno = next_page_node[0]
                comment_url_prefix = pageno_pattern.sub('', url)
                next_url = comment_url_prefix + pageno
                queue.put((next_url, 1))
            # Page handled successfully: reset the backoff multiplier.
            failure_count = 1
        except Empty, e:
            # queue.get timed out: assume no more work for this thread.
            log('%d:Empty' % index, fout)
            break
        except urllib2.HTTPError, e:
            if e.code != 404:
                # Non-404 error (e.g. 403 Forbidden): requeue the same
                # (url, count) and back off linearly before retrying.
                queue.put((url, count))
                sleep(10 * failure_count)
                failure_count += 1
                if failure_count == 10:
                    log('%d:403:%s' % (index, url), fout)
            else:
                log('%d:404:error:%s' % (index, url), fout)
Example no. 3
0
def crawler_comment_thread(requester, queue, index):
    """Crawler worker: pull comment-page URLs off *queue*, parse each
    comment on the page into User/Comment objects, persist them, and push
    the follow-up results page back onto the queue when present.

    Python 2 code (``file()`` builtin, ``except X, e`` syntax).

    Args:
        requester: object exposing ``get(url)`` that returns the page HTML.
        queue: Queue of ``(url, retry_count)`` tuples shared by workers.
        index: this worker's numeric id, used to name its log files.
    """
    # Per-thread progress/error logs.
    # NOTE(review): never closed explicitly -- presumably left to process
    # exit; verify this is intended.
    fout = file('./tmp/comment.part.%d' % index, 'w')
    ferr = file('./tmp/comment.err.part.%d' % index, 'w')
    failure_count = 1
    while not queue.empty():
        try:
            # Wait up to 30 seconds for the next work item.
            url, count = queue.get(True, 30)
            if count == 10:
                # URL exhausted its retries: log and discard.
                log('10:' + url + '\n', ferr)
                continue
            html = requester.get(url)
            log('%d:%d:%s' % (queue.qsize(), index, url), fout)
            page = H.document_fromstring(html)
            # Shop id comes straight out of the URL.
            shop_id = shop_id_pattern.search(url).group()
            comment_list_node = page.xpath(COMMENT_LIST_XPATH)
            for comment_block in comment_list_node:
                comment = Comment()
                user = User()
                comment.comment_id = comment_block.xpath(COMMENT_ID_XPATH)[0]
                comment.shop_id = shop_id
                user.user_id = comment_block.xpath(USER_ID_XPATH)[0]
                comment.user_id = user.user_id
                username_node = comment_block.xpath(USER_NAME_XPATH)
                if len(username_node) < 1:
                    # No username node: skip this comment entirely.
                    log('no username:%s' % url, ferr)
                    continue
                user.username = comment_block.xpath(USER_NAME_XPATH)[0].text_content()
                score_node = comment_block.xpath(SCORE_XPATH)
                # Star rating from the second space-separated token, /10;
                # defaults to 0 when the node is missing.
                # NOTE(review): .split(' ') implies the xpath returns
                # strings (attribute values) rather than elements -- confirm.
                comment.star = int(number_pattern.search(comment_block.xpath(SCORE_XPATH)[0].split(' ')[1]).group()) / 10 if len(score_node) > 0 else 0
                average_node = comment_block.xpath(AVERAGE_PER_XPATH)
                # Average spend per person; 0 when absent.
                comment.average = number_pattern.search(average_node[0].text_content()).group() if len(average_node) > 0 else 0
                # Date text is cut at the UTF-8 non-breaking space
                # (b'\xc2\xa0'); the leading part is parsed via fill_date.
                comment.date = fill_date(str(comment_block.xpath(DATE_XPATH)[0].text_content().encode('utf-8')).split('\xc2\xa0')[0], DATE_FORMAT)
                content_extra_node = comment_block.xpath(CONTENT_EXTRA_XPATH)
                # Prefer the expanded ("extra") content when available.
                if len(content_extra_node) > 0:
                    comment.content = content_extra_node[0].text_content().strip()
                else:
                    comment.content = comment_block.xpath(CONTENT_XPATH)[0].text_content().strip()
                other_score_node = comment_block.xpath(OTHER_SCORE_XPATH)
                # Sub-scores default to 0 unless the matching tag shows up.
                comment.taste_score = 0
                comment.envir_score = 0
                comment.service_score = 0
                for each_node in other_score_node:
                    # Index 2 of the node text is used as the score --
                    # assumes a two-character tag followed by one digit;
                    # TODO confirm against the actual markup.
                    if TASTE_TAG in each_node.text_content():
                        comment.taste_score = each_node.text_content()[2]
                    elif ENVIR_TAG in each_node.text_content():
                        comment.envir_score = each_node.text_content()[2]
                    elif SERVICE_TAG in each_node.text_content():
                        comment.service_score = each_node.text_content()[2]
                #has_other_score = len(other_score_node) > 0
                #comment.taste_score = number_pattern.search(other_score_node[0].text_content()).group() if has_other_score else 0
                #comment.envir_score = number_pattern.search(other_score_node[1].text_content()).group() if has_other_score else 0
                #comment.service_score = number_pattern.search(other_score_node[2].text_content()).group() if has_other_score else 0
                user.save()
                comment.save()
            next_page_node = page.xpath(COMMENT_NEXT_XPATH)
            if len(next_page_node) > 0:
                # Push the next results page with retry count reset to 1.
                # NOTE(review): next_page_node[0] is string-concatenated
                # below, so this xpath presumably returns a page-number
                # string, not an element -- confirm.
                pageno = next_page_node[0]
                comment_url_prefix = pageno_pattern.sub('', url)
                next_url = comment_url_prefix + pageno
                queue.put((next_url, 1))
            # Success: reset the linear backoff multiplier.
            failure_count = 1
        except Empty,e:
            # queue.get timed out: treat as end of work for this thread.
            log('%d:Empty' % index, fout)
            break
        except urllib2.HTTPError,e:
            if e.code != 404:
                # Non-404 (e.g. 403 Forbidden): requeue the same item and
                # back off linearly before the next attempt.
                queue.put((url, count))
                sleep(10 * failure_count)
                failure_count += 1
                if failure_count == 10:
                    log('%d:403:%s' % (index, url), fout)
            else:
                log('%d:404:error:%s' % (index, url), fout)