def fetch_articles_list(boardName, boardId, page):
    """Fetch one page of a board's article index.

    Parameters:
        boardName: board identifier used in the URL (e.g. 'Python').
        boardId: numeric board id, used as the prefix of each article '_id'.
        page: page number of the board index.

    Returns:
        A list of article dicts (possibly empty). Returns [] on request
        failure so callers can iterate the result unconditionally.
    """
    url = config.base_url + '/bbsdoc.php?board=' + boardName + '&ftype=0&page=' + str(page)
    html = newsm_common.request_get(url, 'GB18030', 20, 10)
    if html is None:
        logger.error('URL request failed: ' + url)
        # Return an empty list instead of None so `for article in ...`
        # in fetch_new_articles does not raise TypeError.
        return []
    # Each article entry looks like:
    # c.o(1,1,'loury','m ',985656622,'[公告]同意开设"Python/Python语言"看版 (转载) ',0,0,0);
    # groups: id, parent id, author, flag, timestamp, title, size, ?, ?
    pattern = re.compile(
        r'c\.o\((\d+),(\d+),\'([^\']+)\',\'([^\']+)\',(\d+),\'([^\']+)\',(\d+),(\d+),(\d+)\)')

    articles = []
    for line in pattern.findall(html):
        articles.append({
            '_id': str(boardId) + '.' + line[0],
            'title': line[5].strip(),
            'parent_id': str(boardId) + '.' + line[1],
            'author': line[2].strip(),
            'size': int(line[6]),
            'flag': line[3].strip(),
            'board_name': boardName.strip(),
            'board_id': boardId,
            'created_at': int(line[4])
        })
    return articles
# (removed paste artifact "Exemple #2 / 0" left over from scraping)
def browseBoard(name, id, sectionId):
    """Recursively list the boards inside a board folder.

    Parameters:
        name: folder/board name used in the URL.
        id: numeric id of this folder, stored as 'parent_id' of children.
        sectionId: id of the enclosing section, copied onto every board.

    Returns:
        A flat list of board dicts, including boards found in nested
        folders. Returns [] on request failure.
    """
    url = config.base_url + '/bbsdoc.php?board=' + name
    content = newsm_common.request_get(url, 'GB18030', 20, 10)
    if content is None:
        # request_get returns None on failure; original code crashed here.
        logger.error('URL request failed: ' + url)
        return []
    content = content.replace(u'\u3000', u'')
    boards = []
    # Board entries:
    # o.o(is_folder, group, id, ?, '[section]', 'name', 'chinese name',
    #     'moderators', post_count, ?, online)
    for line in re.compile(r'o\.o\([^\)]*\)').findall(content):
        match = re.match(
            r'o\.o\((true|false),(\d+),(\d+),(\d+),\'\[([^\]]*)\]\',\s*\'([^\']+)\',\s*\'([^\']+)\',\s*\'([^\']*)\',(\d+),(\d+),(\d+)\)',
            line)
        if match is None:
            # Unexpected o.o(...) shape; skip it instead of crashing on
            # match.group() below.
            continue
        board = {
            '_id': int(match.group(3)),
            'name': match.group(6),
            'name2': match.group(7),
            'moderators': match.group(8),
            'section_name': match.group(5),
            'unkown1': int(match.group(4)),
            'unkown2': int(match.group(10)),
            'online': int(match.group(11)),
            'is_folder': match.group(1),
            'post_count': int(match.group(9)),
            'section_id': sectionId,
            'parent_id': id,
        }
        boards.append(board)
        # Folders contain further boards; recurse into them.
        if board['is_folder'] == 'true':
            boards.extend(browseBoard(board['name'], board['_id'], sectionId))
    return boards
# (removed paste artifact "Exemple #3 / 0" left over from scraping)
def browseSection(id):
    """Recursively list all boards reachable from a section.

    Walks the section's sub-sections (o.f entries) and boards (o.o
    entries), descending into sub-sections and board folders.

    Parameters:
        id: numeric section id to browse.

    Returns:
        A flat list of board dicts. Returns [] on request failure.
    """
    url = config.base_url + '/bbsfav.php?select=' + str(id) + '&x'
    content = newsm_common.request_get(url, 'GB18030', 20, 10)
    if content is None:
        # request_get returns None on failure; original code crashed here.
        logger.error('URL request failed: ' + url)
        return []
    content = content.replace(u'\u3000', u'')
    boards = []

    # Sub-sections: o.f(id,'shortname description',rank,'name')
    for line in re.compile(r'o\.f\([^\)]*\)').findall(content):
        match = re.match(
            r'o\.f\((\d+),\'([^\s]*)\s+(.*)\',(\d+),\'(.*)\'\)', line)
        if match is None:
            # Unexpected o.f(...) shape; skip instead of crashing.
            continue
        section = {
            'parent_id': id,
            '_id': int(match.group(1)),
            'name': match.group(5),
            'name2': match.group(2),
            'desc': match.group(3),
            'rank': int(match.group(4)),
        }
        # Descend into the sub-section and collect its boards.
        boards.extend(browseSection(section['_id']))

    # Boards:
    # o.o(false,1,1161,23473,'[清华]','CECM.THU','清华土木建管','ghostzb',21767,0,1);
    # is_folder, group, id, ?, section name, board name, Chinese name,
    # moderators (may be empty), post count, ?, online count
    for line in re.compile(r'o\.o\([^\)]*\)').findall(content):
        match = re.match(
            r'o\.o\((true|false),(\d+),(\d+),(\d+),\'\[([^\]]*)\]\',\s*\'([^\']+)\',\s*\'([^\']+)\',\s*\'([^\']*)\',(\d+),(\d+),(\d+)\)',
            line)
        if match is None:
            # Unexpected o.o(...) shape; skip instead of crashing.
            continue
        board = {
            '_id': int(match.group(3)),
            'name': match.group(6),
            'name2': match.group(7),
            'moderators': match.group(8),
            'section_name': match.group(5),
            'unkown1': int(match.group(4)),
            'unkown2': int(match.group(10)),
            'online': int(match.group(11)),
            'is_folder': match.group(1),
            'post_count': int(match.group(9)),
            'section_id': id,
            'parent_id': 0,
        }
        boards.append(board)
        # Folders contain further boards; recurse via browseBoard.
        if board['is_folder'] == 'true':
            boards.extend(browseBoard(board['name'], board['_id'], id))
    return boards
def fetch_new_articles(board, start_page=0):
    """Crawl a board from its newest page backwards and store new articles.

    Parameters:
        board: dict with at least 'name' (board identifier).
        start_page: when > 0 and below the real page count, start crawling
            at this page instead (used to resume a partial crawl).

    Side effects:
        Saves new/updated articles into the 'article_<boardId>' Mongo
        collection and updates their authors via newsm_user. Stops early
        once more than 30 already-known or empty articles were seen.
    """
    url = config.base_url + '/bbsdoc.php?board=' + board['name']
    html = newsm_common.request_get(url, 'GB18030', 20, 10)
    if html is None:
        logger.error('    URL request failed: ' + url)
        return

    # Board metadata line, e.g.:
    # docWriter('Python',284,96499,0,0,3218,96528,'/groups/comp.faq/Python',1,1);
    # re.escape is required: board names may contain regex metacharacters
    # such as '.' (e.g. 'CECM.THU').
    result = re.compile(
        r'docWriter\(\'' + re.escape(board['name']) +
        r'\',(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),\'([^\']+)\',(\d+),(\d+)\)').search(html)
    if result is None:
        logger.error('Not matched')
        return
    pages = int(result.group(5))
    boardId = int(result.group(1))

    tb_article = config.mongo_db['article_' + str(boardId)]
    skipped_count = 0
    new_articles = 0
    logger.info('=== {}, {}'.format(boardId, board['name']))

    if start_page > 0 and start_page < pages:
        pages = start_page
    # Walk from the newest page backwards. NOTE(review): range(pages, 1, -1)
    # never visits page 1 — presumably intentional; confirm against the
    # site's page numbering.
    for page in range(pages, 1, -1):
        logger.info('    {}, {}, P{}'.format(boardId, board['name'], page))
        articles = fetch_articles_list(board['name'], boardId, page)
        if not articles:
            # Request failure (None/[]) — nothing to do for this page.
            continue
        for article in articles:
            dummy = tb_article.find_one({'_id': article['_id']})
            # Re-fetch when unknown or previously stored without content.
            if dummy is None or dummy.get('content', '') == '':
                # Fetch the rest of the article's fields (content, ip, ...).
                fetch_article(article)
                if article.get('content', '') != '':
                    # NOTE(review): Collection.save was removed in PyMongo 4;
                    # kept as-is for compatibility with the current driver.
                    tb_article.save(article)
                    new_articles += 1
                    # Add or update the article's author.
                    if ('author' in article) and (not article['author'] == ''):
                        newsm_user.update_user(article['author'])
                else:
                    skipped_count += 1
            else:
                skipped_count += 1

        # Many consecutive known articles: assume older pages are stored.
        if skipped_count > 30:
            break

    logger.info('    New articles: {}'.format(new_articles))
def fetch_article(article):
    """Fetch one article's body and fill in its remaining fields.

    Mutates `article` in place: sets 'content', 'ip', 'updated_at' and
    'attachments' (and cleans 'title'). On any failure 'content' is set
    to '' so callers can always test article['content'].
    """
    article['title'] = newsm_common.remove_emoji(article['title'].strip())

    # '_id' is '<board_id>.<article_id>'; strip the board prefix.
    realId = str(article['_id'])[len(str(article['board_id'])) + 1:]
    url = config.base_url + '/bbscon.php?bid=' + str(article['board_id']) + '&id=' + realId
    html = newsm_common.request_get(url, 'GB18030', 20, 10)
    if html is None:
        logger.error('    URL request failed: ' + url)
        # Set the same defaults as the "not matched" branch so the caller
        # never sees a partially-populated article (original code returned
        # without setting 'content', crashing fetch_new_articles).
        article['content'] = ''
        article['updated_at'] = int(time.time())
        article['attachments'] = []
        return
    # Body shapes:
    # prints('发信人:  [FROM: 60.191.227.*]\r[m\n');o.h(0);o.t();
    # prints('发信人:  ...');attach('test.zip', 4227, 2059);o.h(0);o.t();
    result = re.compile(
        r'(prints\(\'(.*)\'\);(attach\(\'([^\']+)\',\s*(\d+),\s*(\d+)\);){0,}o\.h\(0\);o\.t\(\);)').search(html)
    if result is None:
        logger.debug('    Not matched: {}'.format(html))
        article['content'] = ''
        article['updated_at'] = int(time.time())
        article['attachments'] = []
        return
    article['content'] = result.group(2)
    # Unescape the JavaScript string literal and strip ANSI color codes.
    article['content'] = re.sub(r'\\n', '\n', article['content'])
    article['content'] = re.sub(r'\\r\[[;\d]{0,8}m', '', article['content'])
    article['content'] = re.sub(r'\\(/|"|\')', r'\1', article['content'])
    article['content'] = newsm_common.remove_emoji(article['content'])

    # extract_ip_from_article already returns None when no IP is found,
    # so no if/else is needed.
    article['ip'] = extract_ip_from_article(article['content'])

    article['updated_at'] = int(time.time())
    article['attachments'] = []
    # If there are attachments
    if result.group(3) is not None:
        # group(3) only keeps the LAST repetition; re-scan group(1) to
        # collect every attach(...) occurrence.
        attachments = re.compile(
            r'attach\(\'([^\']+)\',\s*(\d+),\s*(\d+)\);').findall(result.group(1))
        for line in attachments:
            article['attachments'].append({
                'name': line[0].strip(),
                'size': int(line[1]),
                'id': int(line[2])
            })
# (removed paste artifact "Exemple #6 / 0" left over from scraping)
def fetch_user(name):
    """Fetch a user's profile page and parse it into a user dict.

    Parameters:
        name: user id; surrounding whitespace is stripped.

    Returns:
        A user dict, or None on request/parse failure. For a non-existent
        user a placeholder dict (nick '用户不存在') with a 240-hour
        'next_update' is returned so the user is not re-queried soon.
    """
    name = name.strip()
    url = config.base_url + '/bbsqry.php?userid=' + name
    html = newsm_common.request_get(url, 'GB18030', 20, 10)
    if html is None:
        logger.error('URL request failed: ' + url)
        return None
    # The site renders '<tr><td>该用户不存在</td></tr>' for unknown users.
    result = re.compile('<tr><td>该用户不存在</td></tr>').search(html)
    if result is not None:
        user = {
            '_id': name.lower(),
            'name': name,
            'nick': '用户不存在',
            'logins': 0,
            'posts': 0,
            'last_login': '',
            'ip': '',
            'last_active': '',
            'life': 0,
            'title': '',
            'updated_at': int(time.time()),
            # Re-check non-existent users only after 240 hours.
            'next_update': int(time.time()) + 3600 * 240
        }
        return user

    # The profile text lives in the first <pre> block.
    result = re.compile(r'<pre>\s*([\s\S]*)\s*</pre>').search(html)
    if result is None:
        logger.error('    Not matched: {}'.format(html))
        return None
    # groups: name, nick, logins, posts, last login time, last IP,
    # offline time, mailbox (unused), life, title
    result2 = re.compile(
        r'([^(]+)\(([\s\S]*)\) 共上站 (\d+) 次,发表过 (\d+) 篇文章\s+上次在\s+\[(.*)\] 从 \[(.*)\] 到本站一游。(?:积分: \[\d+\])?\s+离线时间\s*\[(.*)\] 信箱: \[(.*)\] 生命力: \[(-?\d+)\] 身份: \[(.*)\]。'
    ).search(result.group(1))
    if result2 is None:
        logger.error('Not matched(2)')
        logger.debug(result.group())
        return None
    user = {
        '_id': result2.group(1).strip().lower(),
        'name': result2.group(1).strip(),
        'nick': result2.group(2).strip(),
        'logins': int(result2.group(3)),
        'posts': int(result2.group(4)),
        'last_login': result2.group(5),
        'ip': result2.group(6),
        'last_active': result2.group(7),
        'life': int(result2.group(9)),
        'title': result2.group(10),
        'updated_at': int(time.time()),
        # Regular users are refreshed after 72 hours.
        'next_update': int(time.time()) + 3600 * 72
    }

    # Optional signature: prints('...') inside the 'dp1' script block.
    result3 = re.compile(r'\'dp1\'\);\s+prints\(\'(.*)\'\);\/\/-->').search(
        html)
    if result3 is not None:
        user['signature'] = result3.group(1).strip()
        # Unescape the JavaScript string and strip ANSI color codes,
        # mirroring the cleanup done for article content.
        user['signature'] = re.sub(r'\\n', '\n', user['signature'])
        user['signature'] = re.sub(r'\\r\[[;\d]{0,12}m', '', user['signature'])
        user['signature'] = re.sub(r'\\(/|"|\')', r'\1', user['signature'])
        user['signature'] = user['signature'].strip()
    else:
        user['signature'] = None

    return user