Example #1
class PttIpAsnCrawler(object):
    def __init__(self, arguments: Dict):
        self.db_input = arguments['database'] or False
        self.ip_list = ('' if self.db_input else arguments['ip_list'])

        config_path = (arguments['config_path'] or 'config.ini')
        self.config = load_config(config_path)
        self.database_config = self.config['Database']

        self._init_database()

        if arguments['verbose']:
            logging.getLogger().setLevel(logging.DEBUG)

    def _init_database(self):
        self.db = PttDatabase(dbtype=self.database_config['Type'],
                              dbname=self.database_config['Name'])
        self.db_session = self.db.get_session()

    def _get_ip_list(self):
        if self.db_input:
            return list(
                map(lambda ipasn: str(ipasn.ip),
                    self.db_session.query(IpAsn).order_by(IpAsn.asn).all()))
        else:
            return self.ip_list.split(',')

    @log('Output_Database')
    def _output_database(self, result: List[Dict[str, str]]):
        self.db.bulk_update(self.db_session, IpAsn, result)

    @log()
    def crawling(self):
        ip_list = self._get_ip_list()

        ip_result = []
        for ip in ip_list:
            if ip:
                net = Net(ip)
                obj = IPASN(net)
                result = {'ip': ip}
                result.update(obj.lookup())
                result['asn_date'] = datetime.strptime(result['asn_date'],
                                                       '%Y-%m-%d')
                ip_result.append(result)

                if len(ip_result) % 100 == 0:
                    self._output_database(ip_result)
                    ip_result = []

        self._output_database(ip_result)
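The crawl loop above leans on the ipwhois package (Net and IPASN). A minimal standalone sketch of the per-IP lookup, run against a single well-known address (network access required; this assumes asn_date comes back in %Y-%m-%d form, as ipwhois documents):

from datetime import datetime

from ipwhois.asn import IPASN
from ipwhois.net import Net

ip = '8.8.8.8'
net = Net(ip)                       # low-level lookup channel for this IP
result = {'ip': ip}
result.update(IPASN(net).lookup())  # adds asn, asn_cidr, asn_date, asn_registry, ...
result['asn_date'] = datetime.strptime(result['asn_date'], '%Y-%m-%d')
print(result['asn'], result['asn_country_code'], result['asn_date'])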
Example #2
    def _init_helper(self, arguments: Dict[str, str]):
        config_path = (arguments['config_path'] or 'config.ini')
        self.config = load_config(config_path)
        self.file_format = ExportFormat[arguments['format']]
        self.output_folder = arguments['output_folder']
        self.output_prefix = arguments['output_prefix']

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        self.db = PttDatabase(dbtype=self.config['Database']['Type'],
                              dbname=self.config['Database']['Name'])
        self.db_session = self.db.get_session()
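load_config() is called throughout these excerpts but never shown. Judging by how its result is indexed (config['Database']['Type'], config['PttUser']['Output'], and so on), a thin configparser wrapper would fit; this is a plausible sketch, not the project's actual implementation:

import configparser

def load_config(config_path: str) -> configparser.ConfigParser:
    """Hypothetical stand-in for the project's load_config()."""
    config = configparser.ConfigParser()
    config.read(config_path, encoding='utf-8')
    return config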
Example #3
    def __init__(self, arguments: Dict[str, str]):
        config_path = (arguments['config_path'] or 'config.ini')

        self.start_date, self.end_date = arguments['date_range']
        self.board_name = arguments['board_name']
        self.file_format = arguments['format']

        self.config = load_config(config_path)
        self.output_folder = arguments['output_folder']
        self.output_prefix = arguments['output_prefix']

        self.db = PttDatabase(dbtype=self.config['Database']['Type'],
                              dbname=self.config['Database']['Name'])
        self.db_session = self.db.get_session()
Example #4
class QueryHelper(object):
    def __init__(self, arguments: Dict[str, str]):
        config_path = (arguments['config_path'] or 'config.ini')

        self.start_date, self.end_date = arguments['date_range']
        self.board_name = arguments['board_name']
        self.file_format = arguments['format']

        self.config = load_config(config_path)
        self.output_folder = arguments['output_folder']
        self.output_prefix = arguments['output_prefix']

        self.db = PttDatabase(dbtype=self.config['Database']['Type'],
                              dbname=self.config['Database']['Name'])
        self.db_session = self.db.get_session()

    @log()
    def _get_export_rows(self):
        rows = [['Type', 'Board', 'Start date',
                 'End date', 'TW Ip', 'Not TW Ip']]

        tw_ip_label = case(value=IpAsn.asn_country_code,
                           whens={'TW': True},
                           else_=False).label("TW_IP")

        article_res = self.db_session.query(Article, ArticleHistory, tw_ip_label) \
            .join(ArticleHistory, ArticleHistory.article_id == Article.id) \
            .join(Board, Board.id == Article.board_id) \
            .order_by(ArticleHistory.id) \
            .group_by(Article.id) \
            .join(IpAsn, IpAsn.ip == Article.post_ip) \
            .filter(Board.name == self.board_name).all()

        article_tw_ip = sum(1 for _, _, tw_ip in article_res if tw_ip)
        article_not_tw_ip = sum(
            1 for _, _, tw_ip in article_res if not tw_ip)
        rows.append(['Article', self.board_name,
                     str(self.start_date or ''), str(self.end_date or ''),
                     article_tw_ip or '0', article_not_tw_ip or '0'])

        article_history_id_list = []
        for res in article_res:
            _, history, _ = res
            article_history_id_list.append(history.id)

        push_res = self.db_session.query(Push, tw_ip_label) \
            .join(IpAsn, IpAsn.ip == Push.push_ip) \
            .filter(Push.article_history_id.in_(article_history_id_list)).all()

        push_tw_ip = sum(1 for _, tw_ip in push_res if tw_ip)
        push_not_tw_ip = sum(1 for _, tw_ip in push_res if not tw_ip)
        rows.append(['Push', self.board_name,
                     str(self.start_date or ''), str(self.end_date or ''),
                     push_tw_ip or '0', push_not_tw_ip or '0'])

        return rows

    def _print_rows(self):
        data = self._get_export_rows()
        for idx, row in enumerate(data):
            print('{:8} | {:16} | {:20} | {:20} | {:5} | {:8}'.format(
                *map(str, row)))
            if idx == 0:
                print(
                    '---------+------------------+----------------------+----------------------+-------+----------')

    def _export_ods(self):
        data = {'Query': self._get_export_rows()}
        output_filename = 'Ptt_query_{export_datetime}'.format(
            export_datetime=datetime.now().strftime('%Y-%m-%d'))
        output_path = os.path.join(
            self.output_folder, '{filename}.ods'.format(filename=output_filename))
        save_data(output_path, data)

    def _export_csv(self):
        data = self._get_export_rows()
        output_filename = 'Ptt_query_{export_datetime}'.format(
            export_datetime=datetime.now().strftime('%Y-%m-%d'))
        csv_path = os.path.join(
            self.output_folder, '{filename}.csv'.format(filename=output_filename))
        with open(csv_path, 'w', newline='') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=',')
            for row in data:
                csvwriter.writerow(row)

    def go(self):
        if self.file_format == 'console':
            self._print_rows()
        elif self.file_format == 'ods':
            self._export_ods()
        elif self.file_format == 'csv':
            self._export_csv()
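The tw_ip_label above uses SQLAlchemy's "simple CASE" form, where the column given as value= is compared against the keys of whens=. A self-contained sketch of just that expression, assuming SQLAlchemy 1.x (the value=/whens= keywords were removed in 2.0) and a throwaway SQLite table standing in for IpAsn:

from sqlalchemy import Column, Integer, String, case, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class Demo(Base):
    __tablename__ = 'demo'
    id = Column(Integer, primary_key=True)
    country = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
session.add_all([Demo(country='TW'), Demo(country='US')])
session.commit()

# Compiles to: CASE demo.country WHEN 'TW' THEN 1 ELSE 0 END AS is_tw
is_tw = case(value=Demo.country, whens={'TW': True}, else_=False).label('is_tw')
for country, flag in session.query(Demo.country, is_tw):
    print(country, bool(flag))  # TW True / US False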
Example #5
    def _init_database(self):
        self.db = PttDatabase(dbtype=self.database_config['Type'],
                              dbname=self.database_config['Name'])
        self.db_session = self.db.get_session()
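PttDatabase itself is not included in these excerpts. Inferred from the calls made above (get_session, get, get_or_create with a condition dict and a values dict), a minimal SQLAlchemy-backed sketch might look like this; the real class also provides create, bulk_insert, and bulk_update, and may build its connection URL differently:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

class PttDatabase(object):
    """Hypothetical reconstruction; only the methods used above are shown."""

    def __init__(self, dbtype: str, dbname: str):
        # Assumes a URL like 'sqlite:///ptt.db'; other backends need
        # host/credentials in the URL as well.
        self.engine = create_engine('{}:///{}'.format(dbtype, dbname))
        self._Session = sessionmaker(bind=self.engine)

    def get_session(self):
        return self._Session()

    def get(self, session, model, condition: dict):
        return session.query(model).filter_by(**condition).first()

    def get_or_create(self, session, model, condition: dict, values: dict,
                      auto_commit=True):
        instance = self.get(session, model, condition)
        if instance:
            return instance, False
        instance = model(**values)
        session.add(instance)
        if auto_commit:
            session.commit()
        else:
            session.flush()
        return instance, True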
Example #6
class PttUserCrawler(object):
    PTT_WEB_URL = 'http://term.ptt.cc/'

    def __init__(self, arguments: Dict):
        self.db_input = arguments['database'] or False
        self.id_list = ('' if self.db_input else arguments['id'])

        config_path = (arguments['config_path'] or 'config.ini')

        self._init_config(config_path)
        self._init_database()
        self._init_browser()

        self.ptt_browser_buffer_logger = logging.getLogger(__name__ + '.log')

        self.json_prefix = arguments['json_prefix']
        self.debug_mode = arguments['debug_mode']
        if arguments['verbose']:
            logging.getLogger().setLevel(logging.DEBUG)

    def _init_config(self, config_path: str):
        self.config = load_config(config_path)
        if self.config['PttUser']['Output'] == 'both':
            self.json_output = True
            self.database_output = True
        elif self.config['PttUser']['Output'] == 'database':
            self.json_output = False
            self.database_output = True
        elif self.config['PttUser']['Output'] == 'json':
            self.json_output = True
            self.database_output = False
        else:
            self.json_output = False
            self.database_output = False

    def _init_database(self):
        self.db = PttDatabase(dbtype=self.config['Database']['Type'],
                              dbname=self.config['Database']['Name'])
        self.db_session = self.db.get_session()

    def _init_browser(self):
        if sys.platform.startswith('linux'):
            platform = 'linux'
            exe_filename = 'chromedriver'
        elif sys.platform.startswith('win'):
            platform = 'windows'
            exe_filename = 'chromedriver.exe'
        else:
            platform = 'mac'
            exe_filename = 'chromedriver'

        self.webdriver_path = os.path.join(
            self.config['PttUser']['WebdriverFolder'], platform, exe_filename)
        self.chrome_options = ChromeOptions()
        self.chrome_options.add_argument('--headless')

    def _get_id_list(self) -> List[str]:
        if self.db_input:
            return list(
                map(
                    lambda user: user.username,
                    self.db_session.query(User).order_by(
                        User.login_times, User.id).all()))
        else:
            return self.id_list.split(',')

    def _output_json(self, result: List[Dict[str, object]], count):
        json_path = '{prefix}user_{count}.json'.format(prefix=self.json_prefix,
                                                       count=count)
        with open(json_path, 'w') as jsonfile:
            json.dump(result, jsonfile, sort_keys=True, indent=4)

    @log('Output_Database')
    def _output_database(self, result: List[Dict[str, object]]):
        for record in result:
            user, is_new_user = self.db.get_or_create(
                self.db_session, User, {'username': record['username']}, {
                    'username': record['username'],
                    'login_times': int(record['login_times']),
                    'valid_article_count': int(record['valid_article_count'])
                })
            if not is_new_user:
                user.login_times = int(record['login_times'])
                user.valid_article_count = int(record['valid_article_count'])
                self.db_session.commit()

            last_login_datetime = datetime.datetime.strptime(
                record['last_login_datetime'], '%m/%d/%Y %H:%M:%S %a')
            last_record = UserLastRecord(
                last_login_datetime=last_login_datetime,
                last_login_ip=record['last_login_ip'])
            last_record.user_id = user.id
            if record['last_login_ip']:
                _, _ = self.db.get_or_create(self.db_session, IpAsn,
                                             {'ip': record['last_login_ip']}, {
                                                 'ip': record['last_login_ip'],
                                                 'asn': None,
                                                 'asn_cidr': None,
                                                 'asn_country_code': None,
                                                 'asn_date': None,
                                                 'asn_description': None,
                                                 'asn_raw': None,
                                                 'asn_registry': None
                                             })

            self.db_session.add(last_record)
            self.db_session.commit()

    def _output(self, result: List[Dict[str, object]], count):
        if self.json_output:
            self._output_json(result, count)
        if self.database_output:
            self._output_database(result)

    def _login_ptt(self, browser, userid, userpwd):
        browser.connect(self.PTT_WEB_URL)
        # Ptt login
        browser.send_keys(userid)
        browser.send_keys(userpwd)

        # Kick out duplicate logins, or clear the failed password-attempt log
        buffer = browser.get_buffer()
        while u"主功能表" not in buffer:
            browser.send_keys('')
            buffer = browser.get_buffer()

    @log()
    def crawling(self):

        delaytime = float(self.config['PttUser']['Delaytime'])
        userid = self.config['PttUser']['UserId']
        userpwd = self.config['PttUser']['UserPwd']

        id_list = self._get_id_list()

        crawler_result = []

        with PttBrowser(self.webdriver_path,
                        self.chrome_options,
                        debug_mode=self.debug_mode) as browser:

            browser.ACT_DELAY_TIME = delaytime

            self._login_ptt(browser, userid, userpwd)

            # Navigate to Talk -> Query
            browser.send_keys('T')

            id_queue = id_list.copy()
            count = 1
            err_count = 0
            while len(id_queue) > 0:
                for user_id in list(id_queue):
                    try:
                        browser.send_keys('Q').send_keys(user_id)
                        buffer = browser.get_buffer()

                        self.ptt_browser_buffer_logger.debug(
                            'Buffer:\n%s', buffer)

                        pattern = r"[\w\W]*《登入次數》(\d*)\D*次\D*《有效文章》\D*(\d*)[\w\W]*《上次上站》\D*([\d]{1,2}\/[\d]{1,2}\/[\d]{4}\W*[\d]{1,2}:\W*[\d]{1,2}:\W*[\d]{1,2}\W*\w*)\D*《上次故鄉》([\d.]*)"
                        pat = re.compile(pattern)
                        search_result = pat.match(buffer)

                        if search_result:
                            login_times = search_result.group(1)
                            valid_article_count = search_result.group(2)
                            last_login_datetime = search_result.group(3)
                            last_login_ip = search_result.group(4)

                            crawler_result.append({
                                'username': user_id,
                                'login_times': login_times,
                                'valid_article_count': valid_article_count,
                                'last_login_datetime': last_login_datetime,
                                'last_login_ip': last_login_ip
                            })

                            if len(crawler_result) % 100 == 0:
                                self._output(crawler_result, count)
                                count += 1
                                crawler_result = []
                        else:
                            logging.error('User "%s" has error', user_id)
                            self.ptt_browser_buffer_logger.error(
                                'Buffer:\n%s', buffer)

                        browser.send_keys('')
                        id_queue.remove(user_id)
                    except KeyboardInterrupt:
                        id_queue = []
                        break
                    except PttDisconnectException as e:
                        err_count += 1
                        if err_count == 3:
                            raise e

                        browser.send_keys('')
                        self._login_ptt(browser, userid, userpwd)
                        browser.send_keys('T')
                        continue

                self._output(crawler_result, count)
                count += 1
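To make the profile-screen regex above easier to follow, here is a standalone check against a fabricated buffer (a real buffer is a full term.ptt.cc screen; the values below are illustrative only):

import re

pattern = (r"[\w\W]*《登入次數》(\d*)\D*次\D*《有效文章》\D*(\d*)[\w\W]*"
           r"《上次上站》\D*([\d]{1,2}\/[\d]{1,2}\/[\d]{4}\W*[\d]{1,2}:"
           r"\W*[\d]{1,2}:\W*[\d]{1,2}\W*\w*)\D*《上次故鄉》([\d.]*)")

buffer = ('《登入次數》1234 次\n'
          '《有效文章》567 篇\n'
          '《上次上站》01/23/2021 12:34:56 Sat\n'
          '《上次故鄉》1.2.3.4')

groups = re.compile(pattern).match(buffer).groups()
print(groups)
# ('1234', '567', '01/23/2021 12:34:56 Sat', '1.2.3.4')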
Example #7
class PttArticleIndexCrawler(object):
    PTT_URL = 'https://www.ptt.cc'
    PTT_Board_Format = '/bbs/{board}/index{index}.html'

    def __init__(self, arguments: Dict[str, str]):
        def get_default_start_url(board_name):
            last_index = self._getLastPage()
            return last_index, (
                self.PTT_URL +
                self.PTT_Board_Format.format(board=board_name,
                                             index=last_index))

        config_path = (arguments['config_path'] or 'config.ini')

        self._init_config(config_path)
        self._init_database()

        self.board_name = arguments['board_name']
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'
        }
        self.cookies = {'over18': '1'}

        self.before = arguments['before']
        logging.info('Before' if self.before else 'After')
        if arguments['before']:
            if arguments['index']:
                self.end_index = arguments['index']
            else:
                self.end_index = self._getDBLastPage()
                if not self.end_index:
                    self.end_index = self._getLastPage()
            self.start_index = 1
        else:
            if arguments['index']:
                self.start_index = arguments['index']
            else:
                self.start_index = self._getDBLastPage()
                if not self.start_index:
                    self.start_index = self._getLastPage()
            self.end_index = self._getLastPage()

        self.start_url = (self.PTT_URL + self.PTT_Board_Format.format(
            board=self.board_name, index=self.end_index))

    def _init_config(self, config_path: str):
        self.config = load_config(config_path)
        self.article_config = self.config['PttArticle']
        self.database_config = self.config['Database']

        self.NEXT_PAGE_DELAY_TIME = float(
            self.article_config['NextPageDelaytime'])

    def _init_database(self):
        self.db = PttDatabase(dbtype=self.database_config['Type'],
                              dbname=self.database_config['Name'])
        self.db_session = self.db.get_session()

    def _getDBLastPage(self):
        board, _ = self.db.get_or_create(self.db_session, Board,
                                         {'name': self.board_name},
                                         {'name': self.board_name})
        index_func = (func.min if self.before else func.max)
        article_index_res = self.db_session \
            .query(ArticleIndex.board_id, index_func(ArticleIndex.index)) \
            .group_by(ArticleIndex.board_id) \
            .filter(ArticleIndex.board_id == board.id) \
            .all()

        if article_index_res:
            _, index = article_index_res[0]
            return index
        return None

    def _getLastPage(self, timeout=3):
        """Ref: https://github.com/jwlin/ptt-web-crawler/blob/f8c04076004941d3f7584240c86a95a883ae16de/PttWebCrawler/crawler.py#L189"""
        resp = requests.get(
            url=self.PTT_URL +
            self.PTT_Board_Format.format(board=self.board_name, index=''),
            headers=self.headers,
            cookies=self.cookies,
            timeout=timeout)
        self.cookies = resp.cookies
        self.cookies['over18'] = '1'
        content = resp.content.decode('utf-8')
        first_page = re.search(r'href="/bbs/\w+/index(\d+)\.html">&lsaquo;',
                               content)
        if first_page is None:
            return 1
        return int(first_page.group(1)) + 1

    @log('Output_Database')
    def _output_database(self, result: List[Dict[str, object]]):
        self.db.bulk_update(self.db_session, ArticleIndex, result)

    def crawling(self):
        board = self.db.get(self.db_session, Board, {'name': self.board_name})

        logging.info('Index range: %d ~ %d', self.start_index, self.end_index)
        while self.end_index >= self.start_index:
            ptt_index_url = (self.PTT_URL + self.PTT_Board_Format).format(
                board=self.board_name, index=self.end_index)
            logging.info('Processing index: %d, Url = %s', self.end_index,
                         ptt_index_url)

            resp = requests.get(url=ptt_index_url,
                                headers=self.headers,
                                cookies=self.cookies,
                                timeout=None)
            self.cookies = resp.cookies
            self.cookies['over18'] = '1'

            if resp.status_code != 200:
                logging.error(
                    'Processing index error, status_code = %d, Url = %s',
                    resp.status_code, ptt_index_url)
                resp.raise_for_status()

            soup = BeautifulSoup(resp.text, 'html.parser')
            divs = soup.find("div",
                             "r-list-container action-bar-margin bbs-screen")
            children = divs.findChildren("div", recursive=False)

            article_list = []

            for div in children:
                # ex. link would be <a href="/bbs/PublicServan/M.1127742013.A.240.html">Re: [問題] 職等</a>
                try:
                    if 'r-list-sep' in div['class']:
                        break
                    elif 'r-ent' in div['class']:
                        try:
                            href = div.find('a')['href']
                            link = self.PTT_URL + href
                            article_id = re.sub(r'\.html', '',
                                                href.split('/')[-1])

                            article_list.append({
                                'web_id': article_id,
                                'board_id': board.id,
                                'index': self.end_index
                            })

                            logging.debug('Processing article: %s, Url = %s',
                                          article_id, link)
                        except Exception:
                            # deleted articles have no <a> link
                            pass
                except Exception:
                    logging.exception('Processing article error, Url = %s',
                                      ptt_index_url)

            self._output_database(article_list)

            self.end_index -= 1
            time.sleep(self.NEXT_PAGE_DELAY_TIME)
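The index-row extraction can be exercised offline. A sketch against a hand-written HTML fragment whose class names mirror PTT's index markup (the hrefs are fabricated):

import re

from bs4 import BeautifulSoup

html = '''
<div class="r-list-container action-bar-margin bbs-screen">
  <div class="r-ent"><a href="/bbs/PublicServan/M.1127742013.A.240.html">Re: [問題] 職等</a></div>
  <div class="r-list-sep"></div>
  <div class="r-ent"><a href="/bbs/PublicServan/M.0000000000.A.000.html">pinned (skipped)</a></div>
</div>'''

soup = BeautifulSoup(html, 'html.parser')
container = soup.find('div', 'r-list-container action-bar-margin bbs-screen')
for div in container.findChildren('div', recursive=False):
    if 'r-list-sep' in div['class']:
        break  # rows below the separator are pinned posts
    if 'r-ent' in div['class']:
        href = div.find('a')['href']
        print(re.sub(r'\.html', '', href.split('/')[-1]))
# M.1127742013.A.240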
Example #8
class PttArticleCrawler:

    PTT_URL = 'https://www.ptt.cc'
    PTT_Board_Format = '/bbs/{board}/index{index}.html'
    PTT_Article_Format = '/bbs/{board}/{web_id}.html'
    DELAY_TIME = 1.0
    NEXT_PAGE_DELAY_TIME = 5.0

    @log('Initialize')
    def __init__(self, arguments: Dict):

        config_path = (arguments['config_path'] or 'config.ini')

        self._init_config(config_path)
        self._init_database()

        self.board = arguments['board_name']
        self.timeout = None
        # self.timeout = float(self.article_config['Timeout'])

        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'
        }
        self.cookies = {'over18': '1'}

        self.start_date = arguments['start_date']
        self.from_database = arguments['database']
        if not self.from_database:
            self.start_index, self.end_index = (
                arguments['index'] if arguments['index'] else
                (1, self.getLastPage(self.board, self.timeout)))
        else:
            self.start_index, self.end_index = (0, 0)

        self.upgrade_action = arguments['upgrade']

        self.json_folder = arguments['json_folder']
        self.json_prefix = arguments['json_prefix']

        if arguments['verbose']:
            logging.getLogger().setLevel(logging.DEBUG)

    def _init_config(self, config_path: str):
        self.config = load_config(config_path)
        self.article_config = self.config['PttArticle']
        self.database_config = self.config['Database']

        self.DELAY_TIME = float(self.article_config['Delaytime'])
        self.NEXT_PAGE_DELAY_TIME = float(
            self.article_config['NextPageDelaytime'])
        self.VERSION_ROTATE = int(self.article_config['VersionRotate']) or 30

        self.json_output = False
        self.database_output = False
        if 'Output' in self.article_config:
            if self.article_config['Output'] == 'both':
                self.json_output = True
                self.database_output = True
            elif self.article_config['Output'] == 'database':
                self.json_output = False
                self.database_output = True
            elif self.article_config['Output'] == 'json':
                self.json_output = True
                self.database_output = False

    def _init_database(self):
        self.db = PttDatabase(dbtype=self.database_config['Type'],
                              dbname=self.database_config['Name'])
        self.db_session = self.db.get_session()

    def _output_json(self, result: List[Dict[str, object]], index):
        json_name = '{prefix}{board}_{index}.json'.format(
            prefix=self.json_prefix, board=self.board, index=index)
        json_path = os.path.join(self.json_folder, json_name)
        with codecs.open(json_path, 'w', encoding='utf-8') as jsonfile:
            json.dump(result,
                      jsonfile,
                      sort_keys=True,
                      indent=4,
                      ensure_ascii=False)

    def _output_index_to_database(self, result: List[tuple]):
        board, _ = self.db.get_or_create(self.db_session, Board,
                                         {'name': self.board},
                                         {'name': self.board})
        # board = self.db.get(self.db_session,
        #                     Board,
        #                     {'name': self.board})
        index_list = []
        for web_id, link, index in result:
            logging.debug('web_id = %s, link = %s, index = %d, board.id = %d',
                          web_id, link, index, board.id)
            index_list.append({
                'web_id': web_id,
                'board_id': board.id,
                'index': index
            })
        self.db.bulk_update(self.db_session, ArticleIndex, index_list)

    @log('Output_Database')
    def _output_database(self, result: List[Dict[str, object]]):
        def parser_push_ipdatetime(push_ipdatetime):
            logging.debug('parser_push_ipdatetime(%s)', push_ipdatetime)
            if push_ipdatetime:
                match = re.search(r'([\d.]*)\W?(\d{2}\/\d{2}\ \d{2}:\d{2})',
                                  push_ipdatetime)
                if match:
                    push_ip = match.group(1)
                    push_datetime = datetime.strptime(match.group(2),
                                                      "%m/%d %H:%M")

                    return push_ip, push_datetime
            logging.warning('push_ipdatetime %s search failed',
                            push_ipdatetime)
            return None, None

        def parse_author(author):
            logging.debug('parse_author(%s)', author)
            if author:
                match = re.search(r'([\S]*)\D\((.*)\)', author)
                if match:
                    return match.group(1)
            return author

        for record in result:
            try:
                author_username = parse_author(record['author'])
                if not author_username:
                    logging.warning('author is empty, record = %s', record)
                    author_username = ''
                author_condition = {'username': author_username}
                author_values = {
                    'username': author_username,
                    'login_times': 0,
                    'valid_article_count': 0
                }
                if not self.upgrade_action:
                    article = self.db.get(self.db_session, Article,
                                          {'web_id': record['article_id']})
                    if article:
                        continue

                user, _ = self.db.get_or_create(self.db_session,
                                                User,
                                                author_condition,
                                                author_values,
                                                auto_commit=False)
                board, _ = self.db.get_or_create(self.db_session,
                                                 Board,
                                                 {'name': record['board']},
                                                 {'name': record['board']},
                                                 auto_commit=False)

                try:
                    record['date'] = datetime.strptime(record['date'],
                                                       '%a %b %d %H:%M:%S %Y')
                except (ValueError, TypeError):
                    record['date'] = None

                article, is_new_article = self.db.get_or_create(
                    self.db_session,
                    Article, {'web_id': record['article_id']}, {
                        'web_id': record['article_id'],
                        'user_id': user.id,
                        'board_id': board.id,
                        'post_datetime': record['date'],
                        'post_ip': record['ip']
                    },
                    auto_commit=False)

                if record['ip']:
                    _, _ = self.db.get_or_create(self.db_session,
                                                 IpAsn, {'ip': record['ip']}, {
                                                     'ip': record['ip'],
                                                     'asn': None,
                                                     'asn_cidr': None,
                                                     'asn_country_code': None,
                                                     'asn_date': None,
                                                     'asn_description': None,
                                                     'asn_raw': None,
                                                     'asn_registry': None
                                                 },
                                                 auto_commit=False)
                if not is_new_article:
                    article.history[0].end_at = datetime.now()
                    self.db_session.flush()

                history = self.db.create(self.db_session,
                                         ArticleHistory, {
                                             'article_id': article.id,
                                             'title': record['article_title'],
                                             'content': record['content'],
                                             'start_at': datetime.now(),
                                             'end_at': datetime.now()
                                         },
                                         auto_commit=False)

                # Attach the push messages to the newest article history record
                push_list = []
                for (floor, message) in enumerate(record['messages']):
                    push_userid = message['push_userid']
                    if not push_userid:
                        logging.warning('push_userid is empty, message = %s',
                                        message)
                        push_userid = ''
                    push_user_condition = {'username': push_userid}
                    push_user_values = {
                        'username': push_userid,
                        'login_times': 0,
                        'valid_article_count': 0
                    }
                    push_user, _ = self.db.get_or_create(self.db_session,
                                                         User,
                                                         push_user_condition,
                                                         push_user_values,
                                                         auto_commit=False)
                    push_ip, push_datetime = parser_push_ipdatetime(
                        message['push_ipdatetime'])

                    push_list.append(
                        Push(article_history_id=history.id,
                             floor=(floor + 1),
                             push_tag=message['push_tag'],
                             push_user_id=push_user.id,
                             push_content=message['push_content'],
                             push_ip=push_ip,
                             push_datetime=push_datetime))
                    if push_ip:
                        _, _ = self.db.get_or_create(
                            self.db_session,
                            IpAsn, {'ip': push_ip}, {
                                'ip': push_ip,
                                'asn': None,
                                'asn_cidr': None,
                                'asn_country_code': None,
                                'asn_date': None,
                                'asn_description': None,
                                'asn_raw': None,
                                'asn_registry': None
                            },
                            auto_commit=False)

                self.db.bulk_insert(self.db_session,
                                    push_list,
                                    auto_commit=False)

                article = self.db.get(self.db_session, Article,
                                      {'id': article.id})

                if len(article.history) >= self.VERSION_ROTATE:
                    for h in article.history[self.VERSION_ROTATE:]:
                        self.db_session.delete(h)
                    self.db_session.flush()

                self.db_session.commit()
            except Exception:
                logging.exception('record = %s', record)

    def parse(self, link, article_id, board, timeout=3):
        """Ref: https://github.com/jwlin/ptt-web-crawler/blob/f8c04076004941d3f7584240c86a95a883ae16de/PttWebCrawler/crawler.py#L99"""
        resp = requests.get(url=link,
                            headers=self.headers,
                            cookies=self.cookies,
                            verify=True,
                            timeout=timeout)
        self.cookies = resp.cookies
        self.cookies['over18'] = '1'
        if resp.status_code != 200:
            return {"error": "invalid url"}
            # return json.dumps({"error": "invalid url"}, sort_keys=True, ensure_ascii=False)
        soup = BeautifulSoup(resp.text, 'html.parser')
        main_content = soup.find(id="main-content")
        metas = main_content.select('div.article-metaline')
        author = ''
        title = ''
        date = ''
        if metas:
            author = (metas[0].select('span.article-meta-value')[0].string
                      if metas[0].select('span.article-meta-value')[0] else
                      author)
            title = (metas[1].select('span.article-meta-value')[0].string if
                     metas[1].select('span.article-meta-value')[0] else title)
            date = (metas[2].select('span.article-meta-value')[0].string
                    if metas[2].select('span.article-meta-value')[0] else date)

            # remove meta nodes
            for meta in metas:
                meta.extract()
            for meta in main_content.select('div.article-metaline-right'):
                meta.extract()
        else:
            logging.info('metas is None in link %s', link)
            transcription = main_content.find(text=re.compile(u'※ 轉錄者:'))
            if transcription:
                # cross-posted (transcribed) article
                match = re.search(
                    r'\W(\w+)\W\([0-9]*\.[0-9]*\.[0-9]*\.[0-9]*\),\W([0-9]+\/[0-9]+\/[0-9]+\W[0-9]+:[0-9]+:[0-9]+)',
                    transcription)
                if match:
                    author = match.group(1)
                    date = datetime.strptime(match.group(2),
                                             "%m/%d/%Y %H:%M:%S")
                    date = date.strftime('%a %b %d %H:%M:%S %Y')
            else:
                logging.info('Unrecognized article layout, link %s', link)
                raise PostException(
                    'The article has been edited; metadata could not be parsed.')

        # remove and keep push nodes
        pushes = main_content.find_all('div', class_='push')
        for push in pushes:
            push.extract()

        try:
            ip = main_content.find(text=re.compile(u'※ 發信站:'))
            ip = re.search(r'[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*', ip).group()
        except:
            ip = None

        # Remove lines starting with '※ 發信站:' (u'\u203b') or '◆ From:'
        # (u'\u25c6'), blank lines, and redundant whitespace.
        # Keep alphanumerics, Chinese characters and punctuation, URLs, and
        # selected special symbols.
        filtered = [
            v for v in main_content.stripped_strings
            if v[0] not in [u'※', u'◆'] and v[:2] not in [u'--']
        ]
        expr = re.compile((
            r'[^\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\s\w:/-_.?~%()]'
        ))
        for i in range(len(filtered)):
            filtered[i] = re.sub(expr, '', filtered[i])

        filtered = [_f for _f in filtered if _f]  # remove empty strings
        # remove last line containing the url of the article
        filtered = [x for x in filtered if article_id not in x]
        content = ' '.join(filtered)
        content = re.sub(r'(\s)+', ' ', content)

        # push messages
        p, b, n = 0, 0, 0
        messages = []
        for push in pushes:
            if not push.find('span', 'push-tag'):
                continue
            push_tag = (push.find('span', 'push-tag').string
                        or '').strip(' \t\n\r')
            push_userid = (push.find('span', 'push-userid').string
                           or '').strip(' \t\n\r')
            # push-content may contain nested tags, so join all of its strings
            push_content = push.find('span', 'push-content').strings
            push_content = (' '.join(push_content)[1:]).strip(
                ' \t\n\r')  # drop the leading ':'
            push_ipdatetime = (push.find('span', 'push-ipdatetime').string
                               or '').strip(' \t\n\r')
            messages.append({
                'push_tag': push_tag,
                'push_userid': push_userid,
                'push_content': push_content,
                'push_ipdatetime': push_ipdatetime
            })
            if push_tag == u'推':
                p += 1
            elif push_tag == u'噓':
                b += 1
            else:
                n += 1

        # count: pushes minus boos; all: total number of push messages
        message_count = {
            'all': p + b + n,
            'count': p - b,
            'push': p,
            'boo': b,
            "neutral": n
        }

        # json data
        data = {
            'url': link,
            'board': board,
            'article_id': article_id,
            'article_title': title,
            'author': author,
            'date': date,
            'content': content,
            'ip': ip,
            'message_count': message_count,
            'messages': messages
        }
        return data
        # return json.dumps(data, sort_keys=True, ensure_ascii=False)

    def getLastPage(self, board, timeout=3):
        """Ref: https://github.com/jwlin/ptt-web-crawler/blob/f8c04076004941d3f7584240c86a95a883ae16de/PttWebCrawler/crawler.py#L189"""
        resp = requests.get(url='https://www.ptt.cc/bbs/' + board +
                            '/index.html',
                            headers=self.headers,
                            cookies=self.cookies,
                            timeout=timeout)
        self.cookies = resp.cookies
        self.cookies['over18'] = '1'
        content = resp.content.decode('utf-8')
        first_page = re.search(r'href="/bbs/\w+/index(\d+)\.html">&lsaquo;',
                               content)
        if first_page is None:
            return 1
        return int(first_page.group(1)) + 1

    @log()
    def crawling(self):
        logging.debug('Start date = %s', self.start_date)
        logging.debug('Start = %d, End = %d', self.start_index, self.end_index)
        logging.debug('From database = %s', str(self.from_database))
        if self.from_database:
            self._crawling_from_db()
        else:
            self._crawling_from_arg()

    @log()
    def _crawling_from_arg(self):
        last_page = self.end_index
        board, _ = self.db.get_or_create(self.db_session, Board,
                                         {'name': self.board},
                                         {'name': self.board})
        while last_page >= self.start_index:
            ptt_index_url = (self.PTT_URL + self.PTT_Board_Format).format(
                board=self.board, index=last_page)
            logging.debug('Processing index: %d, Url = %s', last_page,
                          ptt_index_url)

            resp = requests.get(url=ptt_index_url,
                                headers=self.headers,
                                cookies=self.cookies,
                                timeout=self.timeout)
            self.cookies = resp.cookies
            self.cookies['over18'] = '1'

            if resp.status_code != 200:
                logging.error(
                    'Processing index error, status_code = %d, Url = %s',
                    resp.status_code, ptt_index_url)
                resp.raise_for_status()

            soup = BeautifulSoup(resp.text, 'html.parser')
            divs = soup.find("div",
                             "r-list-container action-bar-margin bbs-screen")
            children = divs.findChildren("div", recursive=False)

            article_link_list = []
            for div in children:
                # ex. link would be <a href="/bbs/PublicServan/M.1127742013.A.240.html">Re: [問題] 職等</a>
                if 'r-list-sep' in div['class']:
                    break
                elif 'r-ent' in div['class']:
                    try:
                        href = div.find('a')['href']
                        link = self.PTT_URL + href
                        article_id = re.sub(r'\.html', '', href.split('/')[-1])
                        article_link_list.append((article_id, link, last_page))
                    except Exception:
                        logging.warning('No article link found in %s', div)
                else:
                    continue
            self._output_index_to_database(article_link_list)

            page_article_count = self.db_session.query(ArticleIndex) \
                .join(Article, Article.web_id == ArticleIndex.web_id) \
                .filter(ArticleIndex.board_id == board.id, ArticleIndex.index == last_page)\
                .count()

            if not self.upgrade_action and page_article_count == len(
                    article_link_list):
                # Every article on this page is already stored and no upgrade
                # was requested, so skip re-crawling the page.
                pass
            else:
                article_list = []
                for article_id, link, _ in article_link_list:
                    try:
                        logging.info('Processing article: %s, Url = %s',
                                     article_id, link)

                        article_list.append(
                            self.parse(link, article_id, self.board,
                                       self.timeout))
                        time.sleep(self.DELAY_TIME)
                    except Exception as e:
                        logging.exception('Processing article error, Url = %s',
                                          link)

                len_article_list = len(article_list)
                if self.start_date:
                    tmp_article_list = []
                    for article in article_list:
                        try:
                            article_date = datetime.strptime(
                                article['date'], '%a %b %d %H:%M:%S %Y')
                            if self.start_date <= article_date:
                                tmp_article_list.append(article)
                        except Exception as e:
                            # Skip articles whose date line was removed so
                            # processing can continue.
                            len_article_list -= 1
                            logging.error('%s', e)
                            logging.error('article: %s , date format: %s',
                                          article['article_id'],
                                          article['date'])

                    if len(tmp_article_list) < len_article_list:
                        self.start_index = last_page
                        article_list = tmp_article_list

                if self.database_output:
                    self._output_database(article_list)

                if self.json_output:
                    self._output_json(article_list, last_page)

            last_page -= 1
            time.sleep(self.NEXT_PAGE_DELAY_TIME)

    @log()
    def _crawling_from_db(self):
        board, _ = self.db.get_or_create(self.db_session, Board,
                                         {'name': self.board},
                                         {'name': self.board})

        # exist_article_list = self.db_session \
        #     .query(Article.web_id) \
        #     .filter(Article.board_id == board.id).all()

        if self.upgrade_action:
            article_index_list = self.db_session \
                .query(ArticleIndex)\
                .filter(ArticleIndex.board_id == board.id).all()
        else:
            article_index_list = self.db_session \
                .query(ArticleIndex) \
                .outerjoin(Article, ArticleIndex.web_id == Article.web_id) \
                .filter(Article.id.is_(None), ArticleIndex.board_id == board.id).all()
            # .filter(ArticleIndex.web_id.notin_(exist_article_list)).all()
        article_list = []
        count = 0
        for article_index in article_index_list:
            link = self.PTT_URL + \
                self.PTT_Article_Format.format(board=article_index.board.name,
                                               web_id=article_index.web_id)
            logging.debug('Processing Url = %s', link)
            article_id = article_index.web_id
            try:
                article_list.append(
                    self.parse(link, article_id, self.board, self.timeout))
                count += 1
                if count == 20:
                    self._output_database(article_list)
                    article_list = []
                    count = 0
            except Exception:
                logging.exception('Processing article error, Url = %s', link)
            finally:
                time.sleep(self.DELAY_TIME)

        if article_list:
            self._output_database(article_list)
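A quick offline check of parser_push_ipdatetime's regex and the corrected %m/%d %H:%M format, using a fabricated push footer; strptime defaults the year to 1900 because PTT pushes omit it:

import re
from datetime import datetime

push_ipdatetime = '1.2.3.4 11/23 14:30'
match = re.search(r'([\d.]*)\W?(\d{2}\/\d{2}\ \d{2}:\d{2})', push_ipdatetime)
push_ip = match.group(1)
push_datetime = datetime.strptime(match.group(2), '%m/%d %H:%M')
print(push_ip, push_datetime)  # 1.2.3.4 1900-11-23 14:30:00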
Example #9
class PttExportHelper(object):
    def __init__(self):
        pass

    def _init_helper(self, arguments: Dict[str, str]):
        config_path = (arguments['config_path'] or 'config.ini')
        self.config = load_config(config_path)
        self.file_format = ExportFormat[arguments['format']]
        self.output_folder = arguments['output_folder']
        self.output_prefix = arguments['output_prefix']

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        self.db = PttDatabase(dbtype=self.config['Database']['Type'],
                              dbname=self.config['Database']['Name'])
        self.db_session = self.db.get_session()

    @log('Get Data')
    def _get_export_rows(self):
        article_rows = [[
            'Article.web_id', 'Article.board', 'Article.author',
            'Article.title', 'Article.content', 'Article.post_ip',
            'Article.post_ip.asn', 'Article.post_ip.asn_date',
            'Article.post_ip.asn_registry', 'Article.post_ip.asn_cidr',
            'Article.post_ip.asn_country_code',
            'Article.post_ip.asn_description', 'Article.post_datetime',
            'Article.last_modified_time'
        ]]
        push_rows = [[
            'Push.article_web_id', 'Push.username', 'Push.tag', 'Push.content',
            'Push.ip', 'Push.ip.asn', 'Push.ip.asn_cidr',
            'Push.ip.asn_country_code', 'Push.ip.asn_date',
            'Push.ip.asn_description', 'Push.ip.asn_registry', 'Push.datetime'
        ]]
        user_rows = [[
            'User.username', 'User.login_times', 'User.valid_article_count',
            'User.last_login_datetime', 'User.last_login_ip',
            'User.last_login_ip.asn', 'User.last_login_ip.asn_date',
            'User.last_login_ip.asn_registry', 'User.last_login_ip.asn_cidr',
            'User.last_login_ip.asn_country_code',
            'User.last_login_ip.asn_description'
        ]]

        data = OrderedDict()
        article_list = self.db_session.query(Article).order_by(
            Article.post_datetime).all()

        for article in article_list:
            article_row = [
                article.web_id,
                article.board.name or '',
                article.user.username or ''
            ]

            last_history = article.history[0]
            article_row += [
                last_history.title or '', last_history.content or ''
            ]

            article_ip_asn = self.db_session.query(IpAsn).filter_by(
                ip=article.post_ip).first()
            if article_ip_asn:
                article_row += [
                    article_ip_asn.ip or '',
                    article_ip_asn.asn or '',
                    str(article_ip_asn.asn_date or ''),
                    article_ip_asn.asn_registry or '',
                    article_ip_asn.asn_cidr or '',
                    article_ip_asn.asn_country_code or '',
                    article_ip_asn.asn_description or ''
                ]
            else:
                article_row += [article.post_ip or '', '', '', '', '', '', '']

            article_row += [
                str(article.post_datetime or ''),
                str(last_history.end_at or '')
            ]

            article_rows.append(article_row)

            for push in last_history.push_list:
                push_row = [article.web_id]
                push_row += [
                    push.user.username or '', push.push_tag or '',
                    push.push_content or ''
                ]
                push_ip_asn = self.db_session.query(IpAsn).filter_by(
                    ip=push.push_ip).first()
                if push_ip_asn:
                    push_row += [
                        push_ip_asn.ip or '',
                        push_ip_asn.asn or '',
                        push_ip_asn.asn_cidr or '',
                        push_ip_asn.asn_country_code or '',
                        str(push_ip_asn.asn_date or ''),
                        push_ip_asn.asn_description or '',
                        push_ip_asn.asn_registry or ''
                    ]
                else:
                    push_row += [push.push_ip or '', '', '', '', '', '', '']

                if push.push_datetime is not None:
                    if isinstance(push.push_datetime, str):
                        push.push_datetime = datetime.strptime(
                            push.push_datetime, '%Y-%m-%d %H:%M:%S')
                    push_row += [push.push_datetime.strftime('%m/%d %H:%M:%S')]
                else:
                    push_row += ['']

                push_rows.append(push_row)

        user_list = self.db_session.query(User).all()
        for user in user_list:
            user_row = []
            if user.last_record:
                user_last_record = user.last_record[0]
                user_ip_asn = self.db_session.query(IpAsn).filter_by(
                    ip=user_last_record.last_login_ip).first()
                user_row += [
                    user.username or '',
                    user.login_times or '',
                    user.valid_article_count or '',
                    str(user_last_record.last_login_datetime or ''),
                    user_last_record.last_login_ip or ''
                ]
                if user_ip_asn:
                    user_row += [
                        user_ip_asn.asn or '',
                        str(user_ip_asn.asn_date or ''),
                        user_ip_asn.asn_registry or '',
                        user_ip_asn.asn_cidr or '',
                        user_ip_asn.asn_country_code or '',
                        user_ip_asn.asn_description or ''
                    ]
                else:
                    # no IpAsn row for this IP yet; keep the columns aligned
                    user_row += ['', '', '', '', '', '']
            else:
                user_row += [user.username or ''] + [''] * 10
            user_rows.append(user_row)
        data.update({'Article': article_rows})
        data.update({'Push': push_rows})
        data.update({'User': user_rows})
        return data

    @log('Get Data')
    def _get_export_json(self):
        article_rows = []
        push_rows = []
        user_rows = []

        data = OrderedDict()
        article_list = self.db_session.query(Article).order_by(
            Article.post_datetime).all()

        for article in article_list:
            article_row = {
                'Article.web_id': article.web_id,
                'Article.board': article.board.name or '',
                'Article.author': article.user.username or ''
            }

            last_history = article.history[0]
            article_row.update({
                'Article.title': last_history.title or '',
                'Article.content': last_history.content or ''
            })

            article_ip_asn = self.db_session.query(IpAsn).filter_by(
                ip=article.post_ip).first()
            if article_ip_asn:
                article_row.update({
                    'Article.post_ip': article_ip_asn.ip or '',
                    'Article.post_ip.asn': article_ip_asn.asn or '',
                    'Article.post_ip.asn_date':
                        str(article_ip_asn.asn_date or ''),
                    'Article.post_ip.asn_registry':
                        article_ip_asn.asn_registry or '',
                    'Article.post_ip.asn_cidr': article_ip_asn.asn_cidr or '',
                    'Article.post_ip.asn_country_code':
                        article_ip_asn.asn_country_code or '',
                    'Article.post_ip.asn_description':
                        article_ip_asn.asn_description or ''
                })
            else:
                article_row.update({
                    'Article.post_ip': article.post_ip or '',
                    'Article.post_ip.asn': '',
                    'Article.post_ip.asn_date': '',
                    'Article.post_ip.asn_registry': '',
                    'Article.post_ip.asn_cidr': '',
                    'Article.post_ip.asn_country_code': '',
                    'Article.post_ip.asn_description': ''
                })

            article_row.update({
                'Article.post_datetime': str(article.post_datetime or ''),
                'Article.last_modified_time': str(last_history.end_at or '')
            })

            article_rows.append(article_row)

            for push in last_history.push_list:
                push_row = {
                    'Push.article_web_id': article.web_id,
                    'Push.username': push.user.username or '',
                    'Push.tag': push.push_tag or '',
                    'Push.content': push.push_content or ''
                }

                push_ip_asn = self.db_session.query(IpAsn).filter_by(
                    ip=push.push_ip).first()
                if push_ip_asn:
                    push_row.update({
                        'Push.ip': push_ip_asn.ip or '',
                        'Push.ip.asn': push_ip_asn.asn or '',
                        'Push.ip.asn_cidr': push_ip_asn.asn_cidr or '',
                        'Push.ip.asn_country_code':
                            push_ip_asn.asn_country_code or '',
                        'Push.ip.asn_date': str(push_ip_asn.asn_date or ''),
                        'Push.ip.asn_description':
                            push_ip_asn.asn_description or '',
                        'Push.ip.asn_registry': push_ip_asn.asn_registry or ''
                    })
                else:
                    push_row.update({
                        'Push.ip': push.push_ip or '',
                        'Push.ip.asn': '',
                        'Push.ip.asn_cidr': '',
                        'Push.ip.asn_country_code': '',
                        'Push.ip.asn_date': '',
                        'Push.ip.asn_description': '',
                        'Push.ip.asn_registry': ''
                    })

                if push.push_datetime is not None:
                    if isinstance(push.push_datetime, str):
                        push.push_datetime = datetime.strptime(
                            push.push_datetime, '%Y-%m-%d %H:%M:%S')
                    push_row.update({
                        'Push.datetime':
                            push.push_datetime.strftime('%m/%d %H:%M:%S')
                    })
                else:
                    push_row.update({'Push.datetime': ''})

                push_rows.append(push_row)

        user_list = self.db_session.query(User).all()
        for user in user_list:
            user_row = {}
            if user.last_record:
                user_last_record = user.last_record[0]
                user_ip_asn = self.db_session.query(IpAsn).filter_by(
                    ip=user_last_record.last_login_ip).first()
                user_row.update({
                    'User.username': user.username or '',
                    'User.login_times': user.login_times or '',
                    'User.valid_article_count':
                        user.valid_article_count or '',
                    'User.last_login_datetime':
                        str(user_last_record.last_login_datetime or ''),
                    'User.last_login_ip':
                        user_last_record.last_login_ip or ''
                })
                if user_ip_asn:
                    user_row.update({
                        'User.last_login_ip.asn': user_ip_asn.asn or '',
                        'User.last_login_ip.asn_date':
                            str(user_ip_asn.asn_date or ''),
                        'User.last_login_ip.asn_registry':
                            user_ip_asn.asn_registry or '',
                        'User.last_login_ip.asn_cidr':
                            user_ip_asn.asn_cidr or '',
                        'User.last_login_ip.asn_country_code':
                            user_ip_asn.asn_country_code or '',
                        'User.last_login_ip.asn_description':
                            user_ip_asn.asn_description or ''
                    })
                else:
                    # no IpAsn row for this IP yet; emit empty asn fields
                    user_row.update({
                        'User.last_login_ip.asn': '',
                        'User.last_login_ip.asn_date': '',
                        'User.last_login_ip.asn_registry': '',
                        'User.last_login_ip.asn_cidr': '',
                        'User.last_login_ip.asn_country_code': '',
                        'User.last_login_ip.asn_description': ''
                    })
            else:
                user_row.update({
                    'User.username': user.username or '',
                    'User.login_times': '',
                    'User.valid_article_count': '',
                    'User.last_login_datetime': '',
                    'User.last_login_ip': '',
                    'User.last_login_ip.asn': '',
                    'User.last_login_ip.asn_date': '',
                    'User.last_login_ip.asn_registry': '',
                    'User.last_login_ip.asn_cidr': '',
                    'User.last_login_ip.asn_country_code': '',
                    'User.last_login_ip.asn_description': ''
                })
            user_rows.append(user_row)
        data.update({'Article': article_rows})
        data.update({'Push': push_rows})
        data.update({'User': user_rows})

        return data

    @log('Export Json')
    def _export_json(self):
        data = self._get_export_json()

        output_filename = 'Ptt_report_{export_datetime}'.format(
            export_datetime=datetime.now().strftime('%Y-%m-%d'))
        json_path = os.path.join(
            self.output_folder, '{prefix}{filename}.{file_format}'.format(
                prefix=self.output_prefix,
                filename=output_filename,
                file_format=self.file_format.name))
        with open(json_path, 'w') as jsonfile:
            json.dump(data, jsonfile, indent=4, sort_keys=True)

    @log('Export CSV')
    def _export_csv(self):
        data = self._get_export_rows()

        for (sheet, rows) in data.items():
            output_filename = 'Ptt_{sheet}_report_{export_datetime}'.format(
                sheet=sheet,
                export_datetime=datetime.now().strftime('%Y-%m-%d'))
            csv_path = os.path.join(
                self.output_folder, '{prefix}{filename}.{file_format}'.format(
                    prefix=self.output_prefix,
                    filename=output_filename,
                    file_format=self.file_format.name))
            with open(csv_path, 'w', encoding='utf-8', newline='') as csvfile:
                csvwriter = csv.writer(csvfile, delimiter=',')
                for row in rows:
                    csvwriter.writerow(row)

    @log('Export Ods')
    def _export_ods(self):
        output_filename = 'Ptt_report_{export_datetime}'.format(
            export_datetime=datetime.now().strftime('%Y-%m-%d'))

        output_path = os.path.join(
            self.output_folder, '{prefix}{filename}.{file_format}'.format(
                prefix=self.output_prefix,
                filename=output_filename,
                file_format=self.file_format.name))
        data = self._get_export_rows()
        save_data(output_path, data)

    @log()
    def go(self, arguments: Dict[str, str]):
        self._init_helper(arguments)

        if self.file_format == ExportFormat.ods:
            self._export_ods()
        elif self.file_format == ExportFormat.csv:
            self._export_csv()
        elif self.file_format == ExportFormat.json:
            self._export_json()
        else:
            raise ValueError('File format error.')
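ExportFormat is used above for name lookup (ExportFormat[arguments['format']]), as a file extension (.name), and in equality checks, but its definition is not part of these excerpts. A minimal enum consistent with that usage might look like:

from enum import Enum

class ExportFormat(Enum):
    """Hypothetical reconstruction; only the member names matter here."""
    ods = 'ods'
    csv = 'csv'
    json = 'json'

print(ExportFormat['csv'].name)                 # csv
print(ExportFormat['ods'] == ExportFormat.ods)  # True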