def main():
    """Analyze pending unluckyhouse threads (state=0).

    Reads up to ROW_LIMIT unprocessed thread ids within
    [BASE_LIMIT, LATEST_LIMIT] from the local SQLite db, fetches each
    thread's archive page and runs analyze() on it.  Threads whose page
    no longer exists are marked state=-1 (deleted).  Commits once after
    the loop and closes the connection.
    """
    ROW_LIMIT = 100       # max threads handled per run
    LATEST_LIMIT = 11399  # newest thread id to consider
    BASE_LIMIT = 11300    # oldest thread id to consider
    sql = 'SELECT id FROM unluckyhouse WHERE state=0 AND id>=? AND id<=? ORDER BY id DESC LIMIT ?'
    con = smart_dbapi.connect('unluckyhouse.sqlite')
    cur = con.execute(sql, (BASE_LIMIT, LATEST_LIMIT, ROW_LIMIT))
    todolist = [row['id'] for row in cur]
    cur.close()

    # FIX: guard the empty worklist — the original crashed with
    # IndexError on todolist[0] when nothing was pending.
    if not todolist:
        con.close()
        return
    print('分析範圍: {} ~ {}'.format(todolist[0], todolist[-1]))
    print('*' * 50)

    # Analyze each thread page with BeautifulSoup 4.
    # http://unluckyhouse.com/archive/index.php/t-%d.html
    url = 'http://unluckyhouse.com/archive/index.php/t-{}.html'
    for t in todolist:
        # ids arrive newest-first; stop once below the lower bound
        # (used when syncing only the latest articles).
        if t < BASE_LIMIT:
            break
        try:
            soup = smart_http.get(url.format(t))
            # FIX: identity test instead of equality — smart_http.get()
            # returns False on failure, and a parsed-page object may
            # define its own (surprising) equality semantics.
            if soup is not False:
                analyze(con, t, soup)
            else:
                # Page is gone: flag the thread as deleted.
                con.execute('UPDATE unluckyhouse SET state=-1 WHERE id=?', (t, ))
                print('主題 %d 已刪除' % t)
            del soup
        except Exception:  # FIX: the bound name 'e' was never used
            print('分析文章 #{} 過程發生錯誤'.format(t))
            print('前往 URL 確認吧: {}'.format(url.format(t)))
            print('-' * 50)
            traceback.print_exc()
            print('-' * 50)
            break
    con.commit()
    con.close()
def main():
    """Process pending unluckyhouse threads (state=0).

    Pulls at most ROW_LIMIT unprocessed thread ids in the range
    [BASE_LIMIT, LATEST_LIMIT], downloads each thread's archive page and
    hands it to analyze().  A missing page marks the thread state=-1.
    One commit at the end, then the connection is closed.
    """
    ROW_LIMIT = 100       # cap on threads per run
    LATEST_LIMIT = 11399  # highest thread id to process
    BASE_LIMIT = 11300    # lowest thread id to process
    sql = 'SELECT id FROM unluckyhouse WHERE state=0 AND id>=? AND id<=? ORDER BY id DESC LIMIT ?'
    con = smart_dbapi.connect('unluckyhouse.sqlite')
    cur = con.execute(sql, (BASE_LIMIT, LATEST_LIMIT, ROW_LIMIT))
    todolist = [row['id'] for row in cur]
    cur.close()

    # FIX: the original raised IndexError on todolist[0] when the
    # worklist was empty; bail out early instead.
    if not todolist:
        con.close()
        return
    print('分析範圍: {} ~ {}'.format(todolist[0], todolist[-1]))
    print('*' * 50)

    # Parse each article with BeautifulSoup 4.
    # http://unluckyhouse.com/archive/index.php/t-%d.html
    url = 'http://unluckyhouse.com/archive/index.php/t-{}.html'
    for t in todolist:
        # Descending order: anything below BASE_LIMIT ends the run
        # (lower-bound cutoff when syncing the latest articles).
        if t < BASE_LIMIT:
            break
        try:
            soup = smart_http.get(url.format(t))
            # FIX: use an identity check — smart_http.get() signals
            # failure with the False singleton; '!=' against a parsed
            # document could be redefined by the parser type.
            if soup is not False:
                analyze(con, t, soup)
            else:
                # Thread page removed upstream: flag as deleted.
                con.execute('UPDATE unluckyhouse SET state=-1 WHERE id=?', (t,))
                print('主題 %d 已刪除' % t)
            del soup
        except Exception:  # FIX: dropped unused binding 'e'
            print('分析文章 #{} 過程發生錯誤'.format(t))
            print('前往 URL 確認吧: {}'.format(url.format(t)))
            print('-' * 50)
            traceback.print_exc()
            print('-' * 50)
            break
    con.commit()
    con.close()
def main():
    """Synchronize local thread ids with the forum's newest thread.

    Fetches the unluckyhouse RSS feed (external.php), extracts the latest
    thread id from the first item's link (the ``t=<id>`` query parameter),
    and inserts placeholder rows (default column values) for every id the
    local SQLite db is missing.  Any failure is caught and printed.
    """
    try:
        # Latest thread id of the forum, taken from the RSS feed:
        # rss > channel > item > link (t=...)
        resp = smart_http.get('https://unluckyhouse.com/external.php')
        # FIX: identity test — smart_http.get() returns the False
        # singleton on failure; '!=' invited surprising comparisons.
        if resp is not False:
            latest_url = resp.find('channel/item/link').text
            m = re.match(r".+t=(\d+).+", latest_url)
            latest_id = int(m.group(1))
        else:
            print('latest_id is -1')
            latest_id = -1

        # FIX: the original used 'latest_id is not -1', an identity
        # comparison with an int literal that only works because of
        # CPython's small-int cache and is a SyntaxWarning on 3.8+.
        if latest_id != -1:
            con = smart_dbapi.connect('unluckyhouse.sqlite')
            cur = con.execute('SELECT max(id) sync_id FROM unluckyhouse')
            row = cur.fetchone()
            # max(id) is NULL on an empty table -> start from 0.
            sync_id = row['sync_id'] if row['sync_id'] is not None else 0
            if sync_id < latest_id:
                print('Add entries %d ~ %d' % (sync_id + 1, latest_id))
                sql = 'INSERT INTO unluckyhouse(id) VALUES (?)'
                for i in range(sync_id + 1, latest_id + 1):
                    con.execute(sql, (i, ))
                con.commit()
            else:
                print('Already synchronized (%d)' % latest_id)
            con.close()
    except Exception as ex:
        # Top-level boundary: report and swallow, keeping the cron run alive.
        print(ex)
# Make the shared 'commons' helper modules importable from this script.
commons_path = os.path.realpath('../../commons')
sys.path.insert(1, commons_path)
import corp_utils
import smart_dbapi
from print_progress import print_progress

# Run configuration.
ABORT_IF_ERROR = False  # NOTE(review): set but not referenced in this visible chunk
BEGIN = 4151  # first unluckylabor row id to process, inclusive
END = 4441    # last unluckylabor row id to process, inclusive

script_begin = datetime.now()  # start timestamp — presumably for a duration report later; chunk truncated
rank = 0

# Rows in [BEGIN, END] that still lack a geocoded latitude (lat=0).
sql = 'SELECT id,corp,boss,gov FROM unluckylabor WHERE lat=0 AND id>=? AND id<=?'
conn = smart_dbapi.connect('unluckylabor.sqlite')
rows = conn.execute(sql, (BEGIN, END)).fetchall()

# Collect the entries to update.
error_cnt = 0
modified = 0
visited = 0
total = len(rows)
for row in rows:
    info = corp_utils.get_corp_info(row['corp'], row['boss'], row['gov'])
    if info != False:  # get_corp_info() signals failure with False
        # Detect repeated geocoding failures: the address looks usable
        # (>= 8 chars) yet no latitude was produced.
        if len(info['addr']) >= 8 and info['lat'] == 0:
            # TODO: write geocoding failures to a log file.
            print('\n定位失敗: #%d %s %s' % (row['id'], row['corp'], info['addr']))
# NOTE(review): chunk appears truncated here — the counters above are
# presumably updated in code beyond this view.
def get_conn():
    """Return the shared corp-cache DB connection, opening it on first use."""
    global _conn
    if _conn is not None:
        return _conn
    # Lazily open the cache database next to the code directory.
    _conn = smart_dbapi.connect('%s/corp_cache.sqlite' % CODEPATH)
    return _conn
import re
import sys
import geojson

# Make the shared 'commons' helpers importable from this script.
commons_path = os.path.realpath('../../commons')
sys.path.insert(1, commons_path)
import smart_dbapi

DEBUG = False  # when True, cap the query at 10 rows for a quick test run

# Export geocoded records only.  NOTE(review): lat>20 presumably means
# "successfully located" (Taiwan latitudes are roughly 22-25, failures
# are stored as 0) — confirm against the schema.
sql = 'SELECT * FROM unluckylabor WHERE lat>20 ORDER BY id'
if DEBUG:
    sql = sql + ' LIMIT 10'
con = smart_dbapi.connect('unluckylabor.sqlite')
cur = con.execute(sql)

features = []
for row in cur:
    # Format the violated law clauses: the 'law' column is a
    # semicolon-separated list of "<article>-<paragraph>" codes.
    law_list = row['law'].split(';')
    law_desc = ''
    for e in law_list:
        if law_desc != '':
            law_desc = law_desc + '\n'
        m = re.match('(\d+)\-(\d+)', e)
        if m is not None:
            # "Labor Standards Act, article X paragraph Y"
            law_desc = law_desc + '勞動基準法第%s條第%s項' % (m.group(1), m.group(2))
        else:
            # NOTE(review): chunk truncated here — the non-matching
            # branch continues beyond this view.
#!../../../bin/python # coding: utf-8 import os import sys import geojson commons_path = os.path.realpath('../../commons') sys.path.insert(1, commons_path) import smart_dbapi sql = 'SELECT * FROM unluckyhouse WHERE state>1 ORDER BY id DESC' con = smart_dbapi.connect('unluckyhouse.sqlite') cur = con.execute(sql) # 死法代碼對應文字 INITATIVE_TAGS = {"A": u"意外", "S": u"自殺", "M": u"他殺"} features = [] for row in cur: point = geojson.Point((row['lng'], row['lat'])) properties = { 'id': row['id'], 'news': row['news'], 'datetime': row['datetime'], 'address': row['area'] + row['address'], 'approach': '%s %s' % (INITATIVE_TAGS[row['initative']], row['approach']), 'marker-color': '#b00000', 'marker-symbol': 'danger'
#!../../../bin/python # coding: utf-8 import os import sys import geojson commons_path = os.path.realpath('../../commons') sys.path.insert(1, commons_path) import smart_dbapi sql = 'SELECT * FROM unluckyhouse WHERE state>1 ORDER BY id DESC' con = smart_dbapi.connect('unluckyhouse.sqlite') cur = con.execute(sql) # 死法代碼對應文字 INITATIVE_TAGS = {"A": u"意外", "S": u"自殺", "M": u"他殺"} features = [] for row in cur: point = geojson.Point((row['lng'], row['lat'])) properties = { 'id': row['id'], 'news': row['news'], 'datetime': row['datetime'], 'address': row['area'] + row['address'], 'approach': '%s %s' % (INITATIVE_TAGS[row['initative']], row['approach']), 'marker-color': '#b00000', 'marker-symbol': 'danger' }