Example #1
def deal_items(get_items, row):

    # number of records added and number of records skipped
    add_count, skip_count = 0, 0
    # publish time of the newest article already stored
    last_item_time = str(row['item_time'])
    # newest publish time after this update finishes
    new_item_time = last_item_time
    insert_ret = 0
    for item in get_items:

        # print item['pub_time']
        # print last_item_time
        # print item['pub_time'] < last_item_time
        # sys.exit()


        # use the publish time to decide whether the item should be stored
        if item['pub_time'] <= last_item_time:
            skip_count = skip_count + 1
            utl.log("\tskip for " + str(item['pub_time']) + ' <= ' + str(last_item_time))
            continue
        else:
            add_count = add_count + 1
            utl.log("\tadd for " + str(item['pub_time']) + ' > ' + str(last_item_time))
        # keep track of the newest publish time seen so far
        if item['pub_time'] > new_item_time:
            utl.log("\tupdate pub_time for " + item['pub_time'] + " is > " + new_item_time)
            new_item_time = item['pub_time']

        # the tags field is repurposed here to record the name from the origins table
        item['tags'] = row['name']

        # correct the 8-hour timezone offset
        old_time = item['pub_time']
        # handle the case where only a date (no time) is present
        if len(old_time) > 7 and len(old_time) < 11:
            item_timestamp = time.mktime(time.strptime(old_time, '%Y-%m-%d'))        
        else:
            item_timestamp = time.mktime(time.strptime(old_time, '%Y-%m-%d %X'))
        adjust_time = item_timestamp + 8 * 3600
        item['pub_time'] = time.strftime('%Y-%m-%d %X', time.localtime( adjust_time ) )


        # insert the record
        # insert_ret =  1 ###DEBUG
        insert_ret = insert_item(item)
        utl.log("insert a record, result in " + str(insert_ret))

    return {
        'item_time' : new_item_time,
        'add_count' : add_count,
        'skip_count' : skip_count,
        'insert_ret' : insert_ret,
    }
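
A minimal calling sketch for deal_items(), assuming the item list comes from the parse() helper shown in Example #7 and row is a dict-cursor row from the origins table; refresh_origin and the HTTP fetch are hypothetical glue, not part of the original code:

import urllib2

def refresh_origin(row):
    # Fetch the feed text, parse it, then store anything newer than row['item_time'].
    feed_txt = urllib2.urlopen(row['url']).read()   # assumption: feed is fetched over plain HTTP
    items = parse(feed_txt)                         # parse() as in Example #7
    ret = deal_items(items, row)
    # ret['item_time'] could then be written back to origins so that the next
    # run only stores newer articles.
    return ret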
Example #2
def insert_item(record):
    # check required fields
    neces = ['title', 'url']
    # neces = ['title', 'url', 'author', 'author_url', 'summary']
    for field in neces:
        if field not in record:
            utl.log("ERR there is no field: " + field, 1)
            return 0

    # length check: truncate detail so it fits the column
    if len(record['detail']) > 250:
        record['detail'] = record['detail'][:250]
            
    conn = MySQLdb.connect(host=gl.DBHOST, user=gl.DBUSER, passwd=gl.DBPASS, 
        db='laravel_db', charset='utf8')
    rs = conn.cursor(cursorclass=MySQLdb.cursors.DictCursor)

    #hicktodo prevent duplicate inserts by checking title, source and time (see the sketch after this example)
    insert_ret = 0
    try:
        insert_ret = rs.executemany(
            """INSERT INTO chips (title, url, summary, author, author_url, detail, pub_time, tags, source, created_at, updated_at)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            [
                # (record['url'], record['url'], record['summary'], 'Hick1', 'detail1', ),
                # ('testTitle2', 'testUrl2', 'TestSummay2', 'Hick2', 'detail2', '2013-02-22 12:21', 'php,', 0),
                (record['title'], record['url'], record['summary'], record['author'], record['author_url'], record['detail'], record['pub_time'], 
                    record['tags'], record['source'], time.strftime('%Y-%m-%d %H:%M'), time.strftime('%Y-%m-%d %H:%M'))
            ] )
    except Exception as e:
        utl.log("ERR insert a record: " + str(e), 1)
    finally:
        rs.close()
        conn.commit()
        conn.close()

    # utl.log("DEBUG==================== %s" % record['source'], 1)
    
    return insert_ret
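
The #hicktodo above asks for protection against duplicate inserts. A minimal sketch of one way to do it before running the INSERT, assuming the same chips table; check_duplicate is a hypothetical helper, not part of the original project:

def check_duplicate(cursor, record):
    # True when a row with the same title, source and publish time already exists,
    # so the caller can skip the INSERT.
    cursor.execute(
        "SELECT 1 FROM chips WHERE title = %s AND source = %s AND pub_time = %s LIMIT 1",
        (record['title'], record['source'], record['pub_time']))
    return cursor.fetchone() is not None

Alternatively, a UNIQUE index on (title, source, pub_time) combined with INSERT IGNORE would push the same check into MySQL itself.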
Example #3
def backup_local():
    ''' rsync '''

    log(u'Starting local backup: mirroring the mycis system to drive %s ...' % sd)
    try:
        r = subprocess.check_call([rsync, '-rltz', '--delete', '--progress', '--modify-window=1', '/cygdrive/d/mycis/', '/cygdrive/%s/mycis/' % sd])
        log(u'Local backup finished!')
    except:
        log(u'Could not mirror the mycis system to drive %s; local backup failed...' % sd)
Example #4
def backup_remote_inc_ver():
    td = tempfile.mkdtemp(dir=tmp)
    
    log('Starting to do incremental backup !')
    log('Generating version string ...')
    ver = cat(repo, 'version')
    if not os.path.isfile(ver):        
        log('Initialize the version string: starting from 0.')
        open(ver, 'w').write('0')
    ed = int(open(ver, 'r').read())
    ed_old = str(ed).zfill(8)
    ed_new = str(ed + 1).zfill(8)
    
    log('Get database of previous version ...')
    old = cat(repo, db)
    if not os.path.isfile(old):
        log('This is the first time doing delta. Copy to repo.')
        shutil.copyfile(db, old) 
    old_coded = cat(repo, 'mycis_coded.db')
     
    new_ = cat(base, db)
    new = cat(td, db)
    # Use a copy to generate delta
    log('Use a copy to generate delta, otherwise there will be errors ...')
    shutil.copyfile(new_, new)
    
    delta_fn = 'delta-%s-%s' % (ed_old, ed_new)
    delta = cat(td, delta_fn)    
    
    log('Computing md5 ...')
    md5_old = hashlib.md5(open(old, 'rb').read()).hexdigest()
    md5_new = hashlib.md5(open(new, 'rb').read()).hexdigest()

    if md5_old == md5_new:
        log('No new delta is needed: Stop.')
        return
    
    log('Now generating delta, please wait ...')
    subprocess.call([xdelta, '-e', '-f', '-s', old, new, delta])
    
    # msg delta ex.:
    # From: cwtu_001, To: admin 
    # Subject: delta-00000001-00000002
    # Text:
    # 00000001=(md5 of mycis.db ed.00000001)
    # 00000002=(md5 of mycis.db ed.00000002)
    # Attachment: delta-00000001-00000002
    
    log('Authoring the delta msg ...')
    msg = message(cid, (admin,), delta_fn, '%s=%s\n%s=%s' % (ed_old, md5_old, ed_new, md5_new), [delta])
    
    log('Uploading delta, please wait ...')
    try:
        ch = channel()
        ch.append(cid, '', msg_time(), str(msg))
        # Render old <- new.
        subprocess.call([xdelta, '-d', '-f', '-s', old, delta, old_coded])
               
        md5_ = hashlib.md5(open(old_coded, 'rb').read()).hexdigest()
        if md5_ == md5_new:
            shutil.move(old_coded, old)
        
        else: # Should be very rare ...
            log('Applying the delta did not reproduce the new database (md5 mismatch); aborting.')
            os.remove(old_coded)
            return
        
        ed += 1
        open(ver, 'w').write(str(ed))
        log('Upload delta successfully !')

    except:
        log('Upload delta / Manage File unsuccessfully. Please check all settings & report to cytu !')

    finally:    
        try:
            ch.close()
        except:
            pass
        try:
            ch.logout()
        except:
            pass
        cls(td)
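
For reference, a restore-side sketch that replays a chain of downloaded delta-00000001-00000002-style attachments with xdelta and verifies each step against the md5 values carried in the message bodies; the file layout and the md5_by_edition mapping are assumptions, not part of the original code:

import hashlib
import os
import subprocess

def rebuild_from_deltas(xdelta, full_db, delta_files, md5_by_edition):
    # delta_files: paths named delta-<old>-<new>, one per edition step.
    # md5_by_edition: {'00000002': '<md5 hex>', ...} taken from the delta messages.
    current = full_db
    for delta in sorted(delta_files):
        new_ed = os.path.basename(delta).split('-')[2]
        rebuilt = current + '.' + new_ed
        subprocess.check_call([xdelta, '-d', '-f', '-s', current, delta, rebuilt])
        if hashlib.md5(open(rebuilt, 'rb').read()).hexdigest() != md5_by_edition[new_ed]:
            raise RuntimeError('delta %s did not verify' % delta)
        current = rebuilt
    return current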
Example #5
def backup_remote_all(dbc=''):
        
    td = tempfile.mkdtemp(dir=tmp)
    ud_s = cat(td, '%s.7z' % db)

    new_ = dbc if dbc else cat(base, db)
    new = cat(td, db)
    
    try:
        log(u'Starting upload of the full database. Building the compressed archive...')
        shutil.copyfile(new_, new)
        r = subprocess.check_call([z, 'a', '-t7z',  ud_s, new])
    
    except:
        log(u'Could not build the compressed archive of the full database; upload failed...')
        return False
    
    md5 = hashlib.md5(open(new, 'rb').read()).hexdigest()
    
    # msg backup ex.:
    # From: cwtu_001 
    # To: admin 
    # Subject: backup 2009-10-11 06:00:00
    # Text:
    # (md5 of db)
    # Attachment: mycis.db.7z

    msg = message(cid, (admin,), 'backup ' + time_stamp(), md5, [ud_s])
    
    try:
        log(u'Uploading the full database takes a few minutes; please keep this window open and wait.')
        ch = channel()
        ch.append(cid, '', msg_time(), str(msg))

        # Delete all previous delta, if any.
        try:
            ch.select('[Google Mail]/All Mail')
            r, [ids] = ch.search(None, '(FROM "%s" TEXT "delta")' % cid)
            if ids.split():
                ch.copy(','.join(ids.split()), '[Google Mail]/Trash')  

        except:
            log(u'(please clear the old backup messages manually)')

        log(u'Full database uploaded successfully!')
        ans = True

    except:
        log(u'Full database upload failed...')
        ans = False

    finally:    
        try:
            ch.close()
        except:
            pass
        try:
            ch.logout()
        except:
            pass

        cls(td)
        return ans
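
On the receiving side, the md5 in the backup message body is what allows the downloaded archive to be checked. A minimal verification sketch, assuming the attachment has already been saved locally and z points at the same 7z binary; verify_full_backup and extract_dir are hypothetical:

import hashlib
import os
import subprocess

def verify_full_backup(z, archive_7z, extract_dir, expected_md5, db_name='mycis.db'):
    # Extract the archive and compare the database's md5 with the value carried
    # in the backup message body.
    subprocess.check_call([z, 'x', '-y', '-o' + extract_dir, archive_7z])
    extracted = os.path.join(extract_dir, db_name)
    return hashlib.md5(open(extracted, 'rb').read()).hexdigest() == expected_md5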
Example #6
def backup_remote_inc():
    log(u'Starting remote backup ...')
    
    td = tempfile.mkdtemp(dir=tmp)

    old = cat(repo, db)
    if not os.path.isfile(old):
        log(u'This is the first time doing backup. Copy to repo.')
        shutil.copyfile(db, old) 
     
    new_ = cat(base, db)
    new = cat(td, db)
    # Use a copy to generate delta
    #log('Use a copy to generate delta, otherwise there will be errors ...')
    shutil.copyfile(new_, new)
    
    delta_fn = 'delta' 
    delta = cat(td, delta_fn)    
    
    md5_old = hashlib.md5(open(old, 'rb').read()).hexdigest()
    md5_new = hashlib.md5(open(new, 'rb').read()).hexdigest()

    if md5_old == md5_new:
        log(u'The database has not changed; no remote backup is needed.')
        cls(td)
        return
    
    log(u'Building the database delta file, please wait...')
    subprocess.check_call([xdelta, '-e', '-f', '-s', old, new, delta])
    
    # msg delta ex.:
    # From: cwtu_001, To: admin 
    # Subject: delta 2009-10-11 06:00:00
    # Text:
    # (md5 of old db)
    # (md5 of new db)
    # Attachment: delta
     
    tm = time_stamp()    
    msg = message(cid, (admin,), delta_fn + ' ' + tm, '%s\n%s' % (md5_old, md5_new), [delta])
    log(u'Uploading the database delta file, please wait...')
    try:
        ch = channel()
        ch.append(cid, '', msg_time(), str(msg))
              
        log(u'Remote backup succeeded!')

    except:
        log(u'Remote backup failed...')

    finally:    
        try:
            ch.close()
            ch.logout()
        except:
            pass
        cls(td)
Example #7
File: rss.py  Project: hick/utl
def parse(txt):
    # print "parse txt"
    r = feedparser.parse(txt)

    # print r.version
    # sys.exit()

    article_list = []
    if len(r.entries) > 0:
        for item in r.entries:

            # for i in item:
            #     print "----"
            #     print i, ":"    
            #     print item[i]
            # sys.exit()

            # if hasattr(item, 'author_detail'):
            #     continue
            # else:
            #     for i in item:
            #         print "----"
            #         print i, ":"    
            #         print item[i]
            #     sys.exit()


            ### some fields may be missing and raise errors; just log them for now
            url = item.link
            try:
                ## jobbole.com URLs are absurdly long; strip everything after the question mark
                find_bole = url.find("jobbole.com")
                if find_bole > 0 and find_bole < 20:
                    url = url.split("?")[0]

                ## some rss20 feeds from http://www.lupaworld.com have no author_detail at all
                if hasattr(item, 'author_detail'):
                    author = item.author_detail.name
                    author_url = hasattr(item.author_detail, 'href') and item.author_detail.href or url
                else:
                    author = ''
                    author_url = item.link

                ### some items have no published_parsed
                ### print item shows the full attribute list; oddly, dir() does not
                if hasattr(item, "published_parsed"):
                    pub_time = time.strftime('%Y-%m-%d %H:%M:%S', item.published_parsed)
                ### format used by ATOM feeds
                elif hasattr(item, "updated_parsed"):
                    pub_time = time.strftime('%Y-%m-%d %H:%M:%S', item.updated_parsed)
                else:
                    ##################### todo: save the URL's MD5 (e.g. /tmp/url_MD5) to mark crawled items: skip if it exists, otherwise add with the current time
                    ### item.updated looks like 2011-12-20T12:38:22+00:00; no conversion implemented yet (see the sketch after this example)
                    pub_time = time.strftime(ISOTIMEFORMAT, item.updated) 
                    print pub_time
                    print "\n\n\n\n\n\n\n\n"
                    return


                
                article = {
                    'title': item.title,
                    'url': url,
                    'summary': item.summary[0:252],
                    'author': author,
                    'author_url': author_url,
                    'pub_time': pub_time,
                    'tags': '',
                    'source': 1,
                    'detail': hasattr(item, 'content') and html2text.html2text(item.content[0].value) or item.summary,
                }

                # for i in article:
                #     print "----"
                #     print i, ":"    
                #     print article[i]
                # sys.exit()
                article_list.append(article)

            except Exception as e:
                utl.log("!!!!!!!!ERR: %s , for url: %s" % (e, url), 1)

    return article_list
Example #8
    del_sql = "delete from chips where deleted > 0"
    rs.execute(del_sql)
    conn.commit()

    
    #hicktodo restrict this query to a limited time range
    # rs.execute("SELECT * FROM origins WHERE name = 'rss'")
    sql = "SELECT * FROM origins WHERE" + sql_where
    rs.execute(sql)

    # print sql
    # sys.exit()

    all_rows = rs.fetchall()
    if len(all_rows) < 1:
        utl.log("no source need to be spidered", 1)

    for row in all_rows:
        # source id
        res_id = row['id']
        # the module name only selects the handler logic; one name may cover several ids, e.g. the rss module
        mod_name = row['name']
        # if the module file does not exist, report the error and continue with the others
        if not os.path.isfile(mod_name + '.py'):
            utl.log("!!!!!ERR: no module defined for " + mod_name, 1)
            continue

        mod = importlib.import_module(mod_name)
        utl.log("======== start " + mod_name  + str(res_id) + " " + utl.get_host(row['url']), 1)
        #hicktodo fields still to be added; use the reserved fields for now
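
The fragment stops before the dispatch itself. A hedged sketch of how the imported module presumably feeds Examples #7 and #1, assuming each handler module (such as rss) exposes a parse()-style function and that the feed text is fetched over HTTP; the fetch, the UPDATE statement and the closing log line are assumptions, not shown in the original:

        # hypothetical continuation of the loop body above (requires urllib2)
        content = urllib2.urlopen(row['url']).read()   # assumption: fetch the feed text
        items = mod.parse(content)                     # e.g. rss.parse() from Example #7
        ret = deal_items(items, row)                   # Example #1
        # remember the newest publish time so the next run skips older items
        rs.execute("UPDATE origins SET item_time = %s WHERE id = %s",
                   (ret['item_time'], res_id))
        conn.commit()
        utl.log("======== done %s: %d added, %d skipped" % (mod_name, ret['add_count'], ret['skip_count']), 1)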