Example #1
0
def filter_titles(db_path):
    """Preload the title hash DB with the md5 of this month's news titles.

    Queries the ``finance_news_all`` collection for every document whose
    ``date`` lies within the current calendar month and stores the md5 of
    each document's ``title`` in the hash file DB opened at ``db_path``.

    :param db_path: path passed to ``FileBsd`` for the hash file database.
    """
    import calendar

    # Take a single snapshot of "today" so year and month cannot disagree
    # if the call happens to straddle midnight at a month boundary.
    today = date.today()
    year_mon = '%04d%02d' % (today.year, today.month)
    # monthrange() accounts for leap years; a fixed table would drop Feb 29.
    last_day = '%02d' % calendar.monthrange(today.year, today.month)[1]

    # The bounds are built as 14-digit integers (presumably YYYYMMDDHHMMSS --
    # confirm against the collection schema). The upper bound is 23:59:59 of
    # the last day so that the final minutes of the month are included.
    condition = {
        'date': {
            '$gte': long(year_mon + '01' + '000000'),
            '$lte': long(year_mon + last_day + '235959'),
        }
    }

    coll = Mongodb('192.168.0.212', 27017, 'arvin', 'finance_news_all')
    try:
        print('db `title` is loading now, waiting .......')

        filedb = FileBsd('hash', db_path)
        try:
            for doc in coll.query(condition):
                # Best effort: one bad document must not abort the preload.
                try:
                    filedb.put(md5(doc['title']))
                except Exception as e:
                    print('filter_titles error: %s' % (e,))
        finally:
            # Always release the file DB, even if the query fails midway.
            filedb.close()
    finally:
        coll.disconnect()

    print('title filter loading finished')
Example #2
0
def match_ratio(file_db_path, title, lock):
    """Classify a title against the hash file DB and record unseen titles.

    The title is stripped of surrounding whitespace, then checked with
    ``secondary_filter`` and looked up by its md5 in the file DB. A title
    whose md5 is not yet stored is added, whether or not it was filtered.

    :param file_db_path: path passed to ``FileBsd`` for the hash file DB.
    :param title: title text to classify.
    :param lock: context manager held for the whole time the DB is open.
    :return: ``0`` if the title's md5 was already stored (this takes
        precedence over the filter result), ``2`` if ``secondary_filter``
        flagged it, otherwise ``1``.
    """
    stripped = title.strip()
    with lock:
        filedb = FileBsd('hash', file_db_path)
        try:
            # 2 = filtered out, 1 = normal
            val = 2 if secondary_filter(stripped) else 1

            tit_comp = md5(stripped)
            if filedb.has_key(tit_comp):
                val = 0  # already fetched on a previous run
            else:
                filedb.put(tit_comp)
        finally:
            # Close the handle even if filtering, hashing or the DB raises,
            # so a failure does not leave the file DB open.
            filedb.close()
    return val
Example #3
0
def match_ratio(file_db_path, title, lock):
    """Return a status code for ``title`` and remember it if it is new.

    Under ``lock``, opens the hash file DB, runs ``secondary_filter`` on the
    stripped title, and looks up the title's md5. An md5 that is not yet
    stored is inserted, including for titles the filter flagged.

    :param file_db_path: path passed to ``FileBsd`` for the hash file DB.
    :param title: title text; surrounding whitespace is stripped first.
    :param lock: context manager guarding access to the file DB.
    :return: ``0`` when the md5 is already present (overrides the filter
        result), ``2`` when ``secondary_filter`` flags the title, else ``1``.
    """
    clean_title = title.strip()
    with lock:
        filedb = FileBsd('hash', file_db_path)
        try:
            val = 1  # normal
            if secondary_filter(clean_title):
                val = 2  # filtered out

            tit_comp = md5(clean_title)
            if filedb.has_key(tit_comp):
                val = 0  # already fetched on a previous run
            else:
                filedb.put(tit_comp)
        finally:
            # Guarantee the DB handle is released on every exit path.
            filedb.close()
    return val
Example #4
0
def filter_titles(db_path):
    """Fill the title hash DB with md5 digests of the current month's titles.

    Builds a ``date`` range covering the current calendar month, queries the
    ``finance_news_all`` collection with it, and puts the md5 of every
    returned document's ``title`` into the hash file DB at ``db_path``.

    :param db_path: path passed to ``FileBsd`` for the hash file database.
    """
    import calendar

    # Read the clock once so the year/month pair is always consistent.
    today = date.today()
    year_mon = '%04d%02d' % (today.year, today.month)
    # Leap-year aware month length (a static table would miss Feb 29).
    days_in_month = calendar.monthrange(today.year, today.month)[1]

    # Bounds are 14-digit integers, presumably YYYYMMDDHHMMSS (confirm with
    # the collection schema). The end bound is the last second of the month.
    month_start = long('%s01000000' % year_mon)
    month_end = long('%s%02d235959' % (year_mon, days_in_month))
    condition = {
        'date': {
            '$gte': month_start,
            '$lte': month_end,
        }
    }

    coll = Mongodb('192.168.0.212', 27017, 'arvin', 'finance_news_all')
    try:
        print('db `title` is loading now, waiting .......')

        filedb = FileBsd('hash', db_path)
        try:
            for doc in coll.query(condition):
                # Skip documents that cannot be hashed or stored rather than
                # aborting the whole preload.
                try:
                    filedb.put(md5(doc['title']))
                except Exception as e:
                    print('filter_titles error: %s' % (e,))
        finally:
            # Release the file DB even when the query or iteration fails.
            filedb.close()
    finally:
        coll.disconnect()

    print('title filter loading finished')