def filter_titles(db_path):
    """Seed the title file DB with keys for this month's stored documents.

    Queries the ``finance_news_all`` collection for documents whose ``date``
    falls within the current calendar month and stores ``md5(doc['title'])``
    for each one in the hash file DB at ``db_path``.

    NOTE(review): this function is defined a second time later in this
    file; the later definition is the one that is bound at import time.

    :param db_path: path handed to ``FileBsd('hash', ...)``.
    """
    import calendar  # local import: the file's import block is not visible here

    # Take a single snapshot of "today" so the month prefix and the day count
    # cannot disagree if the call straddles a month rollover.
    today = date.today()
    year_mon = today.strftime('%Y%m')
    # monthrange() accounts for leap years (a fixed table missed Feb 29).
    last_day = calendar.monthrange(today.year, today.month)[1]

    # Bounds are built as 14-digit numbers, presumably YYYYMMDDHHMMSS to
    # match the stored ``date`` field -- TODO confirm against the schema.
    # The upper bound is 23:59:59 on the last day; it used to be 23:23:59,
    # which dropped the last 36 minutes of the month.
    condition = {
        'date': {
            '$gte': long(year_mon + '01000000'),
            '$lte': long('%s%02d235959' % (year_mon, last_day)),
        }
    }

    # Arguments are presumably host, port, database, collection.
    coll = Mongodb('192.168.0.212', 27017, 'arvin', 'finance_news_all')
    try:
        print('db `title` is loading now, waiting .......')
        filedb = FileBsd('hash', db_path)
        try:
            for doc in coll.query(condition):
                try:
                    filedb.put(md5(doc['title']))
                except Exception as e:
                    # Best effort: one bad document must not abort the load.
                    print('filter_titles error: %s' % (e,))
        finally:
            filedb.close()
    finally:
        # Release the connection even when the query or the file DB fails.
        coll.disconnect()
    print('title filter loading finished')
def match_ratio(file_db_path, title, lock):
    """Classify a title against the file DB of already-seen titles.

    The stripped title is checked with ``secondary_filter`` and its
    ``md5(...)`` key is looked up in the hash file DB at ``file_db_path``.
    A key that is not yet present is stored, including when the title was
    rejected by the secondary filter.

    :param file_db_path: path handed to ``FileBsd('hash', ...)``.
    :param title: title text; surrounding whitespace is ignored.
    :param lock: context manager held for the whole file DB access
        (presumably a lock shared between workers -- confirm with callers).
    :return: ``0`` if the key was already in the file DB, ``2`` if the title
        is new but ``secondary_filter`` flagged it, ``1`` otherwise.
    """
    val = 1  # normal: new title that passed the secondary filter
    stripped = title.strip()
    with lock:
        filedb = FileBsd('hash', file_db_path)
        try:
            if secondary_filter(stripped):
                val = 2  # filtered out
            tit_comp = md5(stripped)
            if filedb.has_key(tit_comp):
                val = 0  # already crawled on a previous run
            else:
                filedb.put(tit_comp)
        finally:
            # Close the file DB even if the filter, lookup or insert raises.
            filedb.close()
    return val
def filter_titles(db_path):
    """Seed the title file DB with keys for this month's stored documents.

    Queries the ``finance_news_all`` collection for documents whose ``date``
    falls within the current calendar month and stores ``md5(doc['title'])``
    for each one in the hash file DB at ``db_path``.

    NOTE(review): an earlier definition of this function exists in this
    file; this later one replaces it at import time.

    :param db_path: path handed to ``FileBsd('hash', ...)``.
    """
    import calendar  # local import: the file's import block is not visible here

    # Take a single snapshot of "today" so the month prefix and the day count
    # cannot disagree if the call straddles a month rollover.
    today = date.today()
    year_mon = today.strftime('%Y%m')
    # monthrange() accounts for leap years (a fixed table missed Feb 29).
    last_day = calendar.monthrange(today.year, today.month)[1]

    # Bounds are built as 14-digit numbers, presumably YYYYMMDDHHMMSS to
    # match the stored ``date`` field -- TODO confirm against the schema.
    # The upper bound is 23:59:59 on the last day; it used to be 23:23:59,
    # which dropped the last 36 minutes of the month.
    condition = {
        'date': {
            '$gte': long(year_mon + '01000000'),
            '$lte': long('%s%02d235959' % (year_mon, last_day)),
        }
    }

    # Arguments are presumably host, port, database, collection.
    coll = Mongodb('192.168.0.212', 27017, 'arvin', 'finance_news_all')
    try:
        print('db `title` is loading now, waiting .......')
        filedb = FileBsd('hash', db_path)
        try:
            for doc in coll.query(condition):
                try:
                    filedb.put(md5(doc['title']))
                except Exception as e:
                    # Best effort: one bad document must not abort the load.
                    print('filter_titles error: %s' % (e,))
        finally:
            filedb.close()
    finally:
        # Release the connection even when the query or the file DB fails.
        coll.disconnect()
    print('title filter loading finished')