def do_import_rtq(db, password='', newfileonly=False):
    loader = FileLoader()
    db = DB(password=password, db=db)
    db.create_table_deadlink()
    newdate = str(db.get_last_day('deadlink'))
    if newdate == 'None':
        # Empty table: there is no last import date, so import every file.
        newfileonly = False
    for file in utils.dir_listfile('result'):
        rawdatestr = utils.stripDateStr(file).group(1)
        datestr = utils.parseDateString(rawdatestr)
        if newfileonly and datestr <= newdate:
            # File is no newer than the last imported day; skip it.
            print datestr, 'skipped'
            continue
        lines = loader.load(file, 7)
        print 'starting insert lines from', file, 'datetime is', datestr
        db.inserts_deadlink(lines, date=datestr)
        print 'insert completed'
def do_import_rcu(db, password='', newfileonly=False):
    loader = FileLoader()
    db = DB(password=password, db=db)
    db.create_table_deadlink_classify()
    cates = ['aladdin', 'h5', 'lightaap', 'normal', 'siteapp', 'tc']
    newdate = str(db.get_last_day('deadlink'))
    if newdate == 'None':
        # Empty table: there is no last import date, so import every file.
        newfileonly = False

    def httpcodeNot200(line):
        # A link counts as dead when its HTTP status column is not 200.
        return line[2] != '200'

    for category in cates:
        for file in utils.dir_listfile(
                'result',
                subdirprefix='result_spider_random_classfiy_url',
                fileSubPrefix='result_spider_10000_' + category):
            datestr = utils.getDateFromStr(file)
            if newfileonly and datestr <= newdate:
                # File is no newer than the last imported day; skip it.
                print datestr, 'skipped'
                continue
            lines = loader.load(file, 5)
            try:
                filteredlines = list(itertools.ifilter(httpcodeNot200, lines))
            except TypeError, e:
                print >> sys.stderr, e.args
                continue
            print 'deads', len(filteredlines), \
                'starting insert lines from', file, 'datetime is', datestr
            db.inserts_deadlink_classify(filteredlines, cls=category,
                                         date=datestr)
            print 'insert completed'
def do_test(db, password=''):
    db = DB(password=password, db=db)
    ret = db.get_last_day('classify')
    print 'last day is', ret
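
# Illustrative entry point (a sketch, not part of the original module):
# shows how the import routines above might be invoked in sequence. The
# 'spider' database name and the use of getpass are assumptions here.
if __name__ == '__main__':
    import getpass
    pw = getpass.getpass('db password: ')
    do_import_rtq('spider', password=pw, newfileonly=True)
    do_import_rcu('spider', password=pw, newfileonly=True)
    do_test('spider', password=pw)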