create index site_index on filtered_data_index(site_type); create index keyword_index on filtered_data_index(keyword); create index timestamp_index on filtered_data_index(timestamp); create index site_sum_index on sumdata_byday(site_type); create index keyword_sum_index on sumdata_byday(keyword); create index timestamp_sum_index on sumdata_byday(timestamp); """ import os, sys from datetime import datetime from dbutils import DBUtils from strutils import get_timestamp print datetime.now() if len(sys.argv) < 2: timestamp = get_timestamp(day_delta=1) else: timestamp = sys.argv[1] db = DBUtils() c = db.execute_sql( "select count(*),site_type,keyword,timestamp where timestamp=%s group by keyword,site_type" % timestamp ) for one in c.fetchall(): count, site_type, timestamp = one values = {"timestamp": timestamp, "count": count, "keyword": keyword, "site_type": site_type} db.insert("sumdata_byday", values) db.close()
blogt varchar(255) NULL, date DATE NULL, time TIME NULL, url varchar(255) NULL, keyword varchar(255) NULL, title varchar(255) NULL, article TEXT NULL );\n create index site_index_%(timestamp)s on %(table_name)s(site);\n create index date_index_%(timestamp)s on %(table_name)s(date);\n """ DATA_ROOT = "/data1/dspider/data/bak/%s/" % timestamp db = DBUtils() db.execute_sql(CREATE_SQL % {"table_name": table_name, "timestamp": timestamp}) db.close() db = DBUtils() for pathname in os.listdir(DATA_ROOT): path = os.path.join(DATA_ROOT, pathname) for filename in os.listdir(path): data_file = os.path.join(path, filename) try: f = open(data_file, "r") column = {} while 1: line = f.readline() if not line: f.close() break