# coding=utf-8 import os import MySQLdb from hashlib import md5 from BeautifulSoup import BeautifulSoup import sys sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from services.db import MySQLdbWrapper _db = MySQLdbWrapper() def init_file(file): dic = { 'id': file[0], 'name': file[1], 'icon_link': file[2], 'icon_path': file[3], 'source': file[4], 'source_link': file[5], 'rating': file[6], 'version': file[7], 'developer': file[8], 'sdk_support': file[9], 'category': file[10], # 'screen_support':file[11], 'screen_support': None, 'apk_size': file[12], 'language': file[13], 'publish_date': file[14], 'downloads': file[15],
_seed_url_list = [] # check and collect valid url for i in range(1000): url = _base_url % ('2', str(i + 1)) _seed_url_list.append(url) for i in range(2000): url = _base_url % ('1', str(i + 1)) _seed_url_list.append(url) import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from services.db import MySQLdbWrapper cursor = MySQLdbWrapper().cursor() insert_count = 0 update_count = 0 for seed_url in _seed_url_list: # the priority maybe update to failed/missing select_sql = "select priority from new_link where source='zhushou.360.cn' and link='%s'" % seed_url cursor.execute(select_sql) results = cursor.fetchall() if len(results) == 0: insert_sql = "insert into new_link (id, source, link, last_crawl, priority) values ('%s', 'zhushou.360.cn', '%s', 1, 10);" % ( md5(seed_url).hexdigest().upper(), seed_url) cursor.execute(insert_sql) _conn.commit() insert_count += 1 # if priority <> 10, the link has been reported as failed or missing etc. , so update it to normal
# Daily crawl report (Python 2): counts distinct packages ingested per source
# over the previous calendar day, joining app rows to final_app rows.
import MySQLdb
import smtplib
import time
import datetime
import os
import pickle
from platform import node
import sys

# Python 2 idiom: force the process-wide default encoding to UTF-8 so
# implicit str/unicode conversions do not raise UnicodeDecodeError.
# getattr() is used because reload(sys) is required to re-expose
# setdefaultencoding, which site.py normally deletes.
reload(sys)
getattr(sys, 'setdefaultencoding')('utf-8')
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from services.db import MySQLdbWrapper

_db = MySQLdbWrapper()
cursor = _db.cursor()
results = []
# Unix timestamp 24h ago -- kept for the commented-out tag-based query below.
last_date = int(time.time()) - 60 * 60 * 24
# [date_from, date_to) spans yesterday at midnight granularity.
date_from = datetime.datetime.strptime(
    (datetime.datetime.now() - datetime.timedelta(days=1)).strftime('%Y%m%d'), '%Y%m%d')
date_to = date_from + datetime.timedelta(days=1)
#now = datetime.datetime.now()
#last_date = int(datetime.datetime.strptime(now.strftime('%Y-%m-%d'), '%Y-%m-%d').strftime('%s'))
#sql = 'SELECT a.source,COUNT(DISTINCT b.package_name) AS count_p FROM app a JOIN final_app b ON a.source_link = b.source_link where a.tag > %d GROUP BY a.source;' % last_date
# Parameterized query: the driver substitutes date_from/date_to safely.
sql = 'SELECT a.source,COUNT(DISTINCT b.package_name) AS count_p FROM app a JOIN final_app b ON a.source_link = b.source_link where b.created_at between %s and %s GROUP BY a.source;'
cursor.execute(sql, (date_from, date_to))
results_all = cursor.fetchall()
def get_db():
    """Return the lazily-created MySQLdbWrapper cached on the module-level
    ``_db`` holder, constructing it on first use."""
    # Guard clause: attach the wrapper once, then always hand back the
    # cached instance.
    if not hasattr(_db, 'db'):
        _db.db = MySQLdbWrapper()
    return _db.db
# Daily crawl report (Python 2): counts distinct packages ingested per source
# over the previous calendar day.  Near-duplicate of the sibling report chunk
# in this file, minus the explicit MySQLdb import.
import smtplib
import time
import datetime
import os
import pickle
from platform import node
import sys

# Python 2 idiom: force the process-wide default encoding to UTF-8 so
# implicit str/unicode conversions do not raise UnicodeDecodeError.
# getattr() is used because reload(sys) is required to re-expose
# setdefaultencoding, which site.py normally deletes.
reload(sys)
getattr(sys, 'setdefaultencoding')('utf-8')
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from services.db import MySQLdbWrapper

_db = MySQLdbWrapper()
cursor = _db.cursor()
results = []
# Unix timestamp 24h ago -- kept for the commented-out tag-based query below.
last_date = int(time.time()) - 60 * 60 * 24
# [date_from, date_to) spans yesterday at midnight granularity.
date_from = datetime.datetime.strptime(
    (datetime.datetime.now() - datetime.timedelta(days=1)).strftime('%Y%m%d'), '%Y%m%d')
date_to = date_from + datetime.timedelta(days=1)
#now = datetime.datetime.now()
#last_date = int(datetime.datetime.strptime(now.strftime('%Y-%m-%d'), '%Y-%m-%d').strftime('%s'))
#sql = 'SELECT a.source,COUNT(DISTINCT b.package_name) AS count_p FROM app a JOIN final_app b ON a.source_link = b.source_link where a.tag > %d GROUP BY a.source;' % last_date
# Parameterized query: the driver substitutes date_from/date_to safely.
sql = 'SELECT a.source,COUNT(DISTINCT b.package_name) AS count_p FROM app a JOIN final_app b ON a.source_link = b.source_link where b.created_at between %s and %s GROUP BY a.source;'
cursor.execute(sql, (date_from, date_to))
results_all = cursor.fetchall()
_seed_url_list = [] # check and collect valid url for i in range(1000): url = _base_url % ('2', str(i + 1)) _seed_url_list.append(url) for i in range(2000): url = _base_url % ('1', str(i + 1)) _seed_url_list.append(url) import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from services.db import MySQLdbWrapper cursor = MySQLdbWrapper().cursor() insert_count = 0 update_count = 0 for seed_url in _seed_url_list: # the priority maybe update to failed/missing select_sql = "select priority from new_link where source='zhushou.360.cn' and link='%s'" % seed_url cursor.execute(select_sql) results = cursor.fetchall() if len(results) == 0: insert_sql = "insert into new_link (id, source, link, last_crawl, priority) values ('%s', 'zhushou.360.cn', '%s', 1, 10);" % ( md5(seed_url).hexdigest().upper(), seed_url) cursor.execute(insert_sql) _conn.commit() insert_count += 1