def thread_fetch(thread_codes, thread_name, logger):
    my = MYSQL(thread_name)
    update_sql = "UPDATE fund SET updated=False WHERE code=%s;"
    code_set = thread_codes
    logger.info("%s => START CLEAR UPDATED FIELD!" % thread_name)
    my.update_many_data(update_sql, code_set)
    logger.info("%s => FINISH CLEAR UPDATED FIELD!" % thread_name)
    logger.info("%s => START CRAWLING URL DATA!" % thread_name)
    datas = []
    counter = 1
    for code in thread_codes:
        logger.info("%s => %s => ** %s ** START PARSE DATA ..." %
                    (thread_name, counter, code))
        time.sleep(1)
        url = "%s%s.html" % (WEB_URL, code)  ##value
        try:
            body = urllib2.urlopen(url).read()
            soup = BeautifulSoup(body, "lxml")
        except Exception as e:
            logger.error("%s => %s => ** %s ** URL OPEN FAIL :%s" %
                         (thread_name, counter, code, str(e)))
        else:
            model = prase_content(soup, code)
            ## append data
            if model:
                datas.append(model.get_model_tuple())
            else:
                logger.error("%s => %s => ** %s ** PARSE DATA FAIL ..." %
                             (thread_name, counter, code))
        finally:
            counter += 1  # advance the per-code counter used in the log lines
def get_codes(style, c):
    codes = []
    if style == "all":
        main_sl = MYSQL("main")
        codes = main_sl.get_datas("select code from fund;")
        codes = [code[0] for code in codes]
        main_sl.close()
    elif style == "patch":
        patch_sl = MYSQL("patch")
        codes = patch_sl.get_datas(
            "select code from fund where updated=False;")
        codes = [code[0] for code in codes]
        patch_sl.close()
    elif style == "one":
        codes.append(c)
    return codes
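# Usage sketch for get_codes(); "000001" is a hypothetical fund code used
# only for illustration:
patch_codes = get_codes("patch", None)    # codes still flagged updated=False
single_code = get_codes("one", "000001")  # -> ["000001"]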
def init(self):
    # initialize the database connection for the selected data source
    if self.cj == '共济':
        try:
            from mysql import MYSQL
            import xb_1 as cv
            self.cv = cv
            self.ms = MYSQL(host=self.host, port=self.port, user=self.user,
                            pwd=self.pwd, db=self.db)
            self.flag = 1
        except Exception:
            self.flag = 0
            self.sinOut.emit(2000)
    elif self.cj == '中联':
        try:
            from mssql import MSSQL
            import zl_1 as cv
            self.cv = cv
            self.ms = MSSQL(host=self.host, port=self.port, user=self.user,
                            pwd=self.pwd, db=self.db)
            self.flag = 1
        except Exception:
            self.flag = 0
            self.sinOut.emit(2001)
    elif self.cj == '栅格':
        try:
            from mysql import MYSQL
            import sg_1 as cv
            self.cv = cv
            self.ms = MYSQL(host=self.host, port=self.port, user=self.user,
                            pwd=self.pwd, db=self.db)
            self.flag = 1
        except Exception:
            self.flag = 0
            self.sinOut.emit(2000)
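# The three branches above differ only in driver class, converter module,
# and emitted error code. A table-driven sketch of the same dispatch --
# init_sketch is a hypothetical name, and it assumes self.cj is always one
# of the three known sources:
_SOURCES = {
    '共济': ('mysql', 'MYSQL', 'xb_1', 2000),
    '中联': ('mssql', 'MSSQL', 'zl_1', 2001),
    '栅格': ('mysql', 'MYSQL', 'sg_1', 2000),
}

def init_sketch(self):
    mod_name, cls_name, cv_name, err_code = _SOURCES[self.cj]
    try:
        self.cv = __import__(cv_name)
        driver = getattr(__import__(mod_name), cls_name)
        self.ms = driver(host=self.host, port=self.port, user=self.user,
                         pwd=self.pwd, db=self.db)
        self.flag = 1
    except Exception:
        self.flag = 0
        self.sinOut.emit(err_code)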
# -*- coding: utf-8 -*-
import re
import sqlite3

from bs4 import BeautifulSoup
from mysql import MYSQL

WEB_URL = "http://fund.eastmoney.com/"

fo = open("./allfund.html").read()
regex = re.compile(r'href="http://fund\.eastmoney\.com/\d+\.html')
funds = re.findall(regex, fo)
web_codes = [f.split("/")[3][:-5] for f in funds]
web_codes = list(set(web_codes))
print " >> %4s codes pulled from the web page!" % len(web_codes)

my = MYSQL("pull")
db_codes_sql = '''select code from fund;'''
db_codes = my.get_datas(db_codes_sql)
db_codes = [code[0] for code in db_codes]
print " >> %4s codes in database fund!" % len(db_codes)

data_update = []
data_insert = []
for code in web_codes:
    url = "%s%s.html" % (WEB_URL, code)
    if code not in db_codes:
        data_insert.append((code, url))
    else:
        data_update.append(('niuniu', code))
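# The snippet ends before the write-back. A hedged sketch of the bulk
# statements, using the update_many_data() executemany-style helper these
# scripts call elsewhere; the insert column list and the update's target
# field are assumptions based only on the tuples built above:
insert_sql = "INSERT INTO fund (code, url) VALUES (%s, %s);"  # assumed columns
my.update_many_data(insert_sql, data_insert)
update_sql = "UPDATE fund SET owner=%s WHERE code=%s;"        # assumed field for 'niuniu'
my.update_many_data(update_sql, data_update)
my.close()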
import sys

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta

sys.path.append('core')
from mysql import MYSQL

pd.options.mode.chained_assignment = None

# NOTE: the '@' characters inside the credentials should be percent-encoded
# for SQLAlchemy to parse this URL correctly.
db_connection = 'mysql+pymysql://douzi@traffic110:1qaz!QAZ2wsx@[email protected]/traffic'
# db_connection = 'mysql+pymysql://root:@localhost/accident'
conn = create_engine(db_connection)

df = pd.read_sql("""
    SELECT a.ACCIDENT_NO, ACCIDENTDATE, ACCIDENTTIME, n.Lat, n.Long, a.SPEED_ZONE
    FROM accident AS a
    LEFT JOIN node AS n ON a.ACCIDENT_NO = n.ACCIDENT_NO
""", conn)

ms = MYSQL(host="traffic110.mysql.database.azure.com", user="******",
           pwd="1qaz!QAZ2wsx@WSX", db="traffic")
# ms = MYSQL(host="localhost", user="******", pwd="", db="accident")

df['ACCIDENTDATE'] = pd.to_datetime(df['ACCIDENTDATE'])
df['ACCIDENTDATE_FOMAT'] = df['ACCIDENTDATE'].dt.strftime('%Y%m%d')
df['ACCIDENTTIME'] = pd.to_datetime(df['ACCIDENTTIME']).dt.strftime('%H:00:00')
df['Datetime'] = pd.to_datetime(df['ACCIDENTDATE'].apply(str) + ' ' + df['ACCIDENTTIME'])
df['Timestrap'] = df['Datetime'].values.astype(np.int64) // 10 ** 9

# Example: https://api.weather.com/v1/geocode/-37.688/144.841/observations/historical.json?apiKey=6532d6454b8aa370768e63d6ba5a832e&startDate=20060101&endDate=20060102&units=e
base_url = 'https://api.weather.com/v1/geocode/'
format_url = '/observations/historical.json?units=e'
api_key = '&apiKey=6532d6454b8aa370768e63d6ba5a832e'


def nearest(items, pivot):
    # return the item closest to pivot (e.g. the observation nearest a crash time)
    return min(items, key=lambda x: abs(x - pivot))
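# How the URL fragments above combine, following the example URL in the
# comment; the lat/lon pair and the dates are illustrative only:
lat, lon = -37.688, 144.841
history_url = '%s%s/%s%s&startDate=%s&endDate=%s%s' % (
    base_url, lat, lon, format_url, '20060101', '20060102', api_key)

# nearest() then picks the hourly observation closest to a crash time:
obs_hours = [datetime(2006, 1, 1, h) for h in (9, 12, 15)]
print nearest(obs_hours, datetime(2006, 1, 1, 13, 40))  # 2006-01-01 15:00:00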
logger.info("%s => START UPDATE DATABSE ..." % thread_name) my.update_many_data(SQL, datas) except Exception, e: logger.error("%s => UPDATE DATABSE FAIL :%s" % (thread_name, str(e))) else: logger.info("%s => UPDATE DATABSE SUCCESS!" % thread_name) finally: logger.info("%s => UPDATE DATABSE FINISH!!!!" % thread_name) my.close() if __name__ == '__main__': style = sys.argv[1] codes = [] if style == "all": main_sl = MYSQL("main") codes = main_sl.get_datas("select code from fund;") codes = [code[0] for code in codes] main_sl.close() elif style == "patch": patch_sl = MYSQL("patch") codes = patch_sl.get_datas( "select code from fund where updated=False;") codes = [code[0] for code in codes] patch_sl.close() elif style == "one": code = sys.argv[2] codes.append(code) total = len(codes) threads = []
import re
import multiprocessing

import pandas as pd

from settings import *
from utils import *
from mysql import MYSQL

data = pd.read_csv('repo.csv', header=0)
repo_names = data[0:-1]['name']
repo_ids = data[0:-1]['id']
user_names = data[0:-1]['owner']
mysql = MYSQL()


# remove emoji
def filter_emoji(text):
    try:
        re_emoji = re.compile(
            u'['
            u'\U0001F300-\U0001F64F'
            u'\U0001F680-\U0001F6FF'
            u'\u2600-\u2B55'
            u'\u23cf'
            u'\u23e9'
            u'\u231a'
            u'\u3030'
            u'\ufe0f'
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u'\U00010000-\U0010ffff'
            u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
            u']+')                    # assumed closing of the truncated pattern
        return re_emoji.sub(u'', text)
    except re.error:
        return text  # assumed fallback if the wide-unicode ranges fail to compile
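# Usage sketch (the sample string is illustrative):
print filter_emoji(u'fast \U0001F680 and stable')  # -> u'fast  and stable'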
# -*- coding: utf-8 -*-
import operator

from mysql import MYSQL

my = MYSQL("analyse")
FIELDS = ["one_month", "three_month", "six_month", "one_year", "three_year"]


def query_field(field, condition):
    sql = "select %s from fund where %s;" % (field, condition)
    return my.get_datas(sql)


def query_infos(type, count, field):
    sql = ("select code,owner,name,level from fund "
           "where type like '%s' order by %s desc limit %s;" % (type, field, count))
    return my.get_datas(sql)


def analyse_percent(end):
    type = "%债%"
    count = "50"
    codes = {}
    funds = []
    time = "+".join(FIELDS[0:end])
    result = set()
    for field in FIELDS[0:end]:
        infos = query_infos(type, count, field)
        # funds
        for info in infos:
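# For example, query_infos("%债%", "50", "one_month") issues the following
# statement (plain string interpolation, so callers must pass trusted values):
#
#   select code,owner,name,level from fund
#   where type like '%债%' order by one_month desc limit 50;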
df_negative['smoke'] = 0
df_negative['dust'] = 0
df_negative['strong_winds'] = 0
df_negative['wind_dir'] = ''
df_negative['wind_speed'] = 0
df_negative['temperature'] = 0
df_negative['SURFACE_COND'] = 0
df_negative['NODE_TYPE'] = ''
df_negative['Deg_Urban_Name'] = ''
df_negative['target'] = 0
df_negative['accident_counts'] = 0
df_negative['date'] = df_negative['timestamp'].dt.strftime('%Y-%m-%d')

# ms = MYSQL(host="localhost", user="******", pwd="", db="accident")
ms = MYSQL(host="traffic110.mysql.database.azure.com", user="******",
           pwd="1qaz!QAZ2wsx@WSX", db="traffic")

for i in range(df_negative.shape[0]):
    # for i in range(2):
    # print(df_negative.iloc[i])
    getRoad = ms.ExecQuery("""
        SELECT SPEED_ZONE, Light_Condition, ROAD_TYPE, DIRECTION_LOCATION,
               snowing, raining, foggy, smoke, dust, strong_winds, wind_dir,
               wind_speed, temperature, SURFACE_COND, NODE_TYPE, Deg_Urban_Name
        FROM positive_feature
        WHERE ACCIDENTDATE = '%s' AND Route_No = '%s'
        LIMIT 1
    """ % (df_negative['date'][i], df_negative['Route_No'][i]))
    if getRoad:
        df_negative['SPEED_ZONE'][i] = getRoad[0][0]
        # if
def fetch(start, end, thread_name, logger):
    my = MYSQL(thread_name)
    for url in urls[start:end]:
        time.sleep(1)
        model = {}  # collected fields for this url
        ##value
        try:
            body = urllib2.urlopen(url).read()
            soup = BeautifulSoup(body, "lxml")
        except Exception as e:
            logger.error("%s => FAIL :%s" % (url, str(e)))
        else:
            try:
                title = soup.find("title")
                model['name'] = title.text.split('(')[0]
                item01 = soup.find("dl", class_="dataItem01")
                model['evaluate_value'] = item01.contents[1].contents[0].text
                model['increase_value'] = item01.contents[1].contents[2].contents[0].text
                model['increase_percent'] = item01.contents[1].contents[2].contents[1].text[:-1]
                model['one_month'] = item01.contents[2].contents[1].text[:-1]
                model['one_year'] = item01.contents[3].contents[1].text[:-1]
                item02 = soup.find("dl", class_="dataItem02")
                model['per_value'] = item02.contents[1].contents[0].text
                model['per_value_percent'] = item02.contents[1].contents[1].text[:-1]
                model['three_month'] = item02.contents[2].contents[1].text[:-1]
                model['three_year'] = item02.contents[3].contents[1].text[:-1]
                item03 = soup.find("dl", class_="dataItem03")
                model['total_value'] = item03.contents[1].contents[0].text
                model['six_month'] = item03.contents[2].contents[1].text[:-1]
                model['till_now'] = item03.contents[3].contents[1].text[:-1]
                tables = soup.find_all("table")
                model['type'] = tables[2].contents[0].contents[0].text.split("|")[0]
                model['size'] = tables[2].contents[0].contents[1].contents[1][1:]
                model['manager'] = tables[2].contents[0].contents[2].contents[1].text
                model['start_date'] = tables[2].contents[1].contents[0].contents[1][1:]
                model['owner'] = tables[2].contents[1].contents[1].contents[2].text
                #model['level'] = tables[2].contents[1].contents[2].contents[2].text
                level = tables[2].contents[1].contents[2].contents[2].attrs['class'][0]
                if len(level) > 4:
                    model['level'] = level[4]
                else:
                    model['level'] = 0
            except IndexError as e:
                # alternate page layout: parse the fundInfoItem block instead
                infoItem = soup.find("div", class_="fundInfoItem")
                model['wan_get'] = infoItem.contents[0].contents[0].contents[1].text
                model['seven_get'] = infoItem.contents[0].contents[2].contents[1].text[:-1]
                model['fourting_get'] = infoItem.contents[0].contents[4].contents[1].text[:-1]
                model['two_eghit_get'] = infoItem.contents[0].contents[6].contents[1].text[:-1]
                model['one_month'] = infoItem.contents[1].contents[0].contents[0].contents[1].text[:-1]
                model['one_year'] = infoItem.contents[1].contents[0].contents[1].contents[1].text[:-1]
                model['three_month'] = infoItem.contents[1].contents[1].contents[0].contents[1].text[:-1]
                model['three_year'] = infoItem.contents[1].contents[1].contents[1].contents[1].text[:-1]
                model['six_month'] = infoItem.contents[1].contents[2].contents[0].contents[1].text[:-1]
                model['till_now'] = infoItem.contents[1].contents[2].contents[1].contents[1].text[:-1]
                tables = soup.find_all("table")
                model['type'] = tables[2].contents[0].contents[0].text.split("|")[0]
                model['size'] = tables[2].contents[0].contents[1].contents[1][1:]
                model['manager'] = tables[2].contents[0].contents[2].contents[1].text
                model['start_date'] = tables[2].contents[1].contents[0].contents[1][1:]
                model['owner'] = tables[2].contents[1].contents[1].contents[2].text
                #model['level'] = tables[2].contents[1].contents[2].contents[2].text
                level = tables[2].contents[1].contents[2].contents[2].attrs['class'][0]
                if len(level) > 4:
                    model['level'] = level[4]
                else:
                    model['level'] = 0
            except Exception as e:
                logger.error("%s => FAIL :%s" % (url, str(e)))
                model['level'], url)
            ## insert data
            try:
                my.insert_data(sql)
            except Exception as e:
                logger.error("%s => FAIL :%s" % (url, str(e)))
            else:
                logger.debug("%s => OK" % url)
    my.close()


if __name__ == '__main__':
    style = sys.argv[1]
    urls = []
    if style == "all":
        main_sl = MYSQL("main")
        us = main_sl.get_datas("select url from fund;")
        urls = [url[0] for url in us]
        main_sl.close()
    elif style == "patch":
        urls = check(True)
    elif style == "one":
        url = sys.argv[2]
        urls.append(url)

    total = len(urls)
    threads = []
    exec_shell_result("rm -rf /var/log/crawler/*.*")
    for i in range(100):
        my_urls_start = i * 100
        my_urls_end = i * 100 + 100
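        # The excerpt stops here; a hedged sketch of the fan-out these slice
        # bounds imply, reusing fetch() above. The thread name format, the
        # logger value, and the final join loop are assumptions, and
        # `import threading` is needed at the top of the file:
        t = threading.Thread(target=fetch,
                             args=(my_urls_start, my_urls_end,
                                   "thread-%02d" % i, logger))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()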