def crawl_content_in_queue(q):
    '''Fetch the content for each url taken from the queue.'''
    conn = get_db_conn()
    cursor = conn.cursor()
    while True:
        try:
            url = q.get(block=True, timeout=3)
        except Empty:
            # queue is empty, retry
            print('queue empty.retry...')
            continue
        if url is None:
            # sentinel value: all work is done
            print('done')
            break
        else:
            print('url=%s' % url)
            page = get_page('http://' + url)
            row = parse_content(page)
            # store the parsed page into the database
            if row['title'] != '' and row['content'] != '':
                command = 'SELECT insert_page_content($PGDATA$%s$PGDATA$,' + \
                    '$PGDATA$%s$PGDATA$,$PGDATA$%s$PGDATA$,$PGDATA$%s$PGDATA$,$PGDATA$%s$PGDATA$);'
                cursor.execute(command % (url, row['title'], row['content'],
                                          row['keywords'], row['tags']))
                is_duplicate = not cursor.fetchall()[0][0]
                if is_duplicate:
                    print('already exists. url=%s' % url)
                conn.commit()
            else:
                print('empty.url=%s' % url)
    conn.close()
def crawl_content_in_db():
    '''Fetch the content for every url stored in the db.'''
    conn = get_db_conn()
    cursor = conn.cursor()
    cursor.execute('SELECT * FROM url_to_be_get;')
    url_table = cursor.fetchall()
    q = Queue()
    for row in url_table:
        q.put(row[0])
    # one None sentinel per worker thread so every consumer can exit
    for i in range(THREAD_NUM):
        q.put(None)
    print('queue init done.')
    tlist = [
        threading.Thread(target=crawl_content_in_queue, args=(q, ))
        for i in range(THREAD_NUM)
    ]
    print('start thread')
    for t in tlist:
        t.start()
    print('wait for thread')
    for t in tlist:
        t.join()
    # the connection was only needed to read the url table
    conn.close()
def crawl_index(mode='start',
                start_date=dt.datetime(2018, 1, 1),
                end_date=dt.datetime.now()):
    '''
    mode(str): crawl mode, either 'start' or 'continue'. 'start' restarts the
        crawler from scratch, 'continue' resumes from the data already stored.
    start_date(dt.datetime): when mode is 'start', the first date to crawl news for.
    Note: both date defaults are evaluated once, when the function is defined.
    '''
    tlist = []
    conn = get_db_conn()
    cursor = conn.cursor()
    if mode == 'start':
        lid_list = get_lid_list()
        etime = get_time_num(start_date)
        for lid in lid_list:
            cursor.execute('INSERT INTO index_to_be_get VALUES(%d,%d,%d);' %
                           (etime, lid, 1))
            tlist.append(
                threading.Thread(target=crawl_l,
                                 args=(lid, start_date, 1, end_date)))
        conn.commit()
    elif mode == 'continue':
        cursor.execute('SELECT * FROM index_to_be_get;')
        data = cursor.fetchall()
        for etime, lid, page in data:
            s = num2date(etime)
            tlist.append(
                threading.Thread(target=crawl_l,
                                 args=(lid, s, page, end_date)))
    conn.close()
    for t in tlist:
        t.start()
    for t in tlist:
        t.join()
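# A minimal driver sketch showing how the two entry points above fit together.
# It assumes crawl_index(), crawl_content_in_db() and their helpers live in
# this same module; the dates below are placeholders, not values from the source.
import datetime as dt

if __name__ == '__main__':
    # first run: rebuild the index tables from scratch for a date range
    crawl_index(mode='start',
                start_date=dt.datetime(2018, 1, 1),
                end_date=dt.datetime(2018, 2, 1))
    # then fetch the page content for every url collected into url_to_be_get
    crawl_content_in_db()
    # later runs can resume from whatever is left in index_to_be_get:
    # crawl_index(mode='continue', end_date=dt.datetime(2018, 2, 1))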
def get_wifi_strength_data(type=0):
    conn = utils.get_db_conn()
    cur = conn.cursor()
    # type 0 -> raw index table, otherwise -> the handled/cleaned table
    table = 'd_final_index' if type == 0 else 'd_final_handled'
    sql = 'SELECT wifi_db,COUNT(*) FROM %s GROUP BY wifi_db ORDER BY wifi_db DESC' % table
    print(sql)
    cur.execute(sql)
    x = []
    y = []
    for r in cur.fetchall():
        x.append(-r[0])  # flip the sign of wifi_db for plotting
        y.append(r[1])
    return x, y
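# A usage sketch for get_wifi_strength_data(): it plots the signal-strength
# histogram returned above. matplotlib is an assumption here; the project's
# other snippets use their own plot_and_save() helper instead.
import matplotlib.pyplot as plt

x, y = get_wifi_strength_data(type=0)
plt.bar(x, y)
plt.xlabel('wifi signal strength (sign-flipped wifi_db)')
plt.ylabel('count')
plt.savefig('./figs/wifi_strength.png')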
def main():
    # a = A()
    result_list = []
    a = {'a': 1, 'b': 2}
    yesterday = '20190417'
    dw_cur = get_db_conn().cursor()
    sql = 'select * from etl_ctrl_show where sp_eod_dt = %s ' % (yesterday)
    dw_cur.execute(sql)
    column = [val[0].lower() for val in dw_cur.description]
    print(column, type(column))
    rows = dw_cur.fetchall()
    for row in rows:
        print(row)
        result_dic = dict(zip(column, row))
        print(result_dic)
        result_list.append(result_dic)
def t(mall_id):
    conn = utils.get_db_conn()
    cur = conn.cursor()
    sql = "SELECT DAYOFWEEK(`timestamp`) w,HOUR(`timestamp`) h FROM behavior_pre WHERE shop_id='%s'" % mall_id
    cur.execute(sql)
    dow = [0 for x in range(0, 7)]
    time = [0 for x in range(0, 24)]
    for r in cur.fetchall():
        dow[r[0] - 1] += 1  # DAYOFWEEK() is 1-based
        time[r[1]] += 1
    # weekly distribution of purchase times
    plot_and_save([1, 2, 3, 4, 5, 6, 7],
                  dow,
                  './figs/消费时间周分布_%s.png' % mall_id,
                  title='消费时间周分布')
    # hourly distribution of purchase times
    plot_and_save([x for x in range(0, 24)],
                  time,
                  './figs/消费时间日分布_%s.png' % mall_id,
                  title='消费时间日分布')
def d():
    conn = utils.get_db_conn()
    cur = conn.cursor()
    sql = 'SELECT d,t,DAYOFWEEK(t) w,HOUR(t) h FROM (SELECT DISTINCT data_id d,time_stamp t FROM d_final_index) AS tmpr ORDER BY h,w'
    cur.execute(sql)
    dow = [0 for x in range(0, 7)]
    time = [0 for x in range(0, 24)]
    for r in cur.fetchall():
        dow[r[2] - 1] += 1
        time[r[3]] += 1
    plot_and_save([1, 2, 3, 4, 5, 6, 7], dow, './figs/消费时间周分布.png',
                  title='消费时间周分布')
    plot_and_save([x for x in range(0, 24)], time, './figs/消费时间日分布.png',
                  title='消费时间日分布')
def crawl_l(lid, start_date, start_page, end_date):
    '''
    Crawl the index of one column.
    lid(int): column id
    start_date(dt.datetime): first date to crawl
    start_page(int): first page to crawl
    end_date(dt.datetime): date to stop at
    '''
    conn = get_db_conn()
    cursor = conn.cursor()
    link_matcher = re.compile('https?://(.*)')
    page = start_page
    date = start_date
    while True:
        url = get_url(date, lid, page)
        print('crawling,date=%s,lid=%d,page=%d' % (str(date)[:10], lid, page))
        r = requests.get(url)
        r.raise_for_status()
        url_list = parse_index(r.text)
        # insert the collected urls into the database
        for content_url in url_list:
            link = link_matcher.match(content_url).group(1)
            cursor.execute('SELECT insert_url_to_be_get(\'%s\');' % link)
            result = cursor.fetchall()[0][0]
            if not result:
                print('already exists, url=%s' % link)
        if len(url_list) == 0:
            # move on to the next day's index
            date = date + dt.timedelta(days=1)
            page = 1
            if date >= end_date:
                break
        else:
            # fetch the next page of the same day
            page = page + 1
        # persist the crawler state
        etime = get_time_num(date)
        cursor.execute(
            'UPDATE index_to_be_get SET page=%d,etime=%d WHERE lid=%d;' %
            (page, etime, lid))
        conn.commit()
    conn.close()
def insert(sql_insert, lst_to_insert):
    passo = 700  # batch size: rows per INSERT statement
    sucess = True
    conn = utils.get_db_conn()
    # sql_update = "UPDATE `retorno_diario` SET `qtde_consultas` = {0} WHERE `id` = {1};"
    sql_topo = sql_insert
    contador = 0
    try:
        with conn.cursor() as cursor:
            sub_list = []
            for e in lst_to_insert:
                sub_list.append(e)
                contador += 1
                if contador == passo:
                    # reached the batch size: flush this chunk
                    sql = sql_topo + ", \n".join(sub_list)
                    cursor.execute(sql)
                    # print(sql + "\n")
                    sub_list.clear()  # clear the list to collect more rows
                    contador = 0
            # insert whatever is left over (less than one full batch)
            if len(sub_list) > 0:
                sql = sql_topo + ", \n".join(sub_list)
                cursor.execute(sql)
                # print(sql + "\n")
    except Exception as e:
        traceback.print_exc()
        sucess = False
        conn.rollback()
    else:
        conn.commit()
    finally:
        conn.close()
    return sucess
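# Usage sketch for insert(): the first argument is the INSERT prefix and every
# element of the second must already be a formatted "(...)" values tuple,
# because insert() simply comma-joins them onto the prefix in batches of 700.
# `demo_table` and its columns are hypothetical, not from the source.
sql_prefix = "INSERT INTO `demo_table` (`name`, `qty`) VALUES "
rows = ["('a', 10)", "('b', 25)", "('c', 7)"]
ok = insert(sql_prefix, rows)
print("insert ok:", ok)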
class EtlManager:
    '''
    def findChild(string, nodes, result_list):
        children = []
        temps = (a['producer_children'] for a in result_list
                 if a['producer_father'] == string)
        for temp in temps.split(","):
            child = Node(temp.lower(), None)
            EtlManager.findChild(temp.lower(), nodes, result_list)
            children.append(child)
        node = Node(string, children)
        nodes.append(node)
    '''


if __name__ == "__main__":
    cn = get_db_conn().cursor()
    ssql = "select * from etl_manager"
    cn.execute(ssql)
    result_list = []
    colunms = [colunm[0].lower() for colunm in cn.description]
    results = cn.fetchall()
    producer_father_list = []
    for result in results:
        producer_father_list.append(result[1].lower())
        result_dict = dict(zip(colunms, result))
        result_list.append(result_dict)
    # print(result_list[0])
    # convert the rows into tree nodes
    roots = []
    T = MultiTree('tree')
    for result in result_list:
        val = result['producer_father'].lower()
        T.add(TreeNode(val))
        for temp in result['producer_children'].split(","):
            T.add(TreeNode(temp.lower()), TreeNode(val))
    T.show_tree()
    set = []
    set = postorder(T.tree)
def get_db_conn(self):
    return utils.get_db_conn(self.tidb_host, self.tidb_port,
                             self.tidb_user, self.tidb_pass, self.tidb_db)
def get_db_conn(self):
    return utils.get_db_conn(self.memsql_host, self.memsql_port,
                             self.memsql_user, self.memsql_pass, self.memsql_db)
def atualizar():
    """
    Format: Open,High,Low,Close,Volume; the date is today's (fields ol2l3pv).
    Reference:
    http://wern-ancheta.com/blog/2015/04/05/getting-started-with-the-yahoo-finance-api/
    All symbols can be requested at once, and the answer comes back in the
    same order they were passed:
    http://finance.yahoo.com/d/quotes.csv?s=GOOGL,AAPL,MSFT,FB&f=abo
    http://finance.yahoo.com/d/quotes.csv?s=ABEV3.SA&f=ol2l3pv
    High and Low may be missing.
    """
    conn = utils.get_db_conn()
    agora = datetime.datetime.now().strftime(utils.formato_us)
    lst_insert = []
    lst_codigo = main.geraln_query('SELECT id, cod FROM `empresa`')
    codigos = [i[1] + ".SA" for i in lst_codigo]
    # since some rows were deleted, the ids may not be sequential,
    # so map each code back to its id explicitly
    mapas = {i[1] + ".SA": i[0] for i in lst_codigo}
    url = "http://finance.yahoo.com/d/quotes.csv?s=" + ",".join(codigos) + "&f=ol2l3pv"
    try:
        r = requests.get(url)
        if r.ok:
            conteudo = r.text.split("\n")
            for index, elemento in enumerate(conteudo):
                tmplst = elemento.split(',')
                if len(tmplst) != 5:
                    continue
                try:
                    _open = float(tmplst[0])
                except ValueError:
                    _open = 0.0
                try:
                    _high = float(tmplst[1])
                except ValueError:
                    _high = 0.0
                try:
                    _low = float(tmplst[2])
                except ValueError:
                    _low = 0.0
                try:
                    _close = float(tmplst[3])
                except ValueError:
                    # the close price is essential; abort if it is missing
                    print("This value should always exist (", codigos[index], ")")
                    raise Exception("The closing price is crucial")
                try:
                    _volume = int(tmplst[4])
                except ValueError:
                    _volume = 0
                dono = mapas[codigos[index]]
                lst_insert.append(["NULL", dono, agora, _open, _high,
                                   _low, _close, _volume])
        if len(lst_insert) > 0:
            main.biginsert(lst_insert, conn)
    finally:
        conn.close()
import os
import pickle
import pprint
import subprocess
import sys
import time

# close_server(), start_server(), get_db_conn() and get_plan() are assumed to
# be defined elsewhere in this script.
query_out_dir = sys.argv[1]
plan_pickle_path = sys.argv[2]

d = dict()
files = os.listdir(query_out_dir)
i = 0
for file_name in files:
    name = file_name.replace('.sql', '')
    with open(query_out_dir + '/' + file_name, 'r') as f:
        q_text = f.read()
    # restart the server before planning each query
    close_server()
    start_server()
    conn = get_db_conn()
    p = get_plan(conn, q_text)
    conn.close()
    d[name] = p
    pprint.pprint(p)
    if i % 100 == 0:
        print(i)
    i += 1

close_server()
pickle.dump(d, open(plan_pickle_path, 'wb'))
import errno
import logging
import traceback
import json
import urllib
import hashlib

import psycopg2

# user modules
import eventloop
import utils

BUF_SIZE = 32 * 1024

conn = utils.get_db_conn(True)

__all__ = ["ChatServer"]


class UserOnServer(object):
    _STATE = eventloop.EPOLL_IN | eventloop.EPOLL_ERR | eventloop.EPOLL_HUP

    def __init__(self, server, fd_to_users, userid_login_status, userid_to_fd,
                 loop, user_sock):
        self._closed = False
        self._userid = None
        self._logined = False
        self._readbuf = ""
        self._writebuf = ""
        self._server = server
def preenche_evento():
    """Scrapes the site and updates the `evento` table."""
    resposta = requests.get(url_eventos)
    lst_saida = []
    ir = False
    if resposta.ok:
        ir = True
        parse = bs4.BeautifulSoup(resposta.text, 'html.parser')
        lst = parse.find_all("table")
        filtro = []
        # column indexes inside each table row
        inome = 0
        iinscricao = 2
        ivagas = 3
        agora = datetime.datetime.now().strftime(utils.foramto_full_db)
        for i in lst:
            tmp = i.get("class")
            if tmp and "bordas" in tmp:
                filtro.append(i)
        if len(filtro) > 0:
            alvo = filtro[0]
            lst_linha = alvo.find_all('tr')[2:]
            for j in lst_linha:
                lst_coluna = j.find_all('td')
                nome = inicio_inscricao = fim_inscricao = link = ""
                k = lst_coluna[inome]
                nome = k.font.a.get_text().strip()
                link = url_eventos + k.font.a.get('href').strip()
                k = lst_coluna[iinscricao]
                inscricao = k.font.get_text().strip()
                dinscricao = extrari_inscricao(inscricao)
                inicio_inscricao = dinscricao["inicio_inscricao"]
                fim_inscricao = dinscricao["fim_inscricao"]
                k = lst_coluna[ivagas]
                vagas = k.span.font.get_text().strip()
                lst_saida.append([
                    "NULL", agora, nome, inicio_inscricao, fim_inscricao,
                    link, vagas
                ])
    else:
        print("page is offline")
        return
    template = "({}, '{}', '{}', '{}', '{}', '{}', {});"
    if ir:
        print("-- evento")
        conn = utils.get_db_conn()
        lst_pagina = sorted(lst_saida, key=lambda elemento: elemento[2])
        nomes_pagina = [i[2] for i in lst_pagina]
        try:
            with conn.cursor() as cursor:
                sql_vazio = "SELECT count(*) FROM `evento`"
                if utils.is_empty(cursor, sql_vazio):
                    sql_inserir = "INSERT INTO `evento` VALUES "
                    preenche_vazio(cursor, lst_saida, template, sql_inserir)
                else:
                    # update: data already exists
                    sql_todos = "SELECT * FROM `evento` ORDER BY `nome`"
                    nome_todos = "SELECT `nome` FROM `evento` ORDER BY `nome`"
                    lst_nome_todos = utils.get_all(cursor, nome_todos)
                    lst_nome_todos = [i[0] for i in lst_nome_todos]
                    lst_todos = utils.get_all(cursor, sql_todos)
                    # split the page rows into updates and inserts
                    lst_atualizar = []
                    lst_inserir = []
                    for k in lst_pagina:
                        # print(repr(k[2]))
                        if k[2] in lst_nome_todos:
                            lst_atualizar.append(k)
                        else:
                            lst_inserir.append(k)
                    # rows that disappeared from the page get removed
                    lst_remover = []
                    for k1 in lst_todos:
                        if k1[2] not in nomes_pagina:
                            lst_remover.append(k1)
                    if len(lst_remover):
                        print("removed", len(lst_remover))
                        excluir(cursor, lst_remover, 'evento')
                        conn.commit()
                        lst_todos = utils.get_all(cursor, sql_todos)
                    # refresh dates and vacancies
                    lst_nomes = [g[2] for g in lst_todos]
                    chaves = {g[2]: g[0] for g in lst_todos}
                    if len(lst_atualizar):
                        # rows that come from the page
                        sql = "UPDATE `evento` SET `inicio_inscricao`='{}'" + \
                              ", `fim_inscricao`='{}', `link`='{}',`vagas`={}" + \
                              " WHERE `id`={}"
                        print("updated", len(lst_atualizar))
                        for i in lst_atualizar:
                            tmp_lst = i[3:] + [chaves[i[2]]]
                            sql_completo = sql.format(*tmp_lst)
                            # print(sql_completo)
                            cursor.execute(sql_completo)
                    if len(lst_inserir):
                        print("new", len(lst_inserir))
                        sql_inserir = "INSERT INTO `evento` VALUES "
                        preenche_vazio(cursor, lst_inserir, template, sql_inserir)
                    """
                    My own bookkeeping: remove events that have no vacancies
                    left or whose registration period is over.
                    Special cases:
                      A. a new event appears on the page
                      B. an event no longer exists on the page
                    * fetch the events ordered by name (and sort the page rows
                      here too); this part mirrors the feed logic.
                    A. not (nome in lst_nomes) => new element
                    B. banco = banco[:len(pagina)]  # whatever exceeds the page
                       size is what was removed
                    1. page-driven control
                    2. my own control
                    """
                conn.commit()
        finally:
            conn.close()
def __init__(self):
    print("initializing")
    self.adb = ADB()
    self.verifCodeUtil = VeriCodeUtil()
    self.mysql = utils.get_db_conn('localhost', 3306, 'root', 'root', 'test')
import utils

if __name__ == '__main__':
    conn = utils.get_db_conn()
    cur = conn.cursor()
    test_file_path = 'E:/tianchi/AB_test.csv'  # path to the test-set file

    # create the initial test-set table
    print("creating the initial test-set table")
    sql = """
    CREATE TABLE IF NOT EXISTS `data_test` (
      `row_id` int(11) DEFAULT NULL,
      `user_id` varchar(255) DEFAULT NULL,
      `mall_id` varchar(255) DEFAULT NULL,
      `timestamp` varchar(255) DEFAULT NULL,
      `longitude` varchar(255) DEFAULT NULL,
      `latitude` varchar(255) DEFAULT NULL,
      `wifi_infos` varchar(1024) DEFAULT NULL
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    """
    cur.execute(sql)

    # load the data from the csv file
    print("loading data from the file")
    sql = """
    LOAD DATA INFILE '%s' INTO TABLE data_test
    FIELDS TERMINATED by ','
    LINES TERMINATED by '\\n'
    IGNORE 1 LINES
    """ % test_file_path
    cur.execute(sql)

    # create the temporary table
    print("creating the temporary table")
def preenche_feed():
    """Fetches data from the feed. Handles old data: if an element already
    exists on the page it is not updated.
    """
    resposta = requests.get(url_feed)
    lst_saida = []
    ir = False
    if resposta.ok:
        # extract the page content
        ir = True
        parse = bs4.BeautifulSoup(resposta.text, 'html.parser')
        # xpath: /html/body/div/div/div[2]/div[2]/div/ul/li[]
        div_principal = parse.find(id="main-area-1")
        filhos = div_principal.children
        alvo = None
        for i in filhos:
            if i.name == "ul":
                alvo = i
                break
        lst_img = alvo.find_all('img')
        agora = datetime.datetime.now().strftime(utils.foramto_full_db)
        for k in lst_img:
            texto = k.get('alt')
            link_img = url_feed + k.get('src')
            link = k.parent.get('href')
            # the link may not exist
            link = link if link else "NULL"
            lst_saida.append(["NULL", agora, texto, link_img, link])
    else:
        print("page is offline")
        return
    conn = None
    template = "({}, '{}', '{}', '{}', '{}');"
    if ir:
        print("-- feed")
        conn = utils.get_db_conn()
        lst_pagina = lst_saida
        nomes_pagina = [i[2] for i in lst_pagina]
        try:
            with conn.cursor() as cursor:
                sql_vazio = "SELECT count(*) FROM `feed`"
                if utils.is_empty(cursor, sql_vazio):
                    sql_inserir = "INSERT INTO `feed` VALUES "
                    preenche_vazio(cursor, lst_saida, template, sql_inserir)
                else:
                    # update: data already exists
                    sql_todos = "SELECT * FROM `feed` ORDER BY `texto`"
                    nome_todos = "SELECT `texto` FROM `feed` ORDER BY `texto`"
                    lst_nome_todos = utils.get_all(cursor, nome_todos)
                    lst_nome_todos = [i[0] for i in lst_nome_todos]
                    lst_todos = utils.get_all(cursor, sql_todos)
                    # only insert or delete, never update
                    lst_inserir = []
                    for k in lst_pagina:
                        # print(repr(k[2]))
                        if k[2] not in lst_nome_todos:
                            lst_inserir.append(k)
                    # rows that disappeared from the page get removed
                    lst_remover = []
                    for k1 in lst_todos:
                        if k1[2] not in nomes_pagina:
                            lst_remover.append(k1)
                    if len(lst_remover):
                        print("removed", len(lst_remover))
                        excluir(cursor, lst_remover, 'feed')
                        conn.commit()
                        lst_todos = utils.get_all(cursor, sql_todos)
                    if len(lst_inserir):
                        print("new", len(lst_inserir))
                        sql_inserir = "INSERT INTO `feed` VALUES "
                        preenche_vazio(cursor, lst_inserir, template, sql_inserir)
                conn.commit()
        finally:
            conn.close()