Example #1
def crawl_content_in_queue(q):
    '''
    Crawl the content for each URL in the queue.
    '''
    conn = get_db_conn()
    cursor = conn.cursor()
    while True:
        try:
            url = q.get(block=True, timeout=3)
        except Empty:
            # queue is empty, retry
            print('queue empty.retry...')
            continue
        if url is None:
            # None is the sentinel: all work is done
            print('done')
            break
        else:
            print('url=%s' % url)
        page = get_page('http://' + url)
        row = parse_content(page)
        # store the result in the database
        if row['title'] != '' and row['content'] != '':
            command = 'SELECT insert_page_content($PGDATA$%s$PGDATA$,' + \
                '$PGDATA$%s$PGDATA$,$PGDATA$%s$PGDATA$,$PGDATA$%s$PGDATA$,$PGDATA$%s$PGDATA$);'
            cursor.execute(command % (url, row['title'], row['content'],
                                      row['keywords'], row['tags']))
            is_duplicate = not cursor.fetchall()[0][0]
            if is_duplicate:
                print('already exists. url=%s' % url)
            conn.commit()
        else:
            print('empty. url=%s' % url)
    conn.close()
Example #2
def crawl_content_in_db():
    '''
    Crawl the content for the URLs stored in the database.
    '''
    conn = get_db_conn()
    cursor = conn.cursor()
    cursor.execute('SELECT * FROM url_to_be_get;')
    url_table = cursor.fetchall()
    q = Queue()
    for row in url_table:
        q.put(row[0])
    for i in range(THREAD_NUM):
        q.put(None)
    print('queue init done.')

    tlist = [
        threading.Thread(target=crawl_content_in_queue, args=(q, ))
        for i in range(THREAD_NUM)
    ]
    print('start thread')
    for t in tlist:
        t.start()
    print('wait for thread')
    for t in tlist:
        t.join()
Example #3
def crawl_index(mode='start',
                start_date=dt.datetime(2018, 1, 1),
                end_date=dt.datetime.now()):
    '''
    mode (str): one of 'start' or 'continue'. 'start' restarts the crawler from scratch;
        'continue' resumes crawling from the existing data in the database.
    start_date (dt.datetime): when mode is 'start', the date from which to start crawling news.
    '''
    tlist = []
    conn = get_db_conn()
    cursor = conn.cursor()
    if mode == 'start':
        lid_list = get_lid_list()
        etime = get_time_num(start_date)
        for lid in lid_list:
            cursor.execute('INSERT INTO index_to_be_get VALUES(%d,%d,%d);' %
                           (etime, lid, 1))
            tlist.append(
                threading.Thread(target=crawl_l,
                                 args=(lid, start_date, 1, end_date)))
            conn.commit()
    elif mode == 'continue':
        cursor.execute('SELECT * FROM index_to_be_get;')
        data = cursor.fetchall()
        for etime, lid, page in data:
            s = num2date(etime)
            tlist.append(
                threading.Thread(target=crawl_l,
                                 args=(lid, s, page, end_date)))
    conn.close()

    for t in tlist:
        t.start()
    for t in tlist:
        t.join()
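A minimal, hypothetical invocation of crawl_index(), based only on its docstring (the date is illustrative):

# restart the crawl from scratch, beginning at 2018-01-01
crawl_index(mode='start', start_date=dt.datetime(2018, 1, 1))
# or resume a previous run using the state stored in index_to_be_get
crawl_index(mode='continue')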
Example #4
def get_wifi_strength_data(type=0):
    conn = utils.get_db_conn()
    cur = conn.cursor()
    table = 'd_final_index' if type == 0 else 'd_final_handled'
    sql = 'SELECT wifi_db,COUNT(*) FROM %s GROUP BY wifi_db ORDER BY wifi_db DESC' % table
    print(sql)
    cur.execute(sql)
    x = []
    y = []
    for r in cur.fetchall():
        x.append(-r[0])
        y.append(r[1])
    return x, y
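get_wifi_strength_data() only returns two parallel lists; a minimal usage sketch (assuming matplotlib is installed and the function above is importable) could plot the distribution directly:

import matplotlib.pyplot as plt

# hypothetical plotting of the (strength, count) pairs returned above
x, y = get_wifi_strength_data(type=0)
plt.bar(x, y)
plt.xlabel('wifi signal strength (dB, negated)')
plt.ylabel('count')
plt.savefig('./figs/wifi_strength.png')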
Example #5
def main():
    # a = A()
    result_list = []
    a = {'a': 1, 'b': 2}
    yesterday = '20190417'
    dw_cur = get_db_conn().cursor()
    sql = 'select * from etl_ctrl_show where sp_eod_dt = %s ' % (yesterday)
    dw_cur.execute(sql)
    column = [val[0].lower() for val in dw_cur.description]
    print(column, type(column))
    rows = dw_cur.fetchall()
    for row in rows:
        print(row)
        result_dic = dict(zip(column, row))
        print(result_dic)
        result_list.append(result_dic)
Example #6
def t(mall_id):
    conn = utils.get_db_conn()
    cur = conn.cursor()
    sql = "SELECT DAYOFWEEK(`timestamp`) w,HOUR(`timestamp`) h FROM behavior_pre WHERE shop_id='%s'" % mall_id
    cur.execute(sql)
    dow = [0 for x in range(0, 7)]
    time = [0 for x in range(0, 24)]
    for r in cur.fetchall():
        dow[r[0] - 1] += 1
        time[r[1]] += 1
    # "消费时间周分布" = weekly distribution of visit times, "消费时间日分布" = hourly distribution
    plot_and_save([1, 2, 3, 4, 5, 6, 7],
                  dow,
                  './figs/消费时间周分布_%s.png' % mall_id,
                  title='消费时间周分布')
    plot_and_save([x for x in range(0, 24)],
                  time,
                  './figs/消费时间日分布_%s.png' % mall_id,
                  title='消费时间日分布')
Example #7
def d():
    conn = utils.get_db_conn()
    cur = conn.cursor()
    sql = 'SELECT d,t,DAYOFWEEK(t) w,HOUR(t) h FROM (SELECT DISTINCT data_id d,time_stamp  t FROM d_final_index) AS tmpr ORDER BY h,w'
    cur.execute(sql)
    dow = [0 for x in range(0, 7)]
    time = [0 for x in range(0, 24)]
    for r in cur.fetchall():
        dow[r[2] - 1] += 1
        time[r[3]] += 1
    # "消费时间周分布" = weekly distribution of visit times, "消费时间日分布" = hourly distribution
    plot_and_save([1, 2, 3, 4, 5, 6, 7],
                  dow,
                  './figs/消费时间周分布.png',
                  title='消费时间周分布')
    plot_and_save([x for x in range(0, 24)],
                  time,
                  './figs/消费时间日分布.png',
                  title='消费时间日分布')
Example #8
def crawl_l(lid, start_date, start_page, end_date):
    '''
    Crawl the index pages of one column.
    lid (int): column id
    start_date (dt.datetime): date to start from
    start_page (int): page number to start from
    end_date (dt.datetime): date to stop at
    '''
    conn = get_db_conn()
    cursor = conn.cursor()
    link_matcher = re.compile('https?://(.*)')
    page = start_page
    date = start_date
    while True:
        url = get_url(date, lid, page)
        print('crawling,date=%s,lid=%d,page=%d' % (str(date)[:10], lid, page))
        r = requests.get(url)
        r.raise_for_status()
        url_list = parse_index(r.text)
        # insert into the database
        for content_url in url_list:
            link = link_matcher.match(content_url).group(1)
            cursor.execute('SELECT insert_url_to_be_get(\'%s\');' % link)
            result = cursor.fetchall()[0][0]
            if not result:
                print('already exists, url=%s' % link)

        if len(url_list) == 0:
            # move on to the next day's index
            date = date + dt.timedelta(days=1)
            page = 1
            if date >= end_date:
                break
        else:
            # crawl the next page
            page = page + 1

        # update the crawl state in the database
        etime = get_time_num(date)
        cursor.execute(
            'UPDATE index_to_be_get SET page=%d,etime=%d WHERE lid=%d;' %
            (page, etime, lid))
        conn.commit()
    conn.close()
Example #9
def insert(sql_insert, lst_to_insert):
    passo = 700
    sucess = True
    conn = utils.get_db_conn()
    # sql_update = "UPDATE `retorno_diario` SET `qtde_consultas` = {0} WHERE `id` = {1};"
    sql_topo = sql_insert
    contador = 0
    try:
        with conn.cursor() as cursor:

            sub_list = []
            for e in lst_to_insert:
                sub_list.append(e)
                contador += 1

                if contador == passo:  # reached the batch limit
                    sql = sql_topo + ", \n".join(sub_list)
                    cursor.execute(sql)
                    # print(sql + "\n")
                    sub_list.clear()  # clear the list to collect more rows
                    contador = 0

            # insert whatever is left over (fewer than passo rows)
            if len(sub_list) > 0:
                sql = sql_topo + ", \n".join(sub_list)
                cursor.execute(sql)
                # print(sql + "\n")

    except Exception as e:
        traceback.print_exc()
        sucess = False
        conn.rollback()
    else:
        conn.commit()
    finally:
        conn.close()

    return sucess
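The insert() helper above expects the caller to pass an INSERT prefix plus a list of already-formatted value strings, which it joins and executes in batches of passo rows. A hypothetical call might look like this (the table and values are illustrative only, borrowed from the commented-out UPDATE above):

# hypothetical usage; insert() appends the joined value strings to sql_insert
sql_insert = "INSERT INTO `retorno_diario` (`id`, `qtde_consultas`) VALUES "
lst_to_insert = ["(1, 10)", "(2, 25)", "(3, 7)"]
ok = insert(sql_insert, lst_to_insert)
print("committed" if ok else "rolled back")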
Example #10
class EtlManager:
    '''
    def findChild( string, nodes, result_list):
        children =[]
        temps = (a['producer_children'] for a in result_list if a['producer_father'] ==string)
        for temp in temps.split(","):

            child  = Node(temp.lower(),None)
            EtlManager.findChild(temp.lower(),nodes,result_list)
            children.append(child)
        node = Node(string,children)
        nodes.append(node)
    '''
    if __name__ == "__main__":
        cn = get_db_conn().cursor()
        ssql = "select * from etl_manager"
        cn.execute(ssql)
        result_list = []
        colunms = [colunm[0].lower() for colunm in cn.description]
        results = cn.fetchall()
        producer_father_list = []
        for result in results:
            producer_father_list.append(result[1].lower())
            result_dict = dict(zip(colunms, result))
            result_list.append(result_dict)
        #print result_list[0]
        # turn the rows into tree nodes
        roots = []
        T = MultiTree('tree')
        for result in result_list:
            val = result['producer_father'].lower()
            T.add(TreeNode(val))
            for temp in result['producer_children'].split(","):
                T.add(TreeNode(temp.lower()), TreeNode(val))
        T.show_tree()
        nodes = postorder(T.tree)
Example #11
 def get_db_conn(self):
     return utils.get_db_conn(self.tidb_host, self.tidb_port, \
                 self.tidb_user, self.tidb_pass, self.tidb_db)
Example #12
 def get_db_conn(self):
     return utils.get_db_conn(self.memsql_host, self.memsql_port, \
                 self.memsql_user, self.memsql_pass, self.memsql_db)
Example #13
def atualizar():
    """
    formato:  Open,High,Low,Close,Volume
    Data eh de hoje
    ol2l3pv
    referencia: http://wern-ancheta.com/blog/2015/04/05/getting-started-with-the-yahoo-finance-api/

    vo chamar tudo de uma vez:
    http://finance.yahoo.com/d/quotes.csv?s=GOOGL,AAPL,MSFT,FB&f=abo
    traz na mesma sequencia passada

    http://finance.yahoo.com/d/quotes.csv?s=ABEV3.SA&f=ol2l3pv
    High e Low pode nao ter
    """
    conn = utils.get_db_conn()
    agora = datetime.datetime.now().strftime(utils.formato_us)
    lst_insert = []
    lst_codigo = main.geraln_query('SELECT id, cod FROM `empresa`')
    codigos = [i[1]+".SA" for i in lst_codigo]
    mapas = {i[1]+".SA": i[0] for i in lst_codigo}  # since some rows were deleted, the ids may not be sequential
    url = "http://finance.yahoo.com/d/quotes.csv?s=" + ",".join(codigos) + "&f=ol2l3pv"

    try:
        r = requests.get(url)
        if r.ok:
            conteudo = r.text.split("\n")
            for index, elemento in enumerate(conteudo):
                tmplst = elemento.split(',')
                if len(tmplst) != 5:
                    continue
                try:
                    _open = float(tmplst[0])
                except ValueError as e:
                    _open = 0.0

                try:
                    _high = float(tmplst[1])
                except ValueError as e:
                    _high = 0.0

                try:
                    _low = float(tmplst[2])
                except ValueError as e:
                    _low = 0.0

                try:
                    _close = float(tmplst[3])
                except ValueError as e:
                    print("This value should always exist (", codigos[index], ")")
                    raise Exception("The closing price is essential")

                try:
                    _volume = int(tmplst[4])
                except ValueError as e:
                    _volume = 0.0

                dono = mapas[codigos[index]]
                lst_insert.append(["NULL", dono, agora, _open, _high, _low, _close, _volume])

            if len(lst_insert) > 0:

                main.biginsert(lst_insert, conn)
    finally:
        conn.close()
Example #14
 def get_db_conn(self):
     return utils.get_db_conn(self.memsql_host, self.memsql_port, \
                 self.memsql_user, self.memsql_pass, self.memsql_db)
Example #15
import os
import pickle
import pprint
import subprocess
import sys
import time

query_out_dir = sys.argv[1]
plan_pickle_path = sys.argv[2]

d = dict()
files = os.listdir(query_out_dir)

i = 0
for file_name in files:
    name = file_name.replace('.sql', '')
    with open(query_out_dir + '/' + file_name, 'r') as f:
        q_text = f.read()

    close_server()
    start_server()
    conn = get_db_conn()
    p = get_plan(conn, q_text)
    conn.close()
    d[name] = p
    pprint.pprint(p)

    if i % 100 == 0:
        print(i)
    i += 1

close_server()
pickle.dump(d, open(plan_pickle_path, 'wb'))
Example #16
import errno
import logging
import traceback
import json
import urllib
import hashlib

import psycopg2

# user modules
import eventloop
import utils

BUF_SIZE = 32 * 1024

conn = utils.get_db_conn(True)

__all__ = ["ChatServer"]

class UserOnServer(object):

    _STATE = eventloop.EPOLL_IN | eventloop.EPOLL_ERR | eventloop.EPOLL_HUP
    def __init__(self, server, fd_to_users, userid_login_status,
            userid_to_fd, loop, user_sock):
        self._closed = False
        self._userid = None
        self._logined = False
        self._readbuf = ""
        self._writebuf = ""

        self._server = server
Example #17
def preenche_evento():
    """
    Busca no site e atualiza a tabela de eventos
    """
    resposta = requests.get(url_eventos)

    lst_saida = []
    ir = False
    if resposta.ok:
        ir = True
        parse = bs4.BeautifulSoup(resposta.text, 'html.parser')
        lst = parse.find_all("table")
        filtro = []
        inome = 0
        iinscricao = 2
        ivagas = 3
        agora = datetime.datetime.now().strftime(utils.foramto_full_db)
        for i in lst:
            tmp = i.get("class")
            if tmp and "bordas" in tmp:
                filtro.append(i)

        if len(filtro) > 0:
            alvo = filtro[0]
            lst_linha = alvo.find_all('tr')[2:]
            for j in lst_linha:
                lst_coluna = j.find_all('td')
                nome = inicio_inscricao = fim_inscricao = link = ""
                k = lst_coluna[inome]
                nome = k.font.a.get_text().strip()
                link = url_eventos + k.font.a.get('href').strip()

                k = lst_coluna[iinscricao]
                inscricao = k.font.get_text().strip()
                dinscricao = extrari_inscricao(inscricao)
                inicio_inscricao = dinscricao["inicio_inscricao"]
                fim_inscricao = dinscricao["fim_inscricao"]

                k = lst_coluna[ivagas]
                vagas = k.span.font.get_text().strip()

                lst_saida.append([
                    "NULL", agora, nome, inicio_inscricao, fim_inscricao, link,
                    vagas
                ])
    else:
        print("pagina forma do ar")
        return

    template = "({}, '{}', '{}', '{}', '{}', '{}', {});"
    if ir:
        print("-- evento")
        conn = utils.get_db_conn()
        lst_pagina = sorted(lst_saida, key=lambda elemento: elemento[2])
        nomes_pagina = [i[2] for i in lst_pagina]
        try:
            with conn.cursor() as cursor:
                sql_vazio = "SELECT count(*) FROM `evento`"
                if utils.is_empty(cursor, sql_vazio):
                    sql_inserir = "INSERT INTO `evento` VALUES "
                    preenche_vazio(cursor, lst_saida, template, sql_inserir)
                else:
                    # update path: data already exists
                    sql_todos = "SELECT * FROM `evento` ORDER BY `nome`"
                    nome_todos = "SELECT `nome` FROM `evento` ORDER BY `nome`"
                    lst_nome_todos = utils.get_all(cursor, nome_todos)
                    lst_nome_todos = [i[0] for i in lst_nome_todos]
                    lst_todos = utils.get_all(cursor, sql_todos)

                    # rows to update
                    lst_atualizar = []
                    lst_inserir = []
                    for k in lst_pagina:
                        # print(repr(k[2]))
                        if k[2] in lst_nome_todos:
                            lst_atualizar.append(k)
                        else:
                            lst_inserir.append(k)

                    # rows to remove
                    lst_remover = []
                    for k1 in lst_todos:
                        if k1[2] not in nomes_pagina:
                            lst_remover.append(k1)

                    if len(lst_remover):
                        print("exlui", len(lst_remover))
                        excluir(cursor, lst_remover, 'evento')
                        conn.commit()
                        lst_todos = utils.get_all(cursor, sql_todos)

                    # dates and vacancies
                    lst_nomes = [g[2] for g in lst_todos]
                    chaves = {g[2]: g[0] for g in lst_todos}

                    if len(lst_atualizar):  # rows that come from the page
                        sql = "UPDATE `evento` SET `inicio_inscricao`='{}'" + \
                            ", `fim_inscricao`='{}', `link`='{}',`vagas`={}" +\
                            " WHERE `id`={}"

                        print("atualiza", len(lst_atualizar))
                        for i in lst_atualizar:
                            tmp_lst = i[3:] + [chaves[i[2]]]
                            sql_completo = sql.format(*tmp_lst)
                            # print(sql_completo)
                            cursor.execute(sql_completo)

                    if len(lst_inserir):
                        print("novos", len(lst_inserir))
                        sql_inserir = "INSERT INTO `evento` VALUES "
                        preenche_vazio(cursor, lst_inserir, template,
                                       sql_inserir)
                    """
                    meu proprio controle:
                        remover eventos que nao tem vaga, ou terminou a data de inscricao
                    casos especiais:
                        A evento novo na pagina
                        B evento nao existe mais na pagina
                    *. trazer os eventos ordenas por nome. (ordernar os eventos aqui tmb)
                    (essa parte vai ser igual ao do feed)
                    A.  not (nome in lst_nomes) => elemento novo
                    B.  banco = banco[:len(pagina)] # oque sobra do tamanho da pagina eh pq foi removido


                    1. controle da pagina
                    2. meu controle

                    """
            conn.commit()

        finally:
            conn.close()
Example #18
 def __init__(self):
     print("Initializing the program")
     self.adb = ADB()
     self.verifCodeUtil = VeriCodeUtil()
     self.mysql = utils.get_db_conn('localhost', 3306, 'root', 'root',
                                    'test')
Example #19
import utils

if __name__ == '__main__':
    conn = utils.get_db_conn()
    cur = conn.cursor()
    test_file_path = 'E:/tianchi/AB_test.csv'  # path to the test-set file
    # create the initial table for the test set
    print("Creating the initial table for the test set")
    sql = """
        CREATE TABLE IF NOT EXISTS  `data_test` (
      `row_id` int(11) DEFAULT NULL,
      `user_id` varchar(255) DEFAULT NULL,
      `mall_id` varchar(255) DEFAULT NULL,
      `timestamp` varchar(255) DEFAULT NULL,
      `longitude` varchar(255) DEFAULT NULL,
      `latitude` varchar(255) DEFAULT NULL,
      `wifi_infos` varchar(1024) DEFAULT NULL
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    """
    cur.execute(sql)
    # load the data from the file
    print("Loading data from the file")
    sql = """
    LOAD DATA INFILE '%s' INTO TABLE data_test
    FIELDS TERMINATED by ','
    LINES TERMINATED by '\n'
    IGNORE 1 LINES
    """ % test_file_path
    cur.execute(sql)
    # create the temporary table
    print("Creating the temporary table")
Example #20
def preenche_feed():
    """Feito para buscar dados do feed. Lida com dados antigos

    caso um elemento ja exista na pagina ele nao é atualizado
    """
    resposta = requests.get(url_feed)

    lst_saida = []
    ir = False
    if resposta.ok:  # extract the page content
        ir = True
        parse = bs4.BeautifulSoup(resposta.text, 'html.parser')
        # /html/body/div/div/div[2]/div[2]/div/ul/li[] xpath
        div_principal = parse.find(id="main-area-1")
        filhos = div_principal.children
        alvo = None
        for i in filhos:
            if i.name == "ul":
                alvo = i
                break
        lst_img = alvo.find_all('img')
        agora = datetime.datetime.now().strftime(utils.foramto_full_db)
        for k in lst_img:
            texto = k.get('alt')
            link_img = url_feed + k.get('src')
            link = k.parent.get('href')  # may not exist
            link = link if link else "NULL"
            lst_saida.append(["NULL", agora, texto, link_img, link])
    else:
        print("pagina fora do ar")
        return

    conn = None
    template = "({}, '{}', '{}', '{}', '{}');"
    if ir:
        print("-- feed")
        conn = utils.get_db_conn()
        lst_pagina = lst_saida
        nomes_pagina = [i[2] for i in lst_pagina]
        try:
            with conn.cursor() as cursor:

                sql_vazio = "SELECT count(*) FROM `feed`"
                if utils.is_empty(cursor, sql_vazio):
                    sql_inserir = "INSERT INTO `feed` VALUES "
                    preenche_vazio(cursor, lst_saida, template, sql_inserir)
                else:
                    # update path: data already exists
                    sql_todos = "SELECT * FROM `feed` ORDER BY `texto`"
                    nome_todos = "SELECT `texto` FROM `feed` ORDER BY `texto`"
                    lst_nome_todos = utils.get_all(cursor, nome_todos)
                    lst_nome_todos = [i[0] for i in lst_nome_todos]
                    lst_todos = utils.get_all(cursor, sql_todos)

                    # only inserts or deletes, never updates
                    lst_inserir = []
                    for k in lst_pagina:
                        # print(repr(k[2]))
                        if k[2] not in lst_nome_todos:
                            lst_inserir.append(k)

                    # rows to remove
                    lst_remover = []
                    for k1 in lst_todos:
                        if k1[2] not in nomes_pagina:
                            lst_remover.append(k1)

                    if len(lst_remover):
                        print("exlui", len(lst_remover))
                        excluir(cursor, lst_remover, 'feed')
                        conn.commit()
                        lst_todos = utils.get_all(cursor, sql_todos)

                    if len(lst_inserir):
                        print("novos", len(lst_inserir))
                        sql_inserir = "INSERT INTO `feed` VALUES "
                        preenche_vazio(cursor, lst_inserir, template,
                                       sql_inserir)

            conn.commit()

        finally:
            conn.close()