Example #1
    def return_stock_in_bankuai(self, bankuai):
        # bankuai parameter is a list from the top bankuai to the bottom bankuai in the format below
        # ['板块','概念板块','AB股票']
        #
        # return ['板块','概念板块','AB股票',[[code,name],[code,name],...]]
        bankuai_tree = self.__bankuai_tree

        if bankuai[2] not in bankuai_tree[bankuai[0]]["children"][bankuai[1]]["children"]:
            raise RuntimeError("The url of [" + ",".join(bankuai) + "] is not correct (in Eastmoney.py).")
        
        bankuai_detail_url = self.return_url_for_bankuai_stock(bankuai)
        while True:  # retry until the page downloads successfully
            try:
                bankuai_detail_page = read_url(bankuai_detail_url)
                break
            except Exception:
                warn_log('Connection lost, retry in 10 seconds ...')
                time.sleep(10)
                
        r_return_code_detail_grp = r'\[(?P<code_detail_grp>.*)\]'
        code_detail_grp = re.search(r_return_code_detail_grp, bankuai_detail_page).group("code_detail_grp")

        r_return_code_detail = r'"(?P<code_detail>[^"]*)"'
        r_code_detail = re.compile(r_return_code_detail)
        
        stocks = []
        for m in r_code_detail.finditer(code_detail_grp):
            match_group_into_list = m.group("code_detail").split(",")
            stocks.append([match_group_into_list[1],match_group_into_list[2].decode("utf-8")])
        
        out_list = copy.copy(bankuai)
        out_list.append(stocks)
        return out_list
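A minimal usage sketch (the Eastmoney class name is an assumption inferred from the Eastmoney.py reference in the error message; the board-path literals come from the comment above):

    from Eastmoney import Eastmoney  # assumed import; only the module name appears in the source

    em = Eastmoney()
    result = em.return_stock_in_bankuai(['板块', '概念板块', 'AB股票'])
    # result[:3] echoes the board path; result[3] is the [[code, name], ...] stock list
    codes = [code for code, name in result[3]]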
def download_to_file(stocks, stock_obj_name, start_date, end_date, to_file,
                     log_fh, warn_fh):
    #-- iterate stocks, download eod data from the website
    fh = open(to_file, 'a')
    num = 0
    for s in stocks:
        #-- build the stock object expression, then eval it and call its method to get the url content
        try:
            new_class = '%(object)s("%(stock)s", "%(start_date)s", "%(end_date)s")' % {
                'object': stock_obj_name,
                'stock': s,
                'start_date': start_date if stock_obj_name == 'Yahoo_stock' else 'dummy',
                'end_date': end_date if stock_obj_name == 'Yahoo_stock' else 'dummy'
            }
            print_log(new_class)

            while True:  # retry until the stock download completes successfully
                try:
                    obj = eval(new_class)
                    for k, v in obj.get_stock_content().items():
                        print_log('%(num)s - Writing %(code)s ...' % {'num': num, 'code': k}, log_fh)
                        if re.match(r'pv_none_match', v) or re.match(r'.+"";$', v):  # match empty content from tengxun and sina
                            warn_log('No content fetched for ' + k, warn_fh)
                        else:
                            fh.write(v + '\n')
                            num += 1
                    break
                except (KeyError, HTTPError):
                    raise  # permanent failures: let the outer handlers log and skip this stock
                except Exception:
                    warn_log('Connection lost, retry in 10 seconds ...')
                    time.sleep(10)

        except KeyError:
            warn_log(s[0:2] + ' is not setup in ' + stock_obj_name, warn_fh)
            continue
        except HTTPError:  # log and skip stocks that the yahoo interface cannot return
            warn_log('Get content failed when ' + new_class, warn_fh)
            continue
    fh.close()
    print_log('{num} stocks have been written into {file}.'.format(num=num, file=to_file), log_fh)
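A sketch of a typical call, assuming the Yahoo_stock class named in the date handling above and two open log handles (stock ids and file paths are illustrative):

    log_fh = open('download.log', 'a')
    warn_fh = open('download.warn', 'a')
    download_to_file(['600000.ss', '000001.sz'], 'Yahoo_stock',
                     '20150101', '20150131', 'eod_20150131.txt', log_fh, warn_fh)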
Example #3
    def return_bankuai_in_bankuai(self, bankuai, sort_direction="desc"):
        # bankuai parameter is a list from the top bankuai to the bottom bankuai in the format below
        # ['板块','概念板块']
        # return
        # ['板块','概念板块',[
        #                    [bankuai_name,increase,amount(in 0.1billion),change_ratio,rising_count,falling_count,leading_stock_code,leading_stock_name,increase],
        #                    ...
        #                   ]]
        if sort_direction.lower() not in ["desc", "asc"]:
            raise RuntimeError("Incorrect parameter [%(direction)s] (in Eastmoney.py)" % {"direction": sort_direction})
        sort_direction = sort_direction.lower()  # normalize so the named-group lookup below works for any input case

        bankuai_url = self.return_url_for_bankuai_bankuai(bankuai)
        while True:  # retry until the page downloads successfully
            try:
                bankuai_page = read_url(bankuai_url)
                break
            except Exception:
                warn_log('Connection lost, retry in 10 seconds ...')
                time.sleep(10)

        # the page embeds two arrays, [[...desc...],[...asc...]]; pick the requested one
        r_return_bankuai_detail_grp = r'\[\[(?P<bankuai_detail_group_desc>[^\]]+)\],\[(?P<bankuai_detail_group_asc>[^\]]+)\]\]'
        match_objs = re.search(r_return_bankuai_detail_grp, bankuai_page)
        bankuai_detail_grp = match_objs.group("bankuai_detail_group_" + sort_direction)

        r_return_code_detail = r'"(?P<code_detail>[^"]*)"'
        r_code_detail = re.compile(r_return_code_detail)

        bankuais = []
        for m in r_code_detail.finditer(bankuai_detail_grp):
            match_group_into_list = m.group("code_detail").split(",")
            bankuais.append(match_group_into_list[0:9])  # nine fields, as documented above

        out_list = copy.copy(bankuai)
        out_list.append(bankuais)
        return out_list
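As with the stock lookup, a hedged usage sketch (the Eastmoney instance is assumed):

    em = Eastmoney()
    result = em.return_bankuai_in_bankuai(['板块', '概念板块'], sort_direction='desc')
    # result[2] holds 9-field rows: name, increase, amount (0.1 billion), change_ratio,
    # rising_count, falling_count, leading_stock_code, leading_stock_name, increase
    strongest = result[2][0]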
def loader(queue, conn, start_date=options.start_date, end_date=options.end_date,
           stock_id=options.stock_id, merge_before_copy=options.merge_before_copy,
           enable_copy=options.enable_copy):

    cur_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')

    stock_list_sql = '''
    select row_id, biz_date, stock_id
    from dw.log_stock_transaction
    where biz_date = '{biz_date}'
    and is_download_success = 'Y'
    and (is_load_success = 'N' or is_load_success is null)
    '''
    if stock_id is not None:
        stock_list_sql = stock_list_sql + " and stock_id = '" + stock_id + "'"

    cur = get_cur(conn)
    while cur_date_dt <= end_date_dt:
        if merge_before_copy:
            # Loading the files one by one into the table takes too long; merging all the
            # piece files into a single file and loading that one file instead takes
            # less than 5 minutes to complete.
            cur_date_str = cur_date_dt.strftime('%Y%m%d')
            working_dir = data_dir + SEP + cur_date_str
            file_merged = os.path.join(working_dir, "file_merged.csv")
            if os.path.exists(file_merged):
                warn_log('Removing old file: ' + file_merged)
                os.remove(file_merged)
            #-- Starting to merge files
            with open(file_merged, "a") as dest:
                i = 0
                for _, _, filenames in os.walk(working_dir):
                    for filename in fnmatch.filter(filenames, "[0-9]*.txt"):
                        with open(os.path.join(working_dir, filename)) as src:
                            shutil.copyfileobj(src, dest)
                        i += 1
                        print_log('Merged ' + str(i) + ' files.')
            #-- Deleting records from db
            del_sql = '''delete from dw.stock_transaction where biz_date = '{}' '''.format(cur_date_str)
            get_query_result(conn, del_sql)
            conn.commit()
            print_log('Deletion for biz_date {} completed successfully.'.format(cur_date_str))
            #-- Updating is_load_success to N in log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'N' where biz_date = '{}' and is_download_success = 'Y' '''.format(cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to N')

            #-- Starting to load the merged file into table
            psql_copy_from(DB_HOST, DB_NAME, DB_UNAME, 'dw.stock_transaction',
                           file_merged, DB_PORT, args=" with (encoding 'GBK')")
            print_log('Successfully loaded {} into table.'.format(file_merged))

            #-- Updating is_load_success to Y in log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'Y' where biz_date = '{}' and is_download_success = 'Y' '''.format(cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to Y')

            #-- Cleaning up working dir
            os.remove(file_merged)

            cur_date_dt = cur_date_dt + datetime.timedelta(1)

        else:
            stock_list_sql_var_replaced = stock_list_sql.format(biz_date=cur_date_dt)
            cur.execute(stock_list_sql_var_replaced)
            rows = list(cur)
            for row in rows:
                row_id = row['row_id']
                biz_date = str(row['biz_date']).replace('-', '')
                stock_id = row['stock_id']
                while queue.full():
                    print_log('=================> queue is full, wait for 1 second...')
                    time.sleep(1)
                s = Stock_trans_loader(queue, conn, row_id, stock_id, biz_date,
                                       enable_copy=enable_copy)
                s.start()
                print_log('-----> queue size: ' + str(queue.qsize()))
                conn.commit()

            cur_date_dt = cur_date_dt + datetime.timedelta(1)

    while not queue.empty():
        print_log('=================> queue is not empty yet, wait for 1 second...')
        time.sleep(1)
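A minimal driving sketch, assuming a bounded multiprocessing queue (the .full()/.qsize() calls above fit both multiprocessing.Queue and Queue.Queue) and a hypothetical get_conn() helper for the DB connection:

    import multiprocessing

    queue = multiprocessing.Queue(maxsize=20)  # bounded, so loader() throttles itself
    conn = get_conn()                          # hypothetical helper returning a DB connection
    loader(queue, conn, start_date='20150101', end_date='20150105',
           merge_before_copy=True, enable_copy=True)
    conn.close()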
        else:
            exit_error("table is not correct! [" + options.table + "]")

    #-- replace $DATE; file existence is checked, and files that don't exist are not added to the loading list
    start_dt_dt = datetime.datetime.strptime(start_date, "%Y%m%d")
    end_dt_dt = datetime.datetime.strptime(end_date, "%Y%m%d")

    for k, v in files_to_load.items():
        dt_replaced = []
        process_dt_dt = start_dt_dt
        while process_dt_dt <= end_dt_dt:
            process_dt = datetime.datetime.strftime(process_dt_dt, "%Y%m%d")
            if os.path.isfile(v[0].replace("$DATE", process_dt)):
                dt_replaced.append(v[0].replace("$DATE", process_dt))
            else:
                warn_log(v[0].replace("$DATE", process_dt) + " doesn't exist.")
            process_dt_dt = process_dt_dt + datetime.timedelta(1)
        files_to_load[k] = dt_replaced

    #-- Start to load
    for t in load_seq_tables:
        if t in files_to_load:
            for f in files_to_load[t]:
                cmd = "%(func_name)s(%(param)s)" % {"func_name": table_mapping[t]["func_name"],
                                                    "param": table_mapping[t]["param"]}
                cmd_with_filename = cmd.replace("$f", f.replace('\\', '\\\\'))  # escaping backslashes only matters for Windows paths; unix/linux paths are unaffected
                eval(cmd_with_filename)

conn.close()
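To make the $DATE expansion concrete, here is a hypothetical files_to_load entry before and after the loop above (paths and table name are illustrative):

    # before: each value is a one-element list holding a path template
    files_to_load = {'stock_transaction': ['/data/$DATE/eod_$DATE.txt']}
    # after expanding 20150101..20150103 (assuming all three files exist):
    # {'stock_transaction': ['/data/20150101/eod_20150101.txt',
    #                        '/data/20150102/eod_20150102.txt',
    #                        '/data/20150103/eod_20150103.txt']}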

