def download_volume_price_distr(stock_index, now_dt):
    """Download Sina's volume/price distribution for one stock and persist it.

    Parameters
    ----------
    stock_index : str
        6-digit ticker; a leading '6' means a Shanghai-listed stock.
    now_dt : str
        Date tag used when naming the saved file.

    Returns
    -------
    pandas.DataFrame with columns ["price", "vol", "stock_index", "dt"].
    """
    # Sina prefixes Shanghai tickers (6xxxxx) with 'sh', all others with 'sz'.
    prefix = 'sh' if stock_index[0] == '6' else 'sz'
    url = ("https://vip.stock.finance.sina.com.cn/quotes_service/view/"
           "cn_price.php?symbol=" + prefix + stock_index)
    soup = url_opener(url)

    # Collect every numeric <td> cell; non-numeric cells (headers, labels)
    # are expected on this page, so they are skipped without logging noise.
    values = []
    for cell in soup.findAll('td'):
        try:
            values.append(float(cell.get_text()))
        except ValueError:
            pass

    # Consecutive numbers pair up as (price, volume) rows.
    rows = [values[i:i + 2] for i in range(0, len(values), 2)]
    df2 = pd.DataFrame(rows)
    df2.columns = ["price", "vol"]
    df2["stock_index"] = stock_index
    df2["dt"] = now_dt

    save_dir = data_dict.get("volume_price_distr")
    dir_name1 = os.path.join(save_dir, stock_index)
    make_dir(dir_name1)
    save_df_date(dir_name1, stock_index, df2, now_dt)
    time.sleep(3.5)  # throttle requests to avoid being blocked by Sina
    return df2
def download_volume_price_distr(stock_index):
    """Download Sina's volume/price distribution and dump it to '<ticker>.csv'.

    Parameters
    ----------
    stock_index : str
        6-digit ticker; a leading '6' means a Shanghai-listed stock.

    Returns
    -------
    pandas.DataFrame with columns ["price", "vol"].
    """
    # Sina prefixes Shanghai tickers (6xxxxx) with 'sh', all others with 'sz'.
    prefix = 'sh' if stock_index[0] == '6' else 'sz'
    url = ("https://vip.stock.finance.sina.com.cn/quotes_service/view/"
           "cn_price.php?symbol=" + prefix + stock_index)
    soup = url_opener(url)

    # Only numeric <td> cells belong to the distribution table; header and
    # label cells raise ValueError on float() and are skipped deliberately.
    values = []
    for cell in soup.findAll('td'):
        try:
            values.append(float(cell.get_text()))
        except ValueError:
            pass

    # Consecutive numbers pair up as (price, volume) rows.
    rows = [values[i:i + 2] for i in range(0, len(values), 2)]
    df2 = pd.DataFrame(rows)
    df2.columns = ["price", "vol"]
    df2.to_csv(stock_index + ".csv", index=False)
    return df2
def get_page_num(html1):
    """Extract the page-count digit that follows the word 'pages' in the page text."""
    page_text = url_opener(html1).get_text()
    # Capture the 3 characters after 'pages'; the digit sits at offset 2.
    hits = re.findall(re.compile(r'pages(.{3})'), page_text)
    return hits[0][2]
def get_html_table(url1):
    """Fetch *url1* and return its second <tbody> plus a list of row indices.

    Returns
    -------
    (table, new_table_index): the <tbody> element and the index list
    0..n_rows-1 of its <tr> children.
    """
    soup = url_opener(url1)
    table = soup.find_all('tbody')[1]
    new_table_index = list(range(len(table.find_all('tr'))))
    return table, new_table_index
def url_to_df(html1):
    """Parse the first <table> at *html1* (rows 2 onward) into a DataFrame."""
    soup = url_opener(html1)
    all_rows = soup.find_all('table')[0].find_all('tr')
    parsed = []
    # Skip the first two rows (header lines in the source table).
    for row in all_rows[2:]:
        raw = row.get_text()
        cells = [piece.strip() for piece in raw.split("\r\n")]
        parsed.append([cell for cell in cells if cell != ''])
    df1 = pd.DataFrame(parsed)
    df1.dropna(axis=0, how='any', inplace=True)
    return df1
def get_html_table(html1):
    """Return the first <table> of *html1* and the index list of its rows.

    Parameters
    ----------
    html1 : str
        URL of the page to fetch.

    Returns
    -------
    (table, new_table_index): the first <table> element and
    0..n_rows-1 indices of its <tr> children.
    """
    soup = url_opener(html1)
    table = soup.find_all('table')[0]  # first table on the page
    new_table_index = list(range(len(table.find_all('tr'))))
    return table, new_table_index
def download_html_to_df(html1):
    """Download an announcement page into a DataFrame.

    The page interleaves 2-column header rows (label '公告日期' + a date)
    with 5-column data rows; each data row inherits the most recently seen
    publish date. Anchor hrefs starting with '#2' carry the per-section
    dates that are aligned to rows afterwards.

    Parameters
    ----------
    html1 : str
        URL of the announcement page.

    Returns
    -------
    pandas.DataFrame with columns
    ['index_in', 'owner_name', 'amount', 'ratio', 'character',
     'publish_date', 'date'].
    """
    soup2 = url_opener(html1)
    soup_out = soup2.findAll('a', href=True)
    # hrefs like '#2018-10-30...' encode section dates; chars 1..10 are the date.
    dates_in = [
        a['href'][1:11] for a in soup_out if a['href'].startswith("#2")
    ]

    # ---- parse table rows into records -----------------------------------
    # BUG FIX: publish_date must persist across iterations so that 5-column
    # data rows pick up the date from the preceding '公告日期' header row.
    # Previously it was reset to the sentinel inside the loop, so every data
    # row got '2099-01-01'.
    publish_date = '2099-01-01'  # sentinel until the first header row is seen
    data = []
    for a1 in soup2.findAll('tr'):
        cols = [ele.text.strip() for ele in a1.findAll('td')]
        if len(cols) == 2 and cols[0] == '公告日期':
            publish_date = cols[1]
        elif len(cols) == 5:
            data.append([ele for ele in cols if ele] + [publish_date])

    data1 = pd.DataFrame(data)
    data1.columns = [
        'index_in', 'owner_name', 'amount', 'ratio', 'character',
        'publish_date'
    ]

    # ---- align section dates with rows -----------------------------------
    # Each time index_in restarts at '1' a new section begins; advance to the
    # next entry of dates_in.
    date_all = []
    k = -1
    for index1 in data1.index_in:
        if index1 == '1':
            k += 1
        date_all.append(dates_in[k])
    data1['date'] = date_all
    return data1
def download_data(stock_index, year, season):
    """Fetch one season of 163.com trade history for a stock.

    Parameters
    ----------
    stock_index : str
        Ticker used in the 163 URL.
    year, season : int or str
        Which year/season page to request.

    Returns
    -------
    pandas.DataFrame of the table body (header row dropped) with an extra
    'stock_index' column.
    """
    url = ('http://quotes.money.163.com/trade/lsjysj_%s.html?year=%s&season=%s'
           % (stock_index, str(year), str(season)))
    soup1 = url_opener(url)
    table = soup1.find_all('table',
                           attrs={'class', 'table_bg001 border_box limit_sale'})
    rows = [
        [cell.get_text() for cell in tr.find_all('td')]
        for tr in table[0].findAll('tr')
    ]
    df1 = pd.DataFrame(rows)
    df1 = df1.iloc[1:]  # drop the header row
    df1['stock_index'] = stock_index
    return df1
def download_owner(url1):
    """Scrape the shareholder table (id='Table1') at *url1*.

    Returns
    -------
    list[list[str]]: one cleaned cell list per data row (rows 0-2 are
    headers and are skipped); empty cells are dropped.
    """
    soup1 = url_opener(url1)
    tab1 = soup1.find_all("table", attrs={'id': 'Table1'})[0]
    all_rows = tab1.find_all('tr')
    data = []
    # Rows 3 onward hold data; earlier rows are headers/column names.
    for row in all_rows[3:]:
        cells = [td.text.split(' ')[0] for td in row.find_all('td')]
        cells[0] = cells[0].replace("\t", "")
        data.append([cell for cell in cells if cell])
    return data
def find_url(stk_num):
    """Search Sina research reports for *stk_num* and return the matching
    <td class='tal f14'> cells (report title/link cells)."""
    url = ("http://vip.stock.finance.sina.com.cn/q/go.php/vReport_List/kind/"
           "search/index.phtml?symbol=%s&t1=all" % str(stk_num))
    page = url_opener(url)
    return page.find_all('td', attrs={'class', 'tal f14'})
def get_html_table(html1):
    """Fetch *html1*, grab its first <table>, and return it together with
    the list of row indices 0..n_rows-1."""
    soup = url_opener(html1)
    first_table = soup.find_all('table')[0]
    row_count = len(first_table.find_all('tr'))
    return first_table, list(range(row_count))
from davidyu_cfg import *
from functions.connect_url import url_opener

# Print every PDF link found on the CS224W handouts page.
h1 = url_opener("http://snap.stanford.edu/class/cs224w-2012/handouts.html")
a1 = h1.findAll('a')
for a2 in a1:
    a3 = a2.get('href')
    # BUG FIX: anchors without an href return None from .get(), and
    # "'.pdf' in None" raises TypeError — guard before the membership test.
    if a3 and '.pdf' in a3:
        print(a3)
else: textOut=text1[0].get_text().encode('latin1',"ignore").decode('gb2312',"ignore") return date,textOut from dir_control.data_dir_v1 import data_dict,stk_index_list import time stk_index_list=[x for x in stk_index_list if str(x).zfill(6)[0]!='3'] k=0 for stk in stk_index_list[0:2]: stk=str(stk).zfill(6) conts=[] content1=find_url(stk) k=0 for ss in content1: k+=1 con_url=ss.find('a').get('href') soup2=url_opener(con_url) #conts.append(strs) date,textout=content_get(soup2) file1=str(stk)+'_'+str(date)+'_'+str(k)+'.txt' f=open(file1,'w') print(textout, file = f) f.close() print(k) time.sleep(2.5) #jieba.lcut(t3) #date,textout=content_get(soup2) '''
def find_url(url):
    """Return the element with id 'BalanceSheetNewTable0' from *url*."""
    page = url_opener(url)
    matches = page.find_all(id='BalanceSheetNewTable0')
    return matches[0]
def news_in_html_url(url):
    """Follow the first anchor inside *url* (a parsed element) and extract
    its article.

    Returns
    -------
    (date, textout): the publication date and article text produced by
    content_get on the fetched page.
    """
    target = 'http:' + url.find('a').get('href')
    page = url_opener(target)
    date, textout = content_get(page)
    return date, textout
def find_url_content(self):
    """Fetch self.html and return its <td class='tal f14'> cells."""
    page = url_opener(self.html)
    return page.find_all('td', attrs={'class', 'tal f14'})
def get_html_table(url1):
    """Return the second <tbody> at *url1* along with 0-based row indices."""
    page = url_opener(url1)
    tbody = page.find_all('tbody')[1]
    indices = [i for i, _ in enumerate(tbody.find_all('tr'))]
    return tbody, indices