def nice_fs_crawler(self, bs_type):
    for iter_ in self.corp_sym_name_df.iterrows():
        idx, corp_code, corp_name = iter_[0], iter_[1]["Symbol"], iter_[1]["Name"]
        url = "http://media.kisline.com/highlight/mainHighlight.nice?paper_stock={}&nav=1".format(corp_code)
        res = req.get(url)
        html_ = bs(res.content, "lxml")
        table_1 = html_.findAll("table")
        if len(table_1) <= 13:
            print("{}에서 문제가 발생했습니다.".format(idx))
        else:
            # "개별" selects the separate (non-consolidated) statement tables
            if bs_type == "개별":
                table_2_annual = pd.DataFrame(parser.make2d(table_1[5]))
                table_2_quater = pd.DataFrame(parser.make2d(table_1[6]))
            else:
                table_2_annual = pd.DataFrame(parser.make2d(table_1[7]))
                table_2_quater = pd.DataFrame(parser.make2d(table_1[8]))
            bs_data = pd.concat([table_2_annual, table_2_quater], axis=1)
            bs_data.columns = bs_data.iloc[1, :]
            bs_data = bs_data.set_index("구분")
            bs_data.index.name = corp_name, corp_code
            bs_data.columns.name = ""
            bs_data = bs_data.drop(["", "구분"])
            yield bs_data  # yielded one company at a time, as a generator
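A minimal usage sketch for the generator above; the NiceCrawler class name and the one-row corp_sym_name_df are illustrative assumptions, not part of the original code.

# Hypothetical usage: collect every yielded per-company statement into one DataFrame.
# NiceCrawler and the sample Symbol/Name row are assumed, not from the source project.
crawler = NiceCrawler()
crawler.corp_sym_name_df = pd.DataFrame({"Symbol": ["005930"], "Name": ["삼성전자"]})
all_statements = pd.concat(list(crawler.nice_fs_crawler(bs_type="개별")))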
def financial_statements_data(soup, name):
    findis = re.compile(name)
    isstyle = re.compile('.*font-weight:.*;')
    wonstyle = re.compile('.*단위.*:.*원.*')
    # Try progressively looser selectors until the section heading is found.
    try:
        findsoup = soup.find('p', attrs={'style': isstyle}, text=findis).find_all_next()
    except:
        try:
            findsoup = soup.find('span', attrs={'style': isstyle}, text=findis).find_all_next()
        except:
            try:
                findsoup = soup.find('td', text=findis).find_all_next()
            except:
                findsoup = soup.find('p', text=findis).find_all_next()
    soupis = BeautifulSoup(str(findsoup), 'lxml')
    # Extract the unit ("단위: ...원") that accompanies the table.
    findwon = soupis.find(text=wonstyle)
    unit = re.sub('[^가-힣]', '', findwon)
    unit = re.sub('단위', '', unit)
    temp = soupis.find('table', attrs={'border': '1'})
    p = parser.make2d(temp)
    fs_data = pd.DataFrame(p, columns=p[0])
    return fs_data, unit
def get_weather(url, df, date):
    # Open the page and read the html
    html = urllib.request.urlopen(url).read()
    # Create a Beautiful Soup object of the html
    soup = BeautifulSoup(html, 'html.parser')
    # Find the table html containing the weather data
    table = soup.find('table', {'cellspacing': 0, 'cellpadding': 0,
                                'id': 'obsTable', 'class': 'obs-table responsive'})
    # Use html_table_parser to convert this data to a two-dimensional list
    twodim_table = parse.make2d(table)
    # Delete the first list (this is the columns header)
    del twodim_table[0]
    # Convert our two-dimensional list to a DataFrame
    day_df = pd.DataFrame(twodim_table)
    # Some days don't report wind chill, which shifts all the other columns over
    # by one position. If wind chill is reported, we want to drop that column so
    # all dates have uniform columns.
    if len(day_df.columns) == 13:
        day_df.drop(2, axis=1, inplace=True)
    # Reset the column names
    day_df.columns = range(12)
    # Add a column to identify which day this weather data is for
    day_df['date'] = date
    # Concatenate this day's DataFrame to the bottom of the DataFrame that
    # contains all weather data scraped so far
    concat_df = pd.concat([df, day_df], ignore_index=True)
    # Return our concatenated DataFrame
    return concat_df
def scrape_stocks(url):
    # Cleans up a string and returns a list of the stock's values for a day
    def sub(s):
        s = re.sub(',', '', s)
        s = re.sub('\n', ',', s)
        return s.split(',')

    # Read in the html
    html = urllib.request.urlopen(url).read()
    # Create a BeautifulSoup object for this html
    soup = BeautifulSoup(html, 'html.parser')
    # Select the table which contains the stock data
    table = soup.find('table', {'class': 'gf-table historical_price'})
    # Select the second row in this table
    # First row in the data is the headers which we don't need
    # Due to a quirk in the website, every day's stock price is contained
    # in the second row.
    row = table.find('tr').findNext('tr')
    # Convert the row html into a two-dimensional table
    month_tbl = parse.make2d(row)
    # The individual days are separated by '\n\n' in a single string
    # We split based on this to make each day have its own list index
    month_tbl = month_tbl[0][0].split('\n\n')
    # Remove the commas, then substitute each \n with a comma that we split on
    month_tbl = [sub(day) for day in month_tbl]
    # Create a DataFrame from the month's stock data
    df = pd.DataFrame(month_tbl, columns=cols)
    # The last day of the month is on top, so we reverse the order of the rows
    df = df.reindex(index=df.index[::-1])
    # Return this DataFrame
    return df
def get_todayOpen(stock_code):
    """
    :param stock_code: stock code without the leading 'A' (e.g. 005930)
    :return: today's opening price (int)
    """
    def get_response():
        url = "https://finance.naver.com/item/sise_day.nhn?code={}&page=1".format(stock_code)
        res = req.get(url)
        return res

    today = datetime.today().strftime("%Y.%m.%d")
    res = get_response()
    try:
        today_open = list(
            filter(
                lambda x: today in x,
                parser.make2d(bs(res.content, "lxml").find("table")),
            )
        ).pop()[3]
        today_open = int(today_open.replace(",", ""))
        print(stock_code, today, today_open)  # check: stock code, date, opening price
        return today_open
    except:
        print(stock_code, today + " No Open Price")  # no opening-price data for today
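A usage sketch, assuming the module-level imports the function relies on (req, bs, parser, datetime) are already in place.

# Hypothetical call: today's opening price for code 005930.
# The function prints a notice and returns None when no row for today exists.
open_price = get_todayOpen("005930")
if open_price is not None:
    print("opening price:", open_price)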
def find_table(url2, rcpno):
    temp = urlopen(url2)
    r = temp.read()
    xmlsoup = BeautifulSoup(r, 'html.parser')
    temp = xmlsoup.find_all("script", attrs={"type": "text/javascript"})
    txt = temp[7]
    # Locate the "4. 재무제표" entry in the viewer script and pull out the dcmNo value.
    a = txt.text
    b = str.find(a, "4. 재무제표")
    c = a[b:b + 200]
    d = c.split(",")[4]
    e = d.replace("\"", "")
    e = e.replace("\'", "")
    dcmo = int(e)

    # Changed to the function below (line 85-):
    # url2 = make_report(company_code)
    # print(url2)
    # print("here")
    # # revenue
    # report = urlopen(url2)
    # r = report.read()
    # xmlsoup = BeautifulSoup(r, 'html.parser')
    # body = xmlsoup.find("body")
    # table = body.find_all("table")
    # p = parser.make2d(table[3])

    # Fetch the revenue figures and related information.
    url3 = ("http://dart.fss.or.kr/report/viewer.do?rcpNo=" + rcpno + "&dcmNo=" + str(dcmo)
            + "&eleId=15&offset=297450&length=378975&dtd=dart3.xsd")
    # e.g. http://dart.fss.or.kr/report/viewer.do?rcpNo=20170811001153&dcmNo=5746981&eleId=15
    report = urlopen(url3)
    r = report.read()
    xmlsoup = BeautifulSoup(r, 'html.parser')
    body = xmlsoup.find("body")
    table = body.find_all("table")
    p = parser.make2d(table[3])

    name_list = list()
    value_list = list()
    name_list.append("구분")
    for i in range(1, len(p[0])):
        name = p[0][i] + "_" + p[1][i]
        name = name.replace(" ", "")
        name_list.append(name)
        value_list.append(name)
    sheet = pd.DataFrame(p[2:], columns=name_list)
    sheet.loc[sheet["구분"] == "수익(매출액)", ["구분"]] = "매출액"
    return sheet, name_list, value_list
def show_statistics():
    """Print statistics about the previous draws."""
    print('Analisando...')
    numbers = {}
    winners = []
    prizes = []
    # Read the data from the HTML file...
    with open('data/d_megasc.htm') as file:
        tables = extract_tables(file.read())
        data = make2d(tables[0])[1:]
    for line in data:
        # Count how often each number occurs:
        for num in range(2, 8):
            num = line[num]
            if num not in numbers:
                numbers[num] = 0
            numbers[num] += 1
        # Number of winners
        winners_qty = int(line[9])
        winners.append(winners_qty)
        # Prize total
        prize_value = line[12].replace('.', '').replace(',', '.')
        prize_total = Decimal(prize_value) * Decimal(winners_qty)
        prizes.append(prize_total)
    # Sort the drawn numbers by occurrence
    sorted_numbers = OrderedDict(
        sorted(numbers.items(), key=lambda x: x[1], reverse=True))
    listed_numbers = [n for n in sorted_numbers.keys()]
    more_frequent_numbers = listed_numbers[:10]
    less_frequent_numbers = listed_numbers[-10:]
    less_frequent_numbers.reverse()
    print('\nConcursos de %s até %s:' % (data[0][1], data[-1][1]))
    print(' Concursos realizados: %s\n' % format_number(len(data)))
    print(' Total de ganhadores: %s' % format_number(int(np.sum(winners))))
    print(' Média de ganhadores por concurso: %s\n'
          % format_number(float(np.mean(winners))))
    print(' Total em prêmios concedidos: R$ %s' % format_number(int(np.sum(prizes))))
    print(' Média de prêmio por concurso: R$ %s\n' % format_number(int(np.mean(prizes))))
    print(' Os 10 números mais frequêntes: %s' % ', '.join(more_frequent_numbers))
    print(' Os 10 números menos frequêntes: %s' % ', '.join(less_frequent_numbers))
def main():
    parser = argparse.ArgumentParser(description='Provide credentials')
    parser.add_argument('--login', required=True)
    parser.add_argument('--password', required=True)
    parser.add_argument('--syndicateId', required=True)
    args = parser.parse_args()

    battles = []
    session = requests.Session()
    session.headers.update({'User-Agent': USER_AGENT})
    login_page = session.get('https://www.ganjawars.ru/login.php')

    # Generate POST fields
    form_items = bs(login_page.text, 'lxml')
    post_data = {
        e['name']: e.get('value', '')
        for e in form_items.find_all('input', {'name': True})
    }
    post_data['login'] = args.login
    post_data['pass'] = args.password

    # Login
    session.post('https://www.ganjawars.ru/login.php', data=post_data)
    port_page = session.get(
        'http://www.ganjawars.ru/object.php?id=69403&page=oncoming1&sid=%s'
        % args.syndicateId)
    session.close()

    # Parse page
    soup = bs(port_page.text, "html.parser")
    battles_table = soup.find_all('table')[-1]
    battles_list = parse.make2d(battles_table)
    del battles_list[0]
    for battle in battles_list:
        del battle[3]
        syndicate_a, syndicate_b = str(battle[2]).split(' vs ', 1)
        if args.syndicateId + ' ' in syndicate_a:
            enemy_syndicate = syndicate_b
        else:
            enemy_syndicate = syndicate_a
        table_row = [battle[0], battle[1], enemy_syndicate]
        battles.append(table_row)

    print(tabulate(battles,
                   headers=['ВРЕМЯ', 'ФОРМАТ', 'ПРОТИВНИК'],
                   tablefmt='simple'))
def assignment(crpcode):
    global df, table
    url_company = ("https://opendart.fss.or.kr/api/list.json?crtfc_key={0}&corp_code={1}"
                   "&bgn_de=20160101&end_de=20191231&pblntf_ty=A&pblntf_detail_ty=A002"
                   "&page_no=1&page_count=10")
    url = url_company.format(apikey, crpcode)
    response = requests.get(url)
    output = json.loads(response.content)
    output_df = json_normalize(output['list'])
    company_code = output_df[output_df['report_nm'] == '사업보고서 (2018.12)']['rcept_no'].iloc[0]
    url_parser = "https://opendart.fss.or.kr/api/document.xml?crtfc_key={0}&rcept_no=" + company_code
    url = url_parser.format(apikey)
    webbrowser.open(url)
    time.sleep(3)  # allow time for the download to finish
    os.rename(path_to_download_folder + '/document.xml',
              path_to_download_folder + '/' + company_code + '.zip')
    os.chdir(path_to_download_folder)
    ex_zip = zipfile.ZipFile(company_code + '.zip')
    ex_zip.extractall()
    ex_zip.close()
    soup = BeautifulSoup(open(path_to_download_folder + '/' + company_code + '.xml', 'rb'),
                         'html.parser')
    body = soup.find("body")
    table = body.find_all('table')
    # The table right after the one titled '재무상태표' holds the balance sheet data.
    for i in range(len(table)):
        a = pd.DataFrame(parser.make2d(table[i]))
        if a.iloc[0, 0] == '재무상태표':
            df = pd.DataFrame(parser.make2d(table[i + 1]))
            break
    df.columns = df.iloc[0]
    df = df.set_index(df.iloc[:, 0])
    df = df.drop(df.index[0])
    df = df.drop(df.columns[0], axis=1)
def crawling(url):
    # To control a browser with Selenium you need to install a webdriver.
    # For Google Chrome, search for "chromedriver" and download the exe file.
    # Point Selenium at the location where the webdriver is installed.
    print(url)
    browser = webdriver.Chrome(
        r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver')
    # Load the requested url with .get.
    browser.get(url)
    # page_source returns the page's HTML.
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    # Parse the HTML with BeautifulSoup and collect the needed data.
    table_tags = soup.find_all("table")  # find_all returns every <table> tag as a list
    table = table_tags[0]  # take the first table on the page
    p = parser.make2d(table)
    df_total = pd.DataFrame(p[1:], columns=p[0])  # store as a DataFrame
    num = soup.find_all("button")[5].text  # the 5th button holds the page count
    for j in range(1, int(num)):
        btn = browser.find_element_by_class_name('paginationWidget-next')
        btn.click()  # click the "next" button
        time.sleep(5)
        # Merge the data from page 2 onwards
        html = browser.page_source
        soup = BeautifulSoup(html, 'html.parser')
        table_tags = soup.find_all("table")
        table = table_tags[0]
        p = parser.make2d(table)
        df = pd.DataFrame(p[1:], columns=p[0])
        df_total = pd.concat([df_total, df], axis=0)
    return df_total
def get_table_gen_from_nice(ticker, attr):
    # for tick in tick_df:
    try:
        url = "http://media.kisline.com/investinfo/mainInvestinfo.nice?paper_stock={}&nav=3".format(ticker)
        res = req.get(url)
        bs_data = bs(res.content, "html.parser")
        table_data = bs_data.find("div", {"id": attr})
        yield pd.DataFrame(parser.make2d(table_data))  # , name
    except:
        pass
def get_for_rate(nation, to="2019-12-31"):
    nation_list = {
        "KOR": "/central-bank-south-korea/bank-of-korea-interest-rate.aspx",
        "JPN": "/central-bank-japan/boj-interest-rate.aspx",
        "USA": "/central-bank-america/fed-interest-rate.aspx",
        "CHI": "/central-bank-china/pbc-interest-rate.aspx",
        "EUR": "/european-central-bank/ecb-interest-rate.aspx"
    }
    url = nation_list.get(nation, "-1")
    if url == "-1":
        print("국가명을 확인해주세요.")
        return -1
    url = "https://www.global-rates.com/interest-rates/central-banks{}".format(url)
    result = urlopen(url).read()
    soup = BeautifulSoup(result, "html.parser")
    table_tags = soup.find_all("table")
    # for idx, t in enumerate(table_tags):
    #     try:
    #         print("{}: ".format(idx), end="")
    #         print(t.find_all("h3")[0].text)
    #     except:
    #         pass
    table = parser.make2d(table_tags[18])
    df = pd.DataFrame(table[2:], columns=table[1])
    df["dt"] = df["change date"].astype("datetime64")
    # Build a daily date range and forward-fill the rate between change dates.
    df_range = pd.DataFrame({"dt": pd.date_range(df["dt"].min(), to)})
    df_1 = pd.merge(df_range, df, how="left")
    df_1["percentage"] = df_1["percentage"].fillna(method="ffill")
    df_1["percentage"] = df_1["percentage"].str.replace("\xa0%", "").astype(float)
    df_1["nation"] = nation
    df_1 = df_1.drop("change date", axis=1)
    df_1 = df_1.rename(columns={"percentage": "base_rate"})
    time.sleep(3)
    return df_1
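A usage sketch, assuming get_for_rate is used as-is; it stacks several central banks into one frame.

# Hypothetical usage: pull base rates for a few central banks and stack them.
rates = pd.concat([get_for_rate(n) for n in ("KOR", "USA", "EUR")], ignore_index=True)
print(rates.groupby("nation")["base_rate"].last())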
def test_html_table_parser(self):
    from bs4 import BeautifulSoup as bs
    from html_table_parser import parser_functions as parse

    soup = bs(mock_html_table(), "html.parser")
    test_table = soup.find('table')
    twod = parse.make2d(test_table)

    # twod_col_data function is case insensitive
    self.assertEqual(parse.twod_col_data(twod, 'first name'),
                     ['Eve', 'John', 'Adam', 'Jill'])

    # last name for first row is Eve because of colspan
    self.assertEqual(parse.twod_col_data(twod, 'lAst naMe'),
                     ['Eve', 'Doe', 'Johnson', 'Smith'])

    # points for last row is 67 because of rowspan
    self.assertEqual(parse.twod_col_data(twod, 'POINTS'),
                     ['94', '80', '67', '67'])
def LNG_data_parsing():
    total_df = pd.DataFrame()
    for i in range(1, 472):
        res = get_response(i)
        parsed_html = parser.make2d(bs(res.content, "lxml").find("tbody"))
        df = pd.DataFrame(parsed_html).iloc[:, :2]
        df.set_index(0, inplace=True)
        df.columns = ["종가"]
        total_df = pd.concat([total_df, df])
    total_df.index.name = "date"
    total_df = total_df.astype(np.float32)
    total_df.index = total_df.index.map(lambda x: x.replace(".", "-"))
    total_df.to_csv("NG_naver_price.csv", encoding="cp949")
    return total_df
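get_response(i) is not defined in this snippet; a minimal sketch under the assumption that it simply fetches page i of the price listing. The URL below is a placeholder, not the project's real endpoint.

# Hypothetical helper assumed by LNG_data_parsing(); placeholder endpoint only.
def get_response(page):
    url = "https://example.com/price-list?page={}".format(page)  # placeholder URL
    return req.get(url)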
def scrape_url(url):
    # Get the html of the page
    html = requests.get(url).text
    # Convert to BeautifulSoup object
    soup = BeautifulSoup(html, 'html.parser')
    # Find the table containing the statistics
    table = soup.find('table', {'border': '1', 'bgcolor': '#aaaaaa'})
    # Convert to two-dimensional table to make parsing easier
    twodim_table = parse.make2d(table)
    df = pd.DataFrame(twodim_table)
    # Set the column names and get rid of reprinted header rows throughout table
    df.columns = df.iloc[0, :]
    df = df[df['#'] != '#']
    # Select only the players name, games played, and time on ice
    df = df[['Player Name', 'GP', 'TOI']]
    return df
def Search():
    url = 'http://www.38.co.kr/html/fund/index.htm?o=k'
    data = urlopen(url).read()
    soup = BeautifulSoup(data, 'html.parser')
    table = soup.find("table", {'summary': '공모주 청약일정'})
    html_table = parser.make2d(table)
    df = pd.DataFrame(html_table[2:], columns=html_table[0])
    df['일정'] = df['공모주일정'].str[:10]
    df = df[df['일정'] >= today].sort_values(by='일정', ascending=True)
    df = df[['종목명', '일정', '공모주일정', '희망공모가', '주간사']].reset_index(drop=True)
    stock_new = list(df['종목명'].values)
    date_new = list(df['일정'].values)
    price_new = list(df['희망공모가'].values)
    company_new = list(df['주간사'].values)
    print(df)

    stock_old, date_old = Check()
    num = 1
    msg = []
    new_msg = False
    for date, stock, price, company in zip(date_new, stock_new, price_new, company_new):
        if (date not in date_old) and (stock not in stock_old):
            new_msg = True
            msg.append(str(num) + '. ' + stock + ' / ' + date + ' / '
                       + price + ' / ' + company + '\n\n')
            num += 1
    if new_msg:
        msg.append(url)
        # Join every item in the list; any separator placed inside the quotes is inserted between them
        msg_final = "".join(msg)
        bot.sendMessage(12345678, "<신규 공모주 청약 일정>" + "\n" + msg_final)
        # Save to the DB
        DatatoSQL(df)
    else:
        print('신규 일정 없음')
def crawl2(c_code):
    url = ("http://comp.fnguide.com/SVO2/asp/SVD_Finance.asp?pGB=1&gicode=A"
           + str(c_code) + "&cID=&MenuYn=Y&ReportGB=&NewMenuID=103&stkGb=701")
    html = urlopen(url)
    bsObj = BeautifulSoup(html, "html.parser")
    tables = bsObj.find_all("table", attrs={"class": "us_table_ty1 h_fix zigbg_no"})
    if len(tables) < 3:
        return None
    table = tables[2]
    html_table = parser.make2d(table)
    flag = True
    cnt = 0
    for row in html_table:
        if "이익잉여금" in row[0]:
            flag = False
            ri = cnt - 1
        # Handle nulls
        for i in range(len(row)):
            if row[i] == '':
                row[i] = None
        cnt += 1
    # Return if there is no retained-earnings (이익잉여금) row
    if flag:
        return None
    df = pd.DataFrame(data=html_table[1:],
                      index=range(0, len(html_table) - 1),
                      columns=html_table[0])
    d = df.columns[len(df.columns) - 1]
    if df[d].iloc[ri] == None:
        return None
    # Convert to float
    df[d].iloc[ri] = str(df[d].iloc[ri])
    if ',' in str(df[d].iloc[ri]):
        df[d].iloc[ri] = df[d].iloc[ri].replace(',', '')
    return float(df[d].iloc[ri])
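A usage sketch for crawl2, assuming it is called with an FnGuide company code.

# Hypothetical call: latest retained-earnings (이익잉여금) figure for one code.
retained = crawl2("005930")
print(retained)  # None when the page or the 이익잉여금 row is missing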
def get_trail_data():
    try:
        #s = urllib.request.urlopen("http://trianglemtb.com/trailstatus.php",None,5).read()
        s = urllib.request.urlopen("http://trianglemtb.com/trailstatus.php", None, 5).read()
        table = parse.make2d(bs(s, "html.parser"))
        return table, None
    except timeout:
        logging.error("error: socket timed out")
    except BaseException as error:
        print("error: url fetch: ", error)

    card_title = "Service Error"
    speech_output = ("I'm sorry, there was an error accessing the triangle m.t.b. "
                     "trail status page.")
    should_end_session = True
    return None, build_response({}, build_speechlet_response(
        card_title, speech_output, None, should_end_session))
def HTMLParse():
    # Parse the HTML
    url = "http://222.233.168.6:8094/RoomStatus.aspx"  # library seat-status page
    result = urlopen(url)
    html = result.read()
    soup = BeautifulSoup(html, 'html.parser')  # parse the HTML with BeautifulSoup

    temp = soup.find_all('table')
    # Build the DataFrame
    p = parser.make2d(temp[1])
    df = pd.DataFrame(
        p[1:],
        columns=['a', 'b', 'Use', 'Avail', 'Rate', 'Wait', 'f', 'g'],
        index=['Man', 'Women', 'Adult', 'Adult2', 'Free', 'Notebook', 'Sum'])
    # Drop the unused columns
    del df['a']
    del df['b']
    # print("parse complete")
    return df
def naver_fs_crawler(self):
    for iter_ in self.corp_sym_name_df.iterrows():
        print(iter_)
        corp_code, corp_name = iter_[1]["Symbol"], iter_[1]["Name"]
        url = ("http://companyinfo.stock.naver.com/v1/company/cF1001.aspx"
               "?cmp_cd={}&fin_typ=0&freq_typ=Y").format(corp_code)
        res = req.get(url)
        html_ = bs(res.content, "lxml")
        table_1 = html_.table
        table_2 = parser.make2d(table_1)
        bs_data = pd.DataFrame(table_2).T
        bs_data = bs_data.set_index(1).T.set_index("주요재무정보")
        bs_data.index.name = corp_name, corp_code
        yield bs_data.iloc[:, 2:8]  # yielded one company at a time, as a generator
def open_dart(code, number):
    ttr = {}
    for index, it in enumerate(company_code(code, number)):
        url = 'http://dart.fss.or.kr/dsaf001/main.do?rcpNo=' + it
        html = requests.get(url).text
        title = bs(html, 'html.parser').find('title')
        # Pull the viewDoc(...) arguments that follow the '연결재무제표' entry.
        split = (re.split('연결재무제표', html)[1]
                 .split(r');')[0]
                 .split(r'viewDoc(')[1]
                 .replace("'", "")
                 .split(', '))
        rurl = (f'http://dart.fss.or.kr/report/viewer.do?rcpNo={split[0]}&dcmNo={split[1]}'
                f'&eleId={split[2]}&offset={split[3]}&length={split[4]}&dtd={split[5]}')
        print(rurl)
        result = bs(requests.get(rurl).text, 'html.parser')
        re_title = (result.select('html > body > table')[2]
                    .select_one('tbody > tr > td').findChild().text)
        rt = str(re_title).replace(' ', '')
        print('re_title ==== ', rt)
        if rt == '연결손익계산서' or rt == '연결포괄손익계산서':
            tbody = str(result).split(re_title)[1]
            body = bs(tbody, 'html.parser')
            tr = body.find('table')
            table = parser.make2d(tr)
            ttr['a' + str(index)] = title.text.split('/')[0].replace('\n', '') + '_' + number + '분기'
            ttr['b' + str(index)] = data_set(table)
    return ttr
def fetch_html():
    html = ""
    page = requests.get(URL)
    html = page.content
    return html


if __name__ == '__main__':
    events = []
    soup = bs(fetch_html(), "html.parser")
    event_table = soup.find_all('table')[1]

    # using text_only false because we want soup cells in order to reference href attr
    twod = parse.make2d(event_table, text_only=False)

    for row in twod[2:]:
        # using event_dict to explicitly set custom column keys
        events.append(event_dict(*row))

    pp.pprint(events)
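event_dict and URL are defined elsewhere in the module; a hypothetical sketch of what such a helper could look like, given that make2d(..., text_only=False) hands it soup cells rather than plain strings.

# Hypothetical helper; the real event_dict lives elsewhere in the project.
# Each argument is a soup cell because make2d was called with text_only=False.
def event_dict(date_cell, name_cell, link_cell, *rest):
    link = link_cell.find('a')
    return {
        "date": date_cell.get_text(strip=True),
        "name": name_cell.get_text(strip=True),
        "href": link['href'] if link is not None else None,
    }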
def crawl(j_code, c_name):
    url = ("http://comp.fnguide.com/SVO2/asp/SVD_Main.asp?pGB=1&gicode=A"
           + str(j_code) + "&cID=&MenuYn=Y&ReportGB=&NewMenuID=101")
    html = urlopen(url)
    bsObj = BeautifulSoup(html, "html.parser")

    # Grab the part of the page we need
    div = bsObj.find_all("div", attrs={"id": "div15"})

    # No financial information provided --> [skip]
    if len(div) < 1:
        return "no_info"
    no_data = bsObj.find_all("div", attrs={"id": "divNotData"})
    if len(no_data) > 0:
        no_data = no_data[0].find_all("div", attrs={"class": "um_notdata"})
        if len(no_data) > 0:
            if "재무정보를 제공하지 않습니다." in no_data[0]:
                return "no_info"
    div = div[0]

    # Consolidated - full data
    table = div.find_all("table", attrs={"class": "us_table_ty1 h_fix zigbg_no"})[0]
    html_table = parser.make2d(table)

    # Handle header/row length mismatches
    if len(html_table[0]) != len(html_table[1]):
        if len(html_table[0]) > len(html_table[1]):
            n = len(html_table[0]) - len(html_table[1])
            html_table[0] = html_table[0][:-1 * n]
        else:
            n = len(html_table[1]) - len(html_table[0])
            for i in range(n):
                html_table[0].append('Net Quarter')

    # Debug print ^_^
    print(j_code, c_name)
    df = pd.DataFrame(data=html_table[1:],
                      index=range(0, len(html_table) - 1),
                      columns=html_table[0])
    del df['Net Quarter']
    dfl = df.values.tolist()

    # Edge case: no rows at all
    if len(dfl) < 1:
        return "no_info"
    for i, date in enumerate(dfl[0]):
        if "(E)" in date:
            dfl[0][i] = date[26:]
            if "(E)" in dfl[0][i]:
                dfl[0][i] = dfl[0][i][:-3]
        if "(P)" in date:
            dfl[0][i] = date[24:]
            if "(P)" in dfl[0][i]:
                dfl[0][i] = dfl[0][i][:-3]

    # Handle nulls
    for l in dfl[1:]:
        for i in range(len(l)):
            if l[i] == '':
                l[i] = None

    df = pd.DataFrame(data=dfl[1:], index=range(0, len(dfl) - 1), columns=dfl[0])
    df.name = c_name

    # Edge case: no data at all
    if len(dfl[0]) < 2:
        return "no_info"
    if len(dfl) < 2:
        return "no_info"
    return df
                              t.rpt_nm.string, t.rcp_no.string, t.flr_nm.string,
                              t.rcp_dt.string, t.rmk.string]]),
                        columns=["crp_cls", "crp_nm", "crp_cd", "rpt_nm", "rcp_no",
                                 "flr_nm", "rcp_dt", "rmk"])
    data = pd.concat([data, temp])

data = data.reset_index(drop=True)

user_num = int(input("몇 번째 보고서를 확인하시겠습니까?"))
url = "http://dart.fss.or.kr/dsaf001/main.do?rcpNo=" + data["rcp_no"][user_num]
req = requests.get(url).text
tree = lxml.html.fromstring(req)
onclick = tree.xpath('//*[@id="north"]/div[2]/ul/li[1]/a')[0].attrib['onclick']
pattern = re.compile(r"^openPdfDownload\('\d+',\s*'(\d+)'\)")
dcm_no = pattern.search(onclick).group(1)

url_parsing = ("http://dart.fss.or.kr/report/viewer.do?rcpNo=" + data['rcp_no'][user_num]
               + "&dcmNo=" + dcm_no + "&eleId=15&offset=1489233&length=105206&dtd=dart3.xsd")
report = urlopen(url_parsing)
r = report.read()
xmlsoup_another = bs(r, 'html.parser')
body = xmlsoup_another.find("body")
table = body.find_all("table")
p = parser.make2d(table[3])
resultset = [[]]
with codecs.open('port_list.html', 'r', encoding='cp1251', errors='ignore') as fd:
    for line in fd:
        line = line.lstrip()
        line = line.replace("\xa0", " ")  # replace non-breaking spaces with regular spaces
        doc += line.replace("\n", " ")

soup = bs(doc, "html.parser")
test_table = soup.find('table', {
    'cellspacing': '1',
    'cellpadding': '5',
    'width': '100%'
})
twod_array = parse.make2d(test_table)
twod_array[0] = ['Время', 'Формат', 'Участники', 'Контроль']

for battle in twod_array:
    i = 0
    del battle[3]
    if battle[2] != 'Участники':
        sind1, sind2 = str(battle[2]).split(' vs ', 1)
        sind_1 = sind1.split(' ', 1)
        sind_2 = sind2.split(' ', 1)
        tlist = [
            battle[0], battle[1],
            sind_1[0].replace('#', ''), sind_1[1],
            sind_2[0].replace('#', ''), sind_2[1]
        ]
        resultset.append(tlist)
        i += i
def get_fss(rcp_no):
    url1 = "http://dart.fss.or.kr/dsaf001/main.do?rcpNo={}"
    retry_cnt = 0
    while retry_cnt < 20:
        try:
            page = None
            try:
                page = urlopen(url1.format(rcp_no))
            except:
                retry_cnt += 1
                continue
            html = page.read().decode('utf-8')

            # Search for the viewDoc('11111', '22222', ... call and extract the dcm_no value.
            result = re.search(r'viewDoc\(\'(.*)\', \'(.*)\',', html)
            dcm_no = result.group(2)

            url2 = ("http://dart.fss.or.kr/report/viewer.do?rcpNo={}&dcmNo={}"
                    "&eleId=15&offset=297450&length=378975&dtd=dart3.xsd").format(rcp_no, dcm_no)
            page = None
            try:
                page = urlopen(url2)
            except:
                retry_cnt += 1
                continue
            r = page.read()
            xmlsoup = BeautifulSoup(r, 'html.parser')
            body = xmlsoup.find("body")
            tables = body.find_all("table")

            head = parser.make2d(tables[0])
            dates = []
            for grp in range(1, len(head) - 1):
                dstr = re.search(r'(\d+.\d+.\d+)', head[grp][0])
                date = datetime.datetime.strptime(dstr.group(1), "%Y.%m.%d").date()
                dates.append(date)
            if len(dates) == 0:
                return

            info_tbl = {}

            # Assets
            totals = parser.make2d(tables[1])
            for idx in range(0, len(totals)):
                list = []
                for grp in range(1, len(dates) + 1):
                    list.append(strip_money(totals[idx][grp]))
                info_tbl[totals[idx][0].strip()] = list
                strip_money(totals[idx][2])

            # Net income
            profits = parser.make2d(tables[3])
            for idx in range(0, len(profits)):
                list = []
                for grp in range(0, len(dates)):
                    list.append(strip_money(profits[idx][1 + grp * 2]))
                info_tbl[profits[idx][0].strip()] = list

            return dates, info_tbl
        except:
            # print("error:", sys.exc_info()[0])
            pass
    return
def get_div_data(browser, last_num, file_nm):
    search_btn = browser.find_element_by_id("image1")
    search_btn.click()

    # Grab the html source and locate the part we need.
    html = browser.page_source

    from bs4 import BeautifulSoup
    from html_table_parser import parser_functions as parser
    import pandas as pd

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", attrs={"id": "grid1_body_table"})
    p = parser.make2d(table)
    df = pd.DataFrame(p[2:], columns=p[1])

    import time
    import random
    from tqdm import tqdm

    prev_no = 0
    prev_table = None
    for i in tqdm(range(0, 200)):
        # Click the "next page" button, retrying a couple of times if it is not ready yet.
        try:
            next_btn = browser.find_element_by_id("cntsPaging01_next_btn")
            next_btn.click()
        except:
            time.sleep(2)
            try:
                next_btn = browser.find_element_by_id("cntsPaging01_next_btn")
                next_btn.click()
            except:
                time.sleep(2)
                next_btn = browser.find_element_by_id("cntsPaging01_next_btn")
                next_btn.click()

        def get_html(browser, cnt):
            # Re-read the page until both the page number and the table have changed.
            if cnt >= 4:
                return -1, -1
            html = browser.page_source
            soup = BeautifulSoup(html, 'html.parser')
            cur_no = soup.find(
                "a",
                attrs={"class": "w2pageList_control_label w2pageList_label_selected"})
            cur_no = cur_no.text
            table = soup.find("table", attrs={"id": "grid1_body_table"})
            if cur_no != prev_no and prev_table != table:
                return cur_no, table
            else:
                time.sleep(1)
                return get_html(browser, cnt + 1)

        cur_no, table = get_html(browser, 1)
        if cur_no == -1:
            print("\n종료. 테이블 정보가 바뀌지 않았습니다.")
            break
        p = parser.make2d(table)
        temp = pd.DataFrame(p[2:], columns=p[1])
        df = pd.concat([df, temp], axis=0)
        prev_no = cur_no
        prev_table = html
        if cur_no == str(last_num):
            print("\n최종 페이지 도달")
            break
        time.sleep(random.randrange(3, 5))

    df.to_pickle(file_nm)
# latestsnelist = pd.read_table('recentsnelist.txt')
# latestsnelist = pd.read_table('recentlist.txt')  #, names=colnames, data_start=1, guess='False')
# latestsnelist = ascii.read('recentsnelist.txt', delimiter='\t')  #, names=colnames, data_start=1, guess='False')
imsnglist = ascii.read('/data7/cschoi/IMSNG/target/alltarget.dat')

urlall = "http://www.RochesterAstronomy.org/snimages/sndateall.html"  # sn date all
url = 'http://www.rochesterastronomy.org/snimages/sndate.html'  # sndate

print('getting table data from web page from', url)
response = requests.get(url)
print('Done, table data is obtained')
soup = BeautifulSoup(response.content, 'html.parser')
tbl = soup.find_all('table')
soup.find_all('table')[1].find_all('th')
html_table = parser.make2d(tbl[1])
df = pd.DataFrame(html_table[1:], columns=html_table[0])
latestsnelist = df

print('getting table data from web page from', urlall)
responseall = requests.get(urlall)
print('Done, table data is obtained')
soupall = BeautifulSoup(responseall.content, 'html.parser')
tblall = soupall.find_all('table')
soupall.find_all('table')[1].find_all('th')
html_tableall = parser.make2d(tblall[1])
dfall = pd.DataFrame(html_tableall[1:], columns=html_tableall[0])
latestsnelistall = dfall
def fs_table():
    # Extract financial statements from every business report found by the search.
    data = searching_report()
    document_count = 0
    for i in range(len(data)):
        MAIN_URL = "http://dart.fss.or.kr/dsaf001/main.do?rcpNo=" + data['rcp_no'][document_count]
        print(MAIN_URL)
        page = BeautifulSoup(urlopen(MAIN_URL).read(), 'html.parser')
        body = str(page.find('head'))
        # Start looking for the consolidated statements (연결재무제표), then plain statements (재무제표).
        if len(body.split('연결재무제표",')) <= 1:
            if len(body.split('연 결 재 무 제 표",')) >= 2:
                body = body.split('연 결 재 무 제 표",')[1]  # found as "연 결 재 무 제 표"
                print_page_1 = '연 결 재 무 제 표'
            else:
                # No consolidated statements: fall back to 재무제표.
                if len(body.split('재무제표",')) <= 1:
                    if len(body.split('재 무 제 표",')) >= 2:
                        body = body.split('재 무 제 표",')[1]  # found as "재 무 제 표"
                        print_page_1 = '재 무 제 표'
                    else:
                        print("### Failed. (연결재무제표/재무제표 페이지 탐색 실패.)")
                        return 0  # stop if nothing was found
                else:
                    body = body.split('재무제표",')[1]  # found as "재무제표"
                    print_page_1 = '재무제표'
        else:
            body = body.split('연결재무제표",')[1]  # found as "연결재무제표"
            print_page_1 = '연결재무제표'

        body = body.split('cnt++')[0].split('viewDoc(')[1].split(')')[0].split(', ')
        body = [body[i][1:-1] for i in range(len(body))]
        # Parse the numbers needed to build the url of the financial-statements page.
        VIEWER_URL = ("http://dart.fss.or.kr/report/viewer.do?rcpNo=" + body[0]
                      + '&dcmNo=' + body[1] + '&eleId=' + body[2] + '&offset=' + body[3]
                      + '&length=' + body[4] + '&dtd=dart3.xsd')
        print(VIEWER_URL)
        page = BeautifulSoup(urlopen(VIEWER_URL).read(), 'html.parser')

        # Start looking for the balance sheet (재무상태표).
        if len(str(page.find('body')).split('재 무 상 태 표')) == 1:
            if len(str(page.find('body')).split('재무상태표')) <= 1:
                # Stop if the balance sheet cannot be found.
                print("### Failed. (재무상태표 탐색 실패.)")
                return 0
            else:
                body = str(page.find('body')).split('재무상태표')[1]  # found as "재무상태표"
                print_page_2 = '재무상태표'
        else:
            body = str(page.find('body')).split('재 무 상 태 표')[1]  # found as "재 무 상 태 표"
            print_page_2 = '재 무 상 태 표'

        body = BeautifulSoup(body, 'html.parser')  # parse the balance sheet we found so it can be read
        print(print_page_1 + " - " + print_page_2)
        print(body.find(align='RIGHT').text)
        table = body.find_all('table')  # look for table tags
        if len(table) <= 1:  # stop if none are found
            print("### Failed. (there's no table.)")
            return 0
        p = parser.make2d(table[0])
        table = pd.DataFrame(p[1:], columns=p[0])
        table = table.set_index(p[0][0])
        table.to_csv('C:\\Users\\admin\\Desktop\\Test_Result\\' + print_page_1 + "_"
                     + print_page_2 + '_' + str(document_count) + '.csv',
                     encoding='cp949')
        document_count += 1
    return table
def fnc_table(rcp):
    rcp_no = rcp
    url = "http://dart.fss.or.kr/dsaf001/main.do?rcpNo=" + rcp_no
    page = BeautifulSoup(urlopen(url).read(), 'html.parser', from_encoding='utf-8')
    body = str(page.find('head'))
    body = body.split('【 본 문 】",')[1]
    body = body.split('cnt++')[0]
    body = body.split('viewDoc(')[1]
    body = body.split(')')[0]
    body = body.split(', ')
    body = [body[i][1:-1] for i in range(len(body))]
    url_final = ('http://dart.fss.or.kr/report/viewer.do?rcpNo=' + body[0]
                 + '&dcmNo=' + body[1] + '&eleId=' + body[2] + '&offset=' + body[3]
                 + '&length=' + body[4] + '&dtd=dart3.xsd')
    #print(url_final)

    data = pd.DataFrame()
    page = BeautifulSoup(urlopen(url_final).read(), 'html.parser')
    body = str(page).split('(2) 모집 또는 매출의 개요')[1]
    body = BeautifulSoup(body, 'html.parser')
    table1 = body.find_all("table")
    p = parser.make2d(table1[0])
    table1 = pd.DataFrame(p[0:], columns=["content", "content1", "내용"])
    table1 = table1.convert_dtypes()
    table1.content = table1.content.str.replace('\s+', '')
    #print(table1)
    table1['bool'] = table1.iloc[:, 0].apply(
        lambda x: '종목명' in x or '기초자산' in x or '발행일' in x or '만기일' in x)
    table1 = table1[table1['bool'] == True]
    table1 = table1.reset_index(drop=True)
    #print(table1)
    pdct_nm = table1.loc[0, "내용"]
    pdct_nm = re.sub(r'\([^)]*\)', '', pdct_nm)
    pdct_asset = table1.loc[1, "내용"]
    st_date = table1.loc[2, "content1"]
    exp_date = table1.loc[3, "content1"]
    exp_date = re.sub(r'\([^)]*\)', '', exp_date)
    temp = pd.DataFrame([[rcp_no, pdct_nm, pdct_asset, st_date, exp_date]],
                        columns=["문서번호", "상품명", "기초자산", "발행일", "만기일"])
    data = pd.concat([data, temp])

    try:
        body = str(page).split('최대이익액 및')[1]
    except:
        try:
            body = str(page).split('최소이익액 및')[1]
        except:
            body = str(page).split('최대손실액 및')[1]
    body = BeautifulSoup(body, 'html.parser')
    table2 = body.find_all("table")
    p = parser.make2d(table2[0])
    table2 = pd.DataFrame(p[0:], columns=["구분", "내용", "수익률"])
    table2['bool'] = table2.iloc[:, 0].apply(
        lambda x: '최대손실액' in x or '최대이익액' in x or '최소이익액' in x)
    table2 = table2[table2['bool'] == True]
    loss_max = table2['구분'] == '최대손실액'
    earn_min = table2['구분'] == '최소이익액'
    min = table2[loss_max | earn_min]
    min = min.reset_index(drop=True)
    min_rate = min.loc[0, "수익률"]
    temp_max = table2['구분'] == '최대이익액'
    max = table2[temp_max]
    max = max.reset_index(drop=True)
    max_rate = max.loc[0, "수익률"]
    earn_temp = pd.DataFrame([[min_rate, max_rate]], columns=["최소수익", "최대수익"])
    data = pd.concat([data, earn_temp], axis=1)
    #print(min_rate)
    return data
for i in range(0, len(report_df)):
    crp_cd, crp_cls, crp_nm, rcp_dt, rcp_dt, rcp_no, rpt_nm, period = make_object(report_df, i)
    soup = fs.financial_statements_soup(rcp_no, name='임원 및 직원의 현황')
    ob_, ob_unit = financial_statements_data(soup, name='.*임.*원.*현.*황.*')
    for k in range(0, len(soup.find_all('table'))):
        find = str(soup.find_all('table')[k])
        if re.search('의결권', find):
            ob_soup = soup.find_all('table')[k]
            ob_soup = BeautifulSoup(str(ob_soup), 'lxml')
        elif re.search('직 원 수', find):
            yb_soup = soup.find_all('table')[k]
            yb_soup = BeautifulSoup(str(yb_soup), 'lxml')

    p = parser.make2d(yb_soup)
    emp = pd.DataFrame(p, columns=p[0])
    emp.columns = emp.loc[1]
    sex_ratio(emp, '1')
    earning(emp, '2')
    years(emp, '3')
    count_emp(emp, '4')

    p = parser.make2d(ob_soup)
    emp = pd.DataFrame(p, columns=p[0])
    emp.columns = emp.loc[1]
    ob_ages(emp, '5')

    msg = MIMEMultipart('related')
    msg['Subject'] = Header('[test]' + crp_nm + ' 사업보고서 임직원 현황 시각화 테스트중', 'utf-8')
    msg['From'] = formataddr((str(Header(u'M.Robo', 'utf-8')), '*****@*****.**'))
from bs4 import BeautifulSoup as bs
from html_table_parser import parser_functions as parse
from html_table_parser.tests import test_html_table_parser as test
import pprint

pp = pprint.PrettyPrinter(indent=4, width=120)

__author__ = 'oswaldjones'


if __name__ == '__main__':
    soup = bs(test.mock_html_table(), "html.parser")
    test_table = soup.find('table')
    twod_array = parse.make2d(test_table)

    # print 2D array
    pp.pprint(twod_array)

    # print column data by col heading name (case insensitive)
    pp.pprint(parse.twod_col_data(twod_array, 'first name'))
    pp.pprint(parse.twod_col_data(twod_array, 'lAst naMe'))

    # row data begins on first row after col headings
    # so rowstart is 1
    pp.pprint(parse.make_dict(test_table, 1))
def crawl_basics(self):
    text = get_html("https://pokemon.fandom.com/ko/wiki/파르셀_(포켓몬)")
    soup = BeautifulSoup(text, features="html.parser")
    body = []
    body_table = [[], []]
    for div in soup("div"):
        if "class" in div.attrs:
            # Pokemon's name
            if "name-ko" in div["class"]:
                self.pokemon.Basic.name = div.text.strip()
            # Pokedex number
            if "index" in div["class"]:
                self.pokemon.Basic.number = div.text[3:]  # strip the "No." prefix

    # Extract the table (body_table) that holds the basic information
    tables = soup.find_all("table")
    for table in tables:
        if "class" in table.attrs:
            if "body" in table.attrs["class"]:
                body = parser_functions.make2d(table)
    print(body)

    type_i = 0
    j = 1
    for item in body:
        if type_i % 2 == j % 2:
            for values in item:
                if "도감 번호" in values:
                    body_table[0].append(values)
                    j += 0.5
                else:
                    body_table[1].append(values)
        else:
            for columns in item:
                body_table[0].append(columns)
        type_i += 1
    print(body_table)

    # Pull the details out of the table
    for key in body_table[0]:
        index = body_table[0].index(key)
        if key == "타입":
            type_i = 0
            while True:
                type_i += 1
                if body_table[1][index][0:type_i] in TYPES:
                    self.pokemon.Basic.types.append(body_table[1][index][0:type_i])
                    self.pokemon.Basic.types.append(body_table[1][index][type_i:])
                    break
        elif key == "분류":
            self.pokemon.Basic.species = body_table[1][index]
        elif key == "특성":
            abilities_i = 0
            while True:
                abilities_i += 1
                if body_table[1][index][0:abilities_i] in ABILITIES:
                    self.pokemon.Battle.ord_abilities.append(body_table[1][index][0:abilities_i])
                    self.pokemon.Battle.ord_abilities.append(body_table[1][index][abilities_i:])
                    break
    print(self.pokemon.Basic.types)
    print(self.pokemon.Battle.ord_abilities)
aodkor = AODlist.kor[k]
for i in range(15):
    airport = my_dict_ARP.loc[i, 'url_airport']
    url = (base_url + '&depArr=' + depArr + '&current_date=' + date
           + '&airport=' + airport + '&al_icao=&fp_id=')
    driver = webdriver.Chrome('C:/Users/Danah/Documents/chromedriver_win32/chromedriver.exe')
    driver.implicitly_wait(5)
    driver.get(url)
    html = driver.page_source
    soup = bs(html, 'html.parser')
    temp = soup.find_all('table')
    driver.close()

    p = parser.make2d(temp[1])
    df = pd.DataFrame(p[0:], columns=['FLO_kor', '', 'FLT', '', 'ODP_kor', '', 'STT', '',
                                      'Expected Time', '', 'ATT', '', 'Type', '', 'DLY'])
    df = df.iloc[::2, ::2]
    df = df.drop(['Expected Time', 'Type', 'ATT'], axis=1)
    col = list(df.columns)
    df.insert(col.index('ODP_kor'), 'ODP', df['ODP_kor'].map(my_dict_ARP.set_index('kor')['eng']))
    df = df.drop('ODP_kor', axis=1)
    df.insert(col.index('FLO_kor'), 'FLO', df['FLO_kor'].map(my_dict_FLO.set_index('kor')['eng']))
    df = df.drop('FLO_kor', axis=1)
    df.DLY = list([0 if df['DLY'].iloc[i] == aodkor
                   else 1 if df['DLY'].iloc[i] == '지연'
                   else np.nan for i in range(0, len(df))])
    df = df.dropna(axis=0)
    df['ARP'] = np.repeat(my_dict_ARP.loc[i, 'eng'], df.shape[0])
    df['SDT_YY'] = np.repeat(2019, df.shape[0])
    df['SDT_MM'] = np.repeat(9, df.shape[0])