def nice_fs_crawler(self, bs_type):
    for iter_ in self.corp_sym_name_df.iterrows():
        idx, corp_code, corp_name = iter_[0], iter_[1]["Symbol"], iter_[1]["Name"]
        url = "http://media.kisline.com/highlight/mainHighlight.nice?paper_stock={}&nav=1".format(corp_code)
        res = req.get(url)
        html_ = bs(res.content, "lxml")
        table_1 = html_.findAll("table")
        if len(table_1) <= 13:
            print("{}에서 문제가 발생했습니다.".format(idx))
        else:
            # "개별" selects the separate (non-consolidated) statement tables
            if bs_type == "개별":
                table_2_annual = pd.DataFrame(parser.make2d(table_1[5]))
                table_2_quater = pd.DataFrame(parser.make2d(table_1[6]))
            else:
                table_2_annual = pd.DataFrame(parser.make2d(table_1[7]))
                table_2_quater = pd.DataFrame(parser.make2d(table_1[8]))
            bs_data = pd.concat([table_2_annual, table_2_quater], axis=1)
            bs_data.columns = bs_data.iloc[1, :]
            bs_data = bs_data.set_index("구분")
            bs_data.index.name = corp_name, corp_code
            bs_data.columns.name = ""
            bs_data = bs_data.drop(["", "구분"])
            yield bs_data  # yielded one company at a time, as a generator
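A minimal usage sketch for the generator above; the NiceCrawler class name and the one-row corp_sym_name_df are illustrative assumptions, not part of the original code.

# Hypothetical usage: collect every yielded per-company statement into one DataFrame.
# NiceCrawler and the sample Symbol/Name row are assumed, not from the source project.
crawler = NiceCrawler()
crawler.corp_sym_name_df = pd.DataFrame({"Symbol": ["005930"], "Name": ["삼성전자"]})
all_statements = pd.concat(list(crawler.nice_fs_crawler(bs_type="개별")))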
def financial_statements_data(soup, name):
    findis = re.compile(name)
    isstyle = re.compile('.*font-weight:.*;')
    wonstyle = re.compile('.*단위.*:.*원.*')
    # Try progressively looser selectors until the section heading is found.
    try:
        findsoup = soup.find('p', attrs={'style': isstyle}, text=findis).find_all_next()
    except:
        try:
            findsoup = soup.find('span', attrs={'style': isstyle}, text=findis).find_all_next()
        except:
            try:
                findsoup = soup.find('td', text=findis).find_all_next()
            except:
                findsoup = soup.find('p', text=findis).find_all_next()
    soupis = BeautifulSoup(str(findsoup), 'lxml')
    # Extract the unit ("단위: ...원") that accompanies the table.
    findwon = soupis.find(text=wonstyle)
    unit = re.sub('[^가-힣]', '', findwon)
    unit = re.sub('단위', '', unit)
    temp = soupis.find('table', attrs={'border': '1'})
    p = parser.make2d(temp)
    fs_data = pd.DataFrame(p, columns=p[0])
    return fs_data, unit
def get_weather(url, df, date):
    # Open the page and read the html
    html = urllib.request.urlopen(url).read()
    # Create a Beautiful Soup object of the html
    soup = BeautifulSoup(html, 'html.parser')
    # Find the table html containing the weather data
    table = soup.find('table', {'cellspacing': 0, 'cellpadding': 0,
                                'id': 'obsTable', 'class': 'obs-table responsive'})
    # Use html_table_parser to convert this data to a two-dimensional list
    twodim_table = parse.make2d(table)
    # Delete the first list (this is the columns header)
    del twodim_table[0]
    # Convert our two-dimensional list to a DataFrame
    day_df = pd.DataFrame(twodim_table)
    # Some days don't report wind chill, which shifts all the other columns over
    # by one position. If wind chill is reported, we want to drop that column so
    # all dates have uniform columns.
    if len(day_df.columns) == 13:
        day_df.drop(2, axis=1, inplace=True)
    # Reset the column names
    day_df.columns = range(12)
    # Add a column to identify which day this weather data is for
    day_df['date'] = date
    # Concatenate this day's DataFrame to the bottom of the DataFrame that
    # contains all weather data scraped so far
    concat_df = pd.concat([df, day_df], ignore_index=True)
    # Return our concatenated DataFrame
    return concat_df
def scrape_stocks(url):
    # Cleans up a string and returns a list of the stock's values for a day
    def sub(s):
        s = re.sub(',', '', s)
        s = re.sub('\n', ',', s)
        return s.split(',')

    # Read in the html
    html = urllib.request.urlopen(url).read()
    # Create a BeautifulSoup object for this html
    soup = BeautifulSoup(html, 'html.parser')
    # Select the table which contains the stock data
    table = soup.find('table', {'class': 'gf-table historical_price'})
    # Select the second row in this table
    # First row in the data is the headers which we don't need
    # Due to a quirk in the website, every day's stock price is contained
    # in the second row.
    row = table.find('tr').findNext('tr')
    # Convert the row html into a two-dimensional table
    month_tbl = parse.make2d(row)
    # The individual days are separated by '\n\n' in a single string
    # We split based on this to make each day have its own list index
    month_tbl = month_tbl[0][0].split('\n\n')
    # Remove the commas, then substitute each \n with a comma that we split on
    month_tbl = [sub(day) for day in month_tbl]
    # Create a DataFrame from the month's stock data
    df = pd.DataFrame(month_tbl, columns=cols)
    # The last day of the month is on top, so we reverse the order of the rows
    df = df.reindex(index=df.index[::-1])
    # Return this DataFrame
    return df
def get_todayOpen(stock_code):
    """
    :param stock_code: stock code without the leading 'A' (e.g. 005930)
    :return: today's opening price (int)
    """
    def get_response():
        url = "https://finance.naver.com/item/sise_day.nhn?code={}&page=1".format(stock_code)
        res = req.get(url)
        return res

    today = datetime.today().strftime("%Y.%m.%d")
    res = get_response()
    try:
        today_open = list(
            filter(
                lambda x: today in x,
                parser.make2d(bs(res.content, "lxml").find("table")),
            )
        ).pop()[3]
        today_open = int(today_open.replace(",", ""))
        print(stock_code, today, today_open)  # check: stock code, date, opening price
        return today_open
    except:
        print(stock_code, today + " No Open Price")  # no opening-price data for today
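A usage sketch, assuming the module-level imports the function relies on (req, bs, parser, datetime) are already in place.

# Hypothetical call: today's opening price for code 005930.
# The function prints a notice and returns None when no row for today exists.
open_price = get_todayOpen("005930")
if open_price is not None:
    print("opening price:", open_price)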
def find_table(url2, rcpno):
    temp = urlopen(url2)
    r = temp.read()
    xmlsoup = BeautifulSoup(r, 'html.parser')
    temp = xmlsoup.find_all("script", attrs={"type": "text/javascript"})
    txt = temp[7]
    # Locate the "4. 재무제표" entry in the viewer script and pull out the dcmNo value.
    a = txt.text
    b = str.find(a, "4. 재무제표")
    c = a[b:b + 200]
    d = c.split(",")[4]
    e = d.replace("\"", "")
    e = e.replace("\'", "")
    dcmo = int(e)

    # Changed to the function below (line 85-):
    # url2 = make_report(company_code)
    # print(url2)
    # print("here")
    # # revenue
    # report = urlopen(url2)
    # r = report.read()
    # xmlsoup = BeautifulSoup(r, 'html.parser')
    # body = xmlsoup.find("body")
    # table = body.find_all("table")
    # p = parser.make2d(table[3])

    # Fetch the revenue figures and related information.
    url3 = ("http://dart.fss.or.kr/report/viewer.do?rcpNo=" + rcpno + "&dcmNo=" + str(dcmo)
            + "&eleId=15&offset=297450&length=378975&dtd=dart3.xsd")
    # e.g. http://dart.fss.or.kr/report/viewer.do?rcpNo=20170811001153&dcmNo=5746981&eleId=15
    report = urlopen(url3)
    r = report.read()
    xmlsoup = BeautifulSoup(r, 'html.parser')
    body = xmlsoup.find("body")
    table = body.find_all("table")
    p = parser.make2d(table[3])

    name_list = list()
    value_list = list()
    name_list.append("구분")
    for i in range(1, len(p[0])):
        name = p[0][i] + "_" + p[1][i]
        name = name.replace(" ", "")
        name_list.append(name)
        value_list.append(name)
    sheet = pd.DataFrame(p[2:], columns=name_list)
    sheet.loc[sheet["구분"] == "수익(매출액)", ["구분"]] = "매출액"
    return sheet, name_list, value_list
def show_statistics():
    """Print statistics about the previous draws."""
    print('Analisando...')
    numbers = {}
    winners = []
    prizes = []
    # Read the data from the HTML file...
    with open('data/d_megasc.htm') as file:
        tables = extract_tables(file.read())
        data = make2d(tables[0])[1:]
    for line in data:
        # Count how often each number occurs:
        for num in range(2, 8):
            num = line[num]
            if num not in numbers:
                numbers[num] = 0
            numbers[num] += 1
        # Number of winners
        winners_qty = int(line[9])
        winners.append(winners_qty)
        # Prize total
        prize_value = line[12].replace('.', '').replace(',', '.')
        prize_total = Decimal(prize_value) * Decimal(winners_qty)
        prizes.append(prize_total)
    # Sort the drawn numbers by occurrence
    sorted_numbers = OrderedDict(
        sorted(numbers.items(), key=lambda x: x[1], reverse=True))
    listed_numbers = [n for n in sorted_numbers.keys()]
    more_frequent_numbers = listed_numbers[:10]
    less_frequent_numbers = listed_numbers[-10:]
    less_frequent_numbers.reverse()
    print('\nConcursos de %s até %s:' % (data[0][1], data[-1][1]))
    print(' Concursos realizados: %s\n' % format_number(len(data)))
    print(' Total de ganhadores: %s' % format_number(int(np.sum(winners))))
    print(' Média de ganhadores por concurso: %s\n'
          % format_number(float(np.mean(winners))))
    print(' Total em prêmios concedidos: R$ %s' % format_number(int(np.sum(prizes))))
    print(' Média de prêmio por concurso: R$ %s\n' % format_number(int(np.mean(prizes))))
    print(' Os 10 números mais frequêntes: %s' % ', '.join(more_frequent_numbers))
    print(' Os 10 números menos frequêntes: %s' % ', '.join(less_frequent_numbers))
def main():
    parser = argparse.ArgumentParser(description='Provide credentials')
    parser.add_argument('--login', required=True)
    parser.add_argument('--password', required=True)
    parser.add_argument('--syndicateId', required=True)
    args = parser.parse_args()

    battles = []
    session = requests.Session()
    session.headers.update({'User-Agent': USER_AGENT})
    login_page = session.get('https://www.ganjawars.ru/login.php')

    # Generate POST fields
    form_items = bs(login_page.text, 'lxml')
    post_data = {
        e['name']: e.get('value', '')
        for e in form_items.find_all('input', {'name': True})
    }
    post_data['login'] = args.login
    post_data['pass'] = args.password

    # Login
    session.post('https://www.ganjawars.ru/login.php', data=post_data)
    port_page = session.get(
        'http://www.ganjawars.ru/object.php?id=69403&page=oncoming1&sid=%s'
        % args.syndicateId)
    session.close()

    # Parse page
    soup = bs(port_page.text, "html.parser")
    battles_table = soup.find_all('table')[-1]
    battles_list = parse.make2d(battles_table)
    del battles_list[0]
    for battle in battles_list:
        del battle[3]
        syndicate_a, syndicate_b = str(battle[2]).split(' vs ', 1)
        if args.syndicateId + ' ' in syndicate_a:
            enemy_syndicate = syndicate_b
        else:
            enemy_syndicate = syndicate_a
        table_row = [battle[0], battle[1], enemy_syndicate]
        battles.append(table_row)

    print(tabulate(battles,
                   headers=['ВРЕМЯ', 'ФОРМАТ', 'ПРОТИВНИК'],
                   tablefmt='simple'))
def assignment(crpcode):
    global df, table
    url_company = ("https://opendart.fss.or.kr/api/list.json?crtfc_key={0}&corp_code={1}"
                   "&bgn_de=20160101&end_de=20191231&pblntf_ty=A&pblntf_detail_ty=A002"
                   "&page_no=1&page_count=10")
    url = url_company.format(apikey, crpcode)
    response = requests.get(url)
    output = json.loads(response.content)
    output_df = json_normalize(output['list'])
    company_code = output_df[output_df['report_nm'] == '사업보고서 (2018.12)']['rcept_no'].iloc[0]
    url_parser = "https://opendart.fss.or.kr/api/document.xml?crtfc_key={0}&rcept_no=" + company_code
    url = url_parser.format(apikey)
    webbrowser.open(url)
    time.sleep(3)  # allow time for the download to finish
    os.rename(path_to_download_folder + '/document.xml',
              path_to_download_folder + '/' + company_code + '.zip')
    os.chdir(path_to_download_folder)
    ex_zip = zipfile.ZipFile(company_code + '.zip')
    ex_zip.extractall()
    ex_zip.close()
    soup = BeautifulSoup(open(path_to_download_folder + '/' + company_code + '.xml', 'rb'),
                         'html.parser')
    body = soup.find("body")
    table = body.find_all('table')
    # The table right after the one titled '재무상태표' holds the balance sheet data.
    for i in range(len(table)):
        a = pd.DataFrame(parser.make2d(table[i]))
        if a.iloc[0, 0] == '재무상태표':
            df = pd.DataFrame(parser.make2d(table[i + 1]))
            break
    df.columns = df.iloc[0]
    df = df.set_index(df.iloc[:, 0])
    df = df.drop(df.index[0])
    df = df.drop(df.columns[0], axis=1)
def crawling(url):
    # To control a browser with Selenium you need to install a webdriver.
    # For Google Chrome, search for "chromedriver" and download the exe file.
    # Point Selenium at the location where the webdriver is installed.
    print(url)
    browser = webdriver.Chrome(
        r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver')
    # Load the requested url with .get.
    browser.get(url)
    # page_source returns the page's HTML.
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    # Parse the HTML with BeautifulSoup and collect the needed data.
    table_tags = soup.find_all("table")  # find_all returns every <table> tag as a list
    table = table_tags[0]  # take the first table on the page
    p = parser.make2d(table)
    df_total = pd.DataFrame(p[1:], columns=p[0])  # store as a DataFrame
    num = soup.find_all("button")[5].text  # the 5th button holds the page count
    for j in range(1, int(num)):
        btn = browser.find_element_by_class_name('paginationWidget-next')
        btn.click()  # click the "next" button
        time.sleep(5)
        # Merge the data from page 2 onwards
        html = browser.page_source
        soup = BeautifulSoup(html, 'html.parser')
        table_tags = soup.find_all("table")
        table = table_tags[0]
        p = parser.make2d(table)
        df = pd.DataFrame(p[1:], columns=p[0])
        df_total = pd.concat([df_total, df], axis=0)
    return df_total
def get_table_gen_from_nice(ticker, attr):
    # for tick in tick_df:
    try:
        url = "http://media.kisline.com/investinfo/mainInvestinfo.nice?paper_stock={}&nav=3".format(ticker)
        res = req.get(url)
        bs_data = bs(res.content, "html.parser")
        table_data = bs_data.find("div", {"id": attr})
        yield pd.DataFrame(parser.make2d(table_data))  # , name
    except:
        pass
def get_for_rate(nation, to="2019-12-31"):
    nation_list = {
        "KOR": "/central-bank-south-korea/bank-of-korea-interest-rate.aspx",
        "JPN": "/central-bank-japan/boj-interest-rate.aspx",
        "USA": "/central-bank-america/fed-interest-rate.aspx",
        "CHI": "/central-bank-china/pbc-interest-rate.aspx",
        "EUR": "/european-central-bank/ecb-interest-rate.aspx"
    }
    url = nation_list.get(nation, "-1")
    if url == "-1":
        print("국가명을 확인해주세요.")
        return -1
    url = "https://www.global-rates.com/interest-rates/central-banks{}".format(url)
    result = urlopen(url).read()
    soup = BeautifulSoup(result, "html.parser")
    table_tags = soup.find_all("table")
    # for idx, t in enumerate(table_tags):
    #     try:
    #         print("{}: ".format(idx), end="")
    #         print(t.find_all("h3")[0].text)
    #     except:
    #         pass
    table = parser.make2d(table_tags[18])
    df = pd.DataFrame(table[2:], columns=table[1])
    df["dt"] = df["change date"].astype("datetime64")
    # Build a daily date range and forward-fill the rate between change dates.
    df_range = pd.DataFrame({"dt": pd.date_range(df["dt"].min(), to)})
    df_1 = pd.merge(df_range, df, how="left")
    df_1["percentage"] = df_1["percentage"].fillna(method="ffill")
    df_1["percentage"] = df_1["percentage"].str.replace("\xa0%", "").astype(float)
    df_1["nation"] = nation
    df_1 = df_1.drop("change date", axis=1)
    df_1 = df_1.rename(columns={"percentage": "base_rate"})
    time.sleep(3)
    return df_1
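A usage sketch, assuming get_for_rate is used as-is; it stacks several central banks into one frame.

# Hypothetical usage: pull base rates for a few central banks and stack them.
rates = pd.concat([get_for_rate(n) for n in ("KOR", "USA", "EUR")], ignore_index=True)
print(rates.groupby("nation")["base_rate"].last())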
def test_html_table_parser(self):
    from bs4 import BeautifulSoup as bs
    from html_table_parser import parser_functions as parse

    soup = bs(mock_html_table(), "html.parser")
    test_table = soup.find('table')
    twod = parse.make2d(test_table)

    # twod_col_data function is case insensitive
    self.assertEqual(parse.twod_col_data(twod, 'first name'),
                     ['Eve', 'John', 'Adam', 'Jill'])

    # last name for first row is Eve because of colspan
    self.assertEqual(parse.twod_col_data(twod, 'lAst naMe'),
                     ['Eve', 'Doe', 'Johnson', 'Smith'])

    # points for last row is 67 because of rowspan
    self.assertEqual(parse.twod_col_data(twod, 'POINTS'),
                     ['94', '80', '67', '67'])
def LNG_data_parsing():
    total_df = pd.DataFrame()
    for i in range(1, 472):
        res = get_response(i)
        parsed_html = parser.make2d(bs(res.content, "lxml").find("tbody"))
        df = pd.DataFrame(parsed_html).iloc[:, :2]
        df.set_index(0, inplace=True)
        df.columns = ["종가"]
        total_df = pd.concat([total_df, df])
    total_df.index.name = "date"
    total_df = total_df.astype(np.float32)
    total_df.index = total_df.index.map(lambda x: x.replace(".", "-"))
    total_df.to_csv("NG_naver_price.csv", encoding="cp949")
    return total_df
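get_response(i) is not defined in this snippet; a minimal sketch under the assumption that it simply fetches page i of the price listing. The URL below is a placeholder, not the project's real endpoint.

# Hypothetical helper assumed by LNG_data_parsing(); placeholder endpoint only.
def get_response(page):
    url = "https://example.com/price-list?page={}".format(page)  # placeholder URL
    return req.get(url)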
def scrape_url(url):
    # Get the html of the page
    html = requests.get(url).text
    # Convert to BeautifulSoup object
    soup = BeautifulSoup(html, 'html.parser')
    # Find the table containing the statistics
    table = soup.find('table', {'border': '1', 'bgcolor': '#aaaaaa'})
    # Convert to two-dimensional table to make parsing easier
    twodim_table = parse.make2d(table)
    df = pd.DataFrame(twodim_table)
    # Set the column names and get rid of reprinted header rows throughout table
    df.columns = df.iloc[0, :]
    df = df[df['#'] != '#']
    # Select only the players name, games played, and time on ice
    df = df[['Player Name', 'GP', 'TOI']]
    return df
def Search():
    url = 'http://www.38.co.kr/html/fund/index.htm?o=k'
    data = urlopen(url).read()
    soup = BeautifulSoup(data, 'html.parser')
    table = soup.find("table", {'summary': '공모주 청약일정'})
    html_table = parser.make2d(table)
    df = pd.DataFrame(html_table[2:], columns=html_table[0])
    df['일정'] = df['공모주일정'].str[:10]
    df = df[df['일정'] >= today].sort_values(by='일정', ascending=True)
    df = df[['종목명', '일정', '공모주일정', '희망공모가', '주간사']].reset_index(drop=True)
    stock_new = list(df['종목명'].values)
    date_new = list(df['일정'].values)
    price_new = list(df['희망공모가'].values)
    company_new = list(df['주간사'].values)
    print(df)

    stock_old, date_old = Check()
    num = 1
    msg = []
    new_msg = False
    for date, stock, price, company in zip(date_new, stock_new, price_new, company_new):
        if (date not in date_old) and (stock not in stock_old):
            new_msg = True
            msg.append(str(num) + '. ' + stock + ' / ' + date + ' / '
                       + price + ' / ' + company + '\n\n')
            num += 1
    if new_msg:
        msg.append(url)
        # Join every item in the list; any separator placed inside the quotes is inserted between them
        msg_final = "".join(msg)
        bot.sendMessage(12345678, "<신규 공모주 청약 일정>" + "\n" + msg_final)
        # Save to the DB
        DatatoSQL(df)
    else:
        print('신규 일정 없음')
def crawl2(c_code):
    url = ("http://comp.fnguide.com/SVO2/asp/SVD_Finance.asp?pGB=1&gicode=A"
           + str(c_code) + "&cID=&MenuYn=Y&ReportGB=&NewMenuID=103&stkGb=701")
    html = urlopen(url)
    bsObj = BeautifulSoup(html, "html.parser")
    tables = bsObj.find_all("table", attrs={"class": "us_table_ty1 h_fix zigbg_no"})
    if len(tables) < 3:
        return None
    table = tables[2]
    html_table = parser.make2d(table)
    flag = True
    cnt = 0
    for row in html_table:
        if "이익잉여금" in row[0]:
            flag = False
            ri = cnt - 1
        # Handle nulls
        for i in range(len(row)):
            if row[i] == '':
                row[i] = None
        cnt += 1
    # Return if there is no retained-earnings (이익잉여금) row
    if flag:
        return None
    df = pd.DataFrame(data=html_table[1:],
                      index=range(0, len(html_table) - 1),
                      columns=html_table[0])
    d = df.columns[len(df.columns) - 1]
    if df[d].iloc[ri] == None:
        return None
    # Convert to float
    df[d].iloc[ri] = str(df[d].iloc[ri])
    if ',' in str(df[d].iloc[ri]):
        df[d].iloc[ri] = df[d].iloc[ri].replace(',', '')
    return float(df[d].iloc[ri])
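A usage sketch for crawl2, assuming it is called with an FnGuide company code.

# Hypothetical call: latest retained-earnings (이익잉여금) figure for one code.
retained = crawl2("005930")
print(retained)  # None when the page or the 이익잉여금 row is missing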
def get_trail_data():
    try:
        #s = urllib.request.urlopen("http://trianglemtb.com/trailstatus.php",None,5).read()
        s = urllib.request.urlopen("http://trianglemtb.com/trailstatus.php", None, 5).read()
        table = parse.make2d(bs(s, "html.parser"))
        return table, None
    except timeout:
        logging.error("error: socket timed out")
    except BaseException as error:
        print("error: url fetch: ", error)

    card_title = "Service Error"
    speech_output = ("I'm sorry, there was an error accessing the triangle m.t.b. "
                     "trail status page.")
    should_end_session = True
    return None, build_response({}, build_speechlet_response(
        card_title, speech_output, None, should_end_session))
def HTMLParse():
    # Parse the HTML
    url = "http://222.233.168.6:8094/RoomStatus.aspx"  # library seat-status page
    result = urlopen(url)
    html = result.read()
    soup = BeautifulSoup(html, 'html.parser')  # parse the HTML with BeautifulSoup

    temp = soup.find_all('table')
    # Build the DataFrame
    p = parser.make2d(temp[1])
    df = pd.DataFrame(
        p[1:],
        columns=['a', 'b', 'Use', 'Avail', 'Rate', 'Wait', 'f', 'g'],
        index=['Man', 'Women', 'Adult', 'Adult2', 'Free', 'Notebook', 'Sum'])
    # Drop the unused columns
    del df['a']
    del df['b']
    # print("parse complete")
    return df
def naver_fs_crawler(self):
    for iter_ in self.corp_sym_name_df.iterrows():
        print(iter_)
        corp_code, corp_name = iter_[1]["Symbol"], iter_[1]["Name"]
        url = ("http://companyinfo.stock.naver.com/v1/company/cF1001.aspx"
               "?cmp_cd={}&fin_typ=0&freq_typ=Y").format(corp_code)
        res = req.get(url)
        html_ = bs(res.content, "lxml")
        table_1 = html_.table
        table_2 = parser.make2d(table_1)
        bs_data = pd.DataFrame(table_2).T
        bs_data = bs_data.set_index(1).T.set_index("주요재무정보")
        bs_data.index.name = corp_name, corp_code
        yield bs_data.iloc[:, 2:8]  # yielded one company at a time, as a generator
def open_dart(code, number):
    ttr = {}
    for index, it in enumerate(company_code(code, number)):
        url = 'http://dart.fss.or.kr/dsaf001/main.do?rcpNo=' + it
        html = requests.get(url).text
        title = bs(html, 'html.parser').find('title')
        # Pull the viewDoc(...) arguments that follow the '연결재무제표' entry.
        split = (re.split('연결재무제표', html)[1]
                 .split(r');')[0]
                 .split(r'viewDoc(')[1]
                 .replace("'", "")
                 .split(', '))
        rurl = (f'http://dart.fss.or.kr/report/viewer.do?rcpNo={split[0]}&dcmNo={split[1]}'
                f'&eleId={split[2]}&offset={split[3]}&length={split[4]}&dtd={split[5]}')
        print(rurl)
        result = bs(requests.get(rurl).text, 'html.parser')
        re_title = (result.select('html > body > table')[2]
                    .select_one('tbody > tr > td').findChild().text)
        rt = str(re_title).replace(' ', '')
        print('re_title ==== ', rt)
        if rt == '연결손익계산서' or rt == '연결포괄손익계산서':
            tbody = str(result).split(re_title)[1]
            body = bs(tbody, 'html.parser')
            tr = body.find('table')
            table = parser.make2d(tr)
            ttr['a' + str(index)] = title.text.split('/')[0].replace('\n', '') + '_' + number + '분기'
            ttr['b' + str(index)] = data_set(table)
    return ttr
def fetch_html():
    html = ""
    page = requests.get(URL)
    html = page.content
    return html


if __name__ == '__main__':
    events = []
    soup = bs(fetch_html(), "html.parser")
    event_table = soup.find_all('table')[1]

    # using text_only false because we want soup cells in order to reference href attr
    twod = parse.make2d(event_table, text_only=False)

    for row in twod[2:]:
        # using event_dict to explicitly set custom column keys
        events.append(event_dict(*row))

    pp.pprint(events)
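event_dict and URL are defined elsewhere in the module; a hypothetical sketch of what such a helper could look like, given that make2d(..., text_only=False) hands it soup cells rather than plain strings.

# Hypothetical helper; the real event_dict lives elsewhere in the project.
# Each argument is a soup cell because make2d was called with text_only=False.
def event_dict(date_cell, name_cell, link_cell, *rest):
    link = link_cell.find('a')
    return {
        "date": date_cell.get_text(strip=True),
        "name": name_cell.get_text(strip=True),
        "href": link['href'] if link is not None else None,
    }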
def crawl(j_code, c_name):
    url = ("http://comp.fnguide.com/SVO2/asp/SVD_Main.asp?pGB=1&gicode=A"
           + str(j_code) + "&cID=&MenuYn=Y&ReportGB=&NewMenuID=101")
    html = urlopen(url)
    bsObj = BeautifulSoup(html, "html.parser")

    # Grab the part of the page we need
    div = bsObj.find_all("div", attrs={"id": "div15"})

    # No financial information provided --> [skip]
    if len(div) < 1:
        return "no_info"
    no_data = bsObj.find_all("div", attrs={"id": "divNotData"})
    if len(no_data) > 0:
        no_data = no_data[0].find_all("div", attrs={"class": "um_notdata"})
        if len(no_data) > 0:
            if "재무정보를 제공하지 않습니다." in no_data[0]:
                return "no_info"
    div = div[0]

    # Consolidated - full data
    table = div.find_all("table", attrs={"class": "us_table_ty1 h_fix zigbg_no"})[0]
    html_table = parser.make2d(table)

    # Handle header/row length mismatches
    if len(html_table[0]) != len(html_table[1]):
        if len(html_table[0]) > len(html_table[1]):
            n = len(html_table[0]) - len(html_table[1])
            html_table[0] = html_table[0][:-1 * n]
        else:
            n = len(html_table[1]) - len(html_table[0])
            for i in range(n):
                html_table[0].append('Net Quarter')

    # Debug print ^_^
    print(j_code, c_name)
    df = pd.DataFrame(data=html_table[1:],
                      index=range(0, len(html_table) - 1),
                      columns=html_table[0])
    del df['Net Quarter']
    dfl = df.values.tolist()

    # Edge case: no rows at all
    if len(dfl) < 1:
        return "no_info"
    for i, date in enumerate(dfl[0]):
        if "(E)" in date:
            dfl[0][i] = date[26:]
            if "(E)" in dfl[0][i]:
                dfl[0][i] = dfl[0][i][:-3]
        if "(P)" in date:
            dfl[0][i] = date[24:]
            if "(P)" in dfl[0][i]:
                dfl[0][i] = dfl[0][i][:-3]

    # Handle nulls
    for l in dfl[1:]:
        for i in range(len(l)):
            if l[i] == '':
                l[i] = None

    df = pd.DataFrame(data=dfl[1:], index=range(0, len(dfl) - 1), columns=dfl[0])
    df.name = c_name

    # Edge case: no data at all
    if len(dfl[0]) < 2:
        return "no_info"
    if len(dfl) < 2:
        return "no_info"
    return df
                              t.rpt_nm.string, t.rcp_no.string, t.flr_nm.string,
                              t.rcp_dt.string, t.rmk.string]]),
                        columns=["crp_cls", "crp_nm", "crp_cd", "rpt_nm", "rcp_no",
                                 "flr_nm", "rcp_dt", "rmk"])
    data = pd.concat([data, temp])

data = data.reset_index(drop=True)

user_num = int(input("몇 번째 보고서를 확인하시겠습니까?"))
url = "http://dart.fss.or.kr/dsaf001/main.do?rcpNo=" + data["rcp_no"][user_num]
req = requests.get(url).text
tree = lxml.html.fromstring(req)
onclick = tree.xpath('//*[@id="north"]/div[2]/ul/li[1]/a')[0].attrib['onclick']
pattern = re.compile(r"^openPdfDownload\('\d+',\s*'(\d+)'\)")
dcm_no = pattern.search(onclick).group(1)

url_parsing = ("http://dart.fss.or.kr/report/viewer.do?rcpNo=" + data['rcp_no'][user_num]
               + "&dcmNo=" + dcm_no + "&eleId=15&offset=1489233&length=105206&dtd=dart3.xsd")
report = urlopen(url_parsing)
r = report.read()
xmlsoup_another = bs(r, 'html.parser')
body = xmlsoup_another.find("body")
table = body.find_all("table")
p = parser.make2d(table[3])
resultset = [[]]
with codecs.open('port_list.html', 'r', encoding='cp1251', errors='ignore') as fd:
    for line in fd:
        line = line.lstrip()
        line = line.replace("\xa0", " ")  # replace non-breaking spaces with regular spaces
        doc += line.replace("\n", " ")

soup = bs(doc, "html.parser")
test_table = soup.find('table', {
    'cellspacing': '1',
    'cellpadding': '5',
    'width': '100%'
})
twod_array = parse.make2d(test_table)
twod_array[0] = ['Время', 'Формат', 'Участники', 'Контроль']

for battle in twod_array:
    i = 0
    del battle[3]
    if battle[2] != 'Участники':
        sind1, sind2 = str(battle[2]).split(' vs ', 1)
        sind_1 = sind1.split(' ', 1)
        sind_2 = sind2.split(' ', 1)
        tlist = [
            battle[0], battle[1],
            sind_1[0].replace('#', ''), sind_1[1],
            sind_2[0].replace('#', ''), sind_2[1]
        ]
        resultset.append(tlist)
        i += i
def get_fss(rcp_no):
    url1 = "http://dart.fss.or.kr/dsaf001/main.do?rcpNo={}"
    retry_cnt = 0
    while retry_cnt < 20:
        try:
            page = None
            try:
                page = urlopen(url1.format(rcp_no))
            except:
                retry_cnt += 1
                continue
            html = page.read().decode('utf-8')

            # Search for the viewDoc('11111', '22222', ... call and extract the dcm_no value.
            result = re.search(r'viewDoc\(\'(.*)\', \'(.*)\',', html)
            dcm_no = result.group(2)

            url2 = ("http://dart.fss.or.kr/report/viewer.do?rcpNo={}&dcmNo={}"
                    "&eleId=15&offset=297450&length=378975&dtd=dart3.xsd").format(rcp_no, dcm_no)
            page = None
            try:
                page = urlopen(url2)
            except:
                retry_cnt += 1
                continue
            r = page.read()
            xmlsoup = BeautifulSoup(r, 'html.parser')
            body = xmlsoup.find("body")
            tables = body.find_all("table")

            head = parser.make2d(tables[0])
            dates = []
            for grp in range(1, len(head) - 1):
                dstr = re.search(r'(\d+.\d+.\d+)', head[grp][0])
                date = datetime.datetime.strptime(dstr.group(1), "%Y.%m.%d").date()
                dates.append(date)
            if len(dates) == 0:
                return

            info_tbl = {}

            # Assets
            totals = parser.make2d(tables[1])
            for idx in range(0, len(totals)):
                list = []
                for grp in range(1, len(dates) + 1):
                    list.append(strip_money(totals[idx][grp]))
                info_tbl[totals[idx][0].strip()] = list
                strip_money(totals[idx][2])

            # Net income
            profits = parser.make2d(tables[3])
            for idx in range(0, len(profits)):
                list = []
                for grp in range(0, len(dates)):
                    list.append(strip_money(profits[idx][1 + grp * 2]))
                info_tbl[profits[idx][0].strip()] = list

            return dates, info_tbl
        except:
            # print("error:", sys.exc_info()[0])
            pass
    return
def get_div_data(browser, last_num, file_nm):
    search_btn = browser.find_element_by_id("image1")
    search_btn.click()

    # Grab the html source and locate the part we need.
    html = browser.page_source

    from bs4 import BeautifulSoup
    from html_table_parser import parser_functions as parser
    import pandas as pd

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", attrs={"id": "grid1_body_table"})
    p = parser.make2d(table)
    df = pd.DataFrame(p[2:], columns=p[1])

    import time
    import random
    from tqdm import tqdm

    prev_no = 0
    prev_table = None
    for i in tqdm(range(0, 200)):
        # Click the "next page" button, retrying a couple of times if it is not ready yet.
        try:
            next_btn = browser.find_element_by_id("cntsPaging01_next_btn")
            next_btn.click()
        except:
            time.sleep(2)
            try:
                next_btn = browser.find_element_by_id("cntsPaging01_next_btn")
                next_btn.click()
            except:
                time.sleep(2)
                next_btn = browser.find_element_by_id("cntsPaging01_next_btn")
                next_btn.click()

        def get_html(browser, cnt):
            # Re-read the page until both the page number and the table have changed.
            if cnt >= 4:
                return -1, -1
            html = browser.page_source
            soup = BeautifulSoup(html, 'html.parser')
            cur_no = soup.find(
                "a",
                attrs={"class": "w2pageList_control_label w2pageList_label_selected"})
            cur_no = cur_no.text
            table = soup.find("table", attrs={"id": "grid1_body_table"})
            if cur_no != prev_no and prev_table != table:
                return cur_no, table
            else:
                time.sleep(1)
                return get_html(browser, cnt + 1)

        cur_no, table = get_html(browser, 1)
        if cur_no == -1:
            print("\n종료. 테이블 정보가 바뀌지 않았습니다.")
            break
        p = parser.make2d(table)
        temp = pd.DataFrame(p[2:], columns=p[1])
        df = pd.concat([df, temp], axis=0)
        prev_no = cur_no
        prev_table = html
        if cur_no == str(last_num):
            print("\n최종 페이지 도달")
            break
        time.sleep(random.randrange(3, 5))

    df.to_pickle(file_nm)
# latestsnelist = pd.read_table('recentsnelist.txt')
# latestsnelist = pd.read_table('recentlist.txt')  #, names=colnames, data_start=1, guess='False')
# latestsnelist = ascii.read('recentsnelist.txt', delimiter='\t')  #, names=colnames, data_start=1, guess='False')
imsnglist = ascii.read('/data7/cschoi/IMSNG/target/alltarget.dat')

urlall = "http://www.RochesterAstronomy.org/snimages/sndateall.html"  # sn date all
url = 'http://www.rochesterastronomy.org/snimages/sndate.html'  # sndate

print('getting table data from web page from', url)
response = requests.get(url)
print('Done, table data is obtained')
soup = BeautifulSoup(response.content, 'html.parser')
tbl = soup.find_all('table')
soup.find_all('table')[1].find_all('th')
html_table = parser.make2d(tbl[1])
df = pd.DataFrame(html_table[1:], columns=html_table[0])
latestsnelist = df

print('getting table data from web page from', urlall)
responseall = requests.get(urlall)
print('Done, table data is obtained')
soupall = BeautifulSoup(responseall.content, 'html.parser')
tblall = soupall.find_all('table')
soupall.find_all('table')[1].find_all('th')
html_tableall = parser.make2d(tblall[1])
dfall = pd.DataFrame(html_tableall[1:], columns=html_tableall[0])
latestsnelistall = dfall
def fs_table():
    # Extract financial statements from every business report found by the search.
    data = searching_report()
    document_count = 0
    for i in range(len(data)):
        MAIN_URL = "http://dart.fss.or.kr/dsaf001/main.do?rcpNo=" + data['rcp_no'][document_count]
        print(MAIN_URL)
        page = BeautifulSoup(urlopen(MAIN_URL).read(), 'html.parser')
        body = str(page.find('head'))
        # Start looking for the consolidated statements (연결재무제표), then plain statements (재무제표).
        if len(body.split('연결재무제표",')) <= 1:
            if len(body.split('연 결 재 무 제 표",')) >= 2:
                body = body.split('연 결 재 무 제 표",')[1]  # found as "연 결 재 무 제 표"
                print_page_1 = '연 결 재 무 제 표'
            else:
                # No consolidated statements: fall back to 재무제표.
                if len(body.split('재무제표",')) <= 1:
                    if len(body.split('재 무 제 표",')) >= 2:
                        body = body.split('재 무 제 표",')[1]  # found as "재 무 제 표"
                        print_page_1 = '재 무 제 표'
                    else:
                        print("### Failed. (연결재무제표/재무제표 페이지 탐색 실패.)")
                        return 0  # stop if nothing was found
                else:
                    body = body.split('재무제표",')[1]  # found as "재무제표"
                    print_page_1 = '재무제표'
        else:
            body = body.split('연결재무제표",')[1]  # found as "연결재무제표"
            print_page_1 = '연결재무제표'

        body = body.split('cnt++')[0].split('viewDoc(')[1].split(')')[0].split(', ')
        body = [body[i][1:-1] for i in range(len(body))]
        # Parse the numbers needed to build the url of the financial-statements page.
        VIEWER_URL = ("http://dart.fss.or.kr/report/viewer.do?rcpNo=" + body[0]
                      + '&dcmNo=' + body[1] + '&eleId=' + body[2] + '&offset=' + body[3]
                      + '&length=' + body[4] + '&dtd=dart3.xsd')
        print(VIEWER_URL)
        page = BeautifulSoup(urlopen(VIEWER_URL).read(), 'html.parser')

        # Start looking for the balance sheet (재무상태표).
        if len(str(page.find('body')).split('재 무 상 태 표')) == 1:
            if len(str(page.find('body')).split('재무상태표')) <= 1:
                # Stop if the balance sheet cannot be found.
                print("### Failed. (재무상태표 탐색 실패.)")
                return 0
            else:
                body = str(page.find('body')).split('재무상태표')[1]  # found as "재무상태표"
                print_page_2 = '재무상태표'
        else:
            body = str(page.find('body')).split('재 무 상 태 표')[1]  # found as "재 무 상 태 표"
            print_page_2 = '재 무 상 태 표'

        body = BeautifulSoup(body, 'html.parser')  # parse the balance sheet we found so it can be read
        print(print_page_1 + " - " + print_page_2)
        print(body.find(align='RIGHT').text)
        table = body.find_all('table')  # look for table tags
        if len(table) <= 1:  # stop if none are found
            print("### Failed. (there's no table.)")
            return 0
        p = parser.make2d(table[0])
        table = pd.DataFrame(p[1:], columns=p[0])
        table = table.set_index(p[0][0])
        table.to_csv('C:\\Users\\admin\\Desktop\\Test_Result\\' + print_page_1 + "_"
                     + print_page_2 + '_' + str(document_count) + '.csv',
                     encoding='cp949')
        document_count += 1
    return table
def fnc_table(rcp):
    rcp_no = rcp
    url = "http://dart.fss.or.kr/dsaf001/main.do?rcpNo=" + rcp_no
    page = BeautifulSoup(urlopen(url).read(), 'html.parser', from_encoding='utf-8')
    body = str(page.find('head'))
    body = body.split('【 본 문 】",')[1]
    body = body.split('cnt++')[0]
    body = body.split('viewDoc(')[1]
    body = body.split(')')[0]
    body = body.split(', ')
    body = [body[i][1:-1] for i in range(len(body))]
    url_final = ('http://dart.fss.or.kr/report/viewer.do?rcpNo=' + body[0]
                 + '&dcmNo=' + body[1] + '&eleId=' + body[2] + '&offset=' + body[3]
                 + '&length=' + body[4] + '&dtd=dart3.xsd')
    #print(url_final)

    data = pd.DataFrame()
    page = BeautifulSoup(urlopen(url_final).read(), 'html.parser')
    body = str(page).split('(2) 모집 또는 매출의 개요')[1]
    body = BeautifulSoup(body, 'html.parser')
    table1 = body.find_all("table")
    p = parser.make2d(table1[0])
    table1 = pd.DataFrame(p[0:], columns=["content", "content1", "내용"])
    table1 = table1.convert_dtypes()
    table1.content = table1.content.str.replace('\s+', '')
    #print(table1)
    table1['bool'] = table1.iloc[:, 0].apply(
        lambda x: '종목명' in x or '기초자산' in x or '발행일' in x or '만기일' in x)
    table1 = table1[table1['bool'] == True]
    table1 = table1.reset_index(drop=True)
    #print(table1)
    pdct_nm = table1.loc[0, "내용"]
    pdct_nm = re.sub(r'\([^)]*\)', '', pdct_nm)
    pdct_asset = table1.loc[1, "내용"]
    st_date = table1.loc[2, "content1"]
    exp_date = table1.loc[3, "content1"]
    exp_date = re.sub(r'\([^)]*\)', '', exp_date)
    temp = pd.DataFrame([[rcp_no, pdct_nm, pdct_asset, st_date, exp_date]],
                        columns=["문서번호", "상품명", "기초자산", "발행일", "만기일"])
    data = pd.concat([data, temp])

    try:
        body = str(page).split('최대이익액 및')[1]
    except:
        try:
            body = str(page).split('최소이익액 및')[1]
        except:
            body = str(page).split('최대손실액 및')[1]
    body = BeautifulSoup(body, 'html.parser')
    table2 = body.find_all("table")
    p = parser.make2d(table2[0])
    table2 = pd.DataFrame(p[0:], columns=["구분", "내용", "수익률"])
    table2['bool'] = table2.iloc[:, 0].apply(
        lambda x: '최대손실액' in x or '최대이익액' in x or '최소이익액' in x)
    table2 = table2[table2['bool'] == True]
    loss_max = table2['구분'] == '최대손실액'
    earn_min = table2['구분'] == '최소이익액'
    min = table2[loss_max | earn_min]
    min = min.reset_index(drop=True)
    min_rate = min.loc[0, "수익률"]
    temp_max = table2['구분'] == '최대이익액'
    max = table2[temp_max]
    max = max.reset_index(drop=True)
    max_rate = max.loc[0, "수익률"]
    earn_temp = pd.DataFrame([[min_rate, max_rate]], columns=["최소수익", "최대수익"])
    data = pd.concat([data, earn_temp], axis=1)
    #print(min_rate)
    return data
for i in range(0, len(report_df)):
    crp_cd, crp_cls, crp_nm, rcp_dt, rcp_dt, rcp_no, rpt_nm, period = make_object(report_df, i)
    soup = fs.financial_statements_soup(rcp_no, name='임원 및 직원의 현황')
    ob_, ob_unit = financial_statements_data(soup, name='.*임.*원.*현.*황.*')
    for k in range(0, len(soup.find_all('table'))):
        find = str(soup.find_all('table')[k])
        if re.search('의결권', find):
            ob_soup = soup.find_all('table')[k]
            ob_soup = BeautifulSoup(str(ob_soup), 'lxml')
        elif re.search('직 원 수', find):
            yb_soup = soup.find_all('table')[k]
            yb_soup = BeautifulSoup(str(yb_soup), 'lxml')

    p = parser.make2d(yb_soup)
    emp = pd.DataFrame(p, columns=p[0])
    emp.columns = emp.loc[1]
    sex_ratio(emp, '1')
    earning(emp, '2')
    years(emp, '3')
    count_emp(emp, '4')

    p = parser.make2d(ob_soup)
    emp = pd.DataFrame(p, columns=p[0])
    emp.columns = emp.loc[1]
    ob_ages(emp, '5')

    msg = MIMEMultipart('related')
    msg['Subject'] = Header('[test]' + crp_nm + ' 사업보고서 임직원 현황 시각화 테스트중', 'utf-8')
    msg['From'] = formataddr((str(Header(u'M.Robo', 'utf-8')), '*****@*****.**'))
from bs4 import BeautifulSoup as bs
from html_table_parser import parser_functions as parse
from html_table_parser.tests import test_html_table_parser as test
import pprint

pp = pprint.PrettyPrinter(indent=4, width=120)

__author__ = 'oswaldjones'


if __name__ == '__main__':
    soup = bs(test.mock_html_table(), "html.parser")
    test_table = soup.find('table')
    twod_array = parse.make2d(test_table)

    # print 2D array
    pp.pprint(twod_array)

    # print column data by col heading name (case insensitive)
    pp.pprint(parse.twod_col_data(twod_array, 'first name'))
    pp.pprint(parse.twod_col_data(twod_array, 'lAst naMe'))

    # row data begins on first row after col headings
    # so rowstart is 1
    pp.pprint(parse.make_dict(test_table, 1))
def crawl_basics(self):
    text = get_html("https://pokemon.fandom.com/ko/wiki/파르셀_(포켓몬)")
    soup = BeautifulSoup(text, features="html.parser")
    body = []
    body_table = [[], []]
    for div in soup("div"):
        if "class" in div.attrs:
            # Pokemon's name
            if "name-ko" in div["class"]:
                self.pokemon.Basic.name = div.text.strip()
            # Pokedex number
            if "index" in div["class"]:
                self.pokemon.Basic.number = div.text[3:]  # strip the "No." prefix

    # Extract the table (body_table) that holds the basic information
    tables = soup.find_all("table")
    for table in tables:
        if "class" in table.attrs:
            if "body" in table.attrs["class"]:
                body = parser_functions.make2d(table)
    print(body)

    type_i = 0
    j = 1
    for item in body:
        if type_i % 2 == j % 2:
            for values in item:
                if "도감 번호" in values:
                    body_table[0].append(values)
                    j += 0.5
                else:
                    body_table[1].append(values)
        else:
            for columns in item:
                body_table[0].append(columns)
        type_i += 1
    print(body_table)

    # Pull the details out of the table
    for key in body_table[0]:
        index = body_table[0].index(key)
        if key == "타입":
            type_i = 0
            while True:
                type_i += 1
                if body_table[1][index][0:type_i] in TYPES:
                    self.pokemon.Basic.types.append(body_table[1][index][0:type_i])
                    self.pokemon.Basic.types.append(body_table[1][index][type_i:])
                    break
        elif key == "분류":
            self.pokemon.Basic.species = body_table[1][index]
        elif key == "특성":
            abilities_i = 0
            while True:
                abilities_i += 1
                if body_table[1][index][0:abilities_i] in ABILITIES:
                    self.pokemon.Battle.ord_abilities.append(body_table[1][index][0:abilities_i])
                    self.pokemon.Battle.ord_abilities.append(body_table[1][index][abilities_i:])
                    break
    print(self.pokemon.Basic.types)
    print(self.pokemon.Battle.ord_abilities)
aodkor = AODlist.kor[k]
for i in range(15):
    airport = my_dict_ARP.loc[i, 'url_airport']
    url = (base_url + '&depArr=' + depArr + '&current_date=' + date
           + '&airport=' + airport + '&al_icao=&fp_id=')
    driver = webdriver.Chrome('C:/Users/Danah/Documents/chromedriver_win32/chromedriver.exe')
    driver.implicitly_wait(5)
    driver.get(url)
    html = driver.page_source
    soup = bs(html, 'html.parser')
    temp = soup.find_all('table')
    driver.close()

    p = parser.make2d(temp[1])
    df = pd.DataFrame(p[0:], columns=['FLO_kor', '', 'FLT', '', 'ODP_kor', '', 'STT', '',
                                      'Expected Time', '', 'ATT', '', 'Type', '', 'DLY'])
    df = df.iloc[::2, ::2]
    df = df.drop(['Expected Time', 'Type', 'ATT'], axis=1)
    col = list(df.columns)
    df.insert(col.index('ODP_kor'), 'ODP', df['ODP_kor'].map(my_dict_ARP.set_index('kor')['eng']))
    df = df.drop('ODP_kor', axis=1)
    df.insert(col.index('FLO_kor'), 'FLO', df['FLO_kor'].map(my_dict_FLO.set_index('kor')['eng']))
    df = df.drop('FLO_kor', axis=1)
    df.DLY = list([0 if df['DLY'].iloc[i] == aodkor
                   else 1 if df['DLY'].iloc[i] == '지연'
                   else np.nan for i in range(0, len(df))])
    df = df.dropna(axis=0)
    df['ARP'] = np.repeat(my_dict_ARP.loc[i, 'eng'], df.shape[0])
    df['SDT_YY'] = np.repeat(2019, df.shape[0])
    df['SDT_MM'] = np.repeat(9, df.shape[0])