Example #1
def test_invalid_flavor():
    url = "google.com"
    flavor = "invalid flavor"
    msg = r"\{" + flavor + r"\} is not a valid set of flavors"

    with pytest.raises(ValueError, match=msg):
        read_html(url, "google", flavor=flavor)
Example #2
def test_parse_failure_rewinds():
    # Issue #17975
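    # when the first parser flavor fails, read_html should seek the file-like object back to the start and retry with the next flavor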
    _skip_if_no('lxml')
    _skip_if_no('bs4')

    class MockFile(object):
        def __init__(self, data):
            self.data = data
            self.at_end = False

        def read(self, size=None):
            data = '' if self.at_end else self.data
            self.at_end = True
            return data

        def seek(self, offset):
            self.at_end = False

        def seekable(self):
            return True

    good = MockFile('<table><tr><td>spam<br />eggs</td></tr></table>')
    bad = MockFile('<table><tr><td>spam<foobr />eggs</td></tr></table>')

    assert read_html(good)
    assert read_html(bad)
Example #3
def test_invalid_flavor():
    url = "google.com"
    flavor = "invalid flavor"
    msg = r"\{" + flavor + r"\} is not a valid set of flavors"

    with tm.assert_raises_regex(ValueError, msg):
        read_html(url, "google", flavor=flavor)
Example #4
 def test_parse_dates_combine(self):
     raw_dates = Series(date_range('1/1/2001', periods=10))
     df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())),
                     'time': raw_dates.map(lambda x: str(x.time()))})
     res = read_html(df.to_html(), parse_dates={'datetime': [1, 2]},
                     index_col=1)
     newdf = DataFrame({'datetime': raw_dates})
     tm.assert_frame_equal(newdf, res[0])
Example #5
 def test_multiple_header_rows(self):
     # Issue #13434
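     # a DataFrame with a two-row (MultiIndex) header should round-trip through to_html and read_html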
     expected_df = DataFrame(data=[("Hillary", 68, "D"),
                                   ("Bernie", 74, "D"),
                                   ("Donald", 69, "R")])
     expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"],
                            ["Name", "Unnamed: 1_level_1",
                             "Unnamed: 2_level_1"]]
     html = expected_df.to_html(index=False)
     html_df = read_html(html)[0]
     tm.assert_frame_equal(expected_df, html_df)
Example #6
def test_parse_failure_unseekable():
    # Issue #17975
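    # a stream that is not seekable cannot be rewound after a failed parse, so read_html should raise instead of retrying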
    _skip_if_no('lxml')

    class UnseekableStringIO(StringIO):
        def seekable(self):
            return False

    good = UnseekableStringIO('''
        <table><tr><td>spam<br />eggs</td></tr></table>''')
    bad = UnseekableStringIO('''
        <table><tr><td>spam<foobr />eggs</td></tr></table>''')

    assert read_html(good)
    assert read_html(bad, flavor='bs4')

    bad.seek(0)

    with pytest.raises(ValueError,
                       match='passed a non-rewindable file object'):
        read_html(bad)
Example #7
 def getBasic(cls,stk_id):
     # stk_id can be either stock id or stock name
     try:
         res=urlopen(cls.getUrlStock(stk_id))
     except Exception as ex:
         print('error: %s' % ex)
         return None
     content=res.read().decode('utf-8')
     strErase=findall("<tr class='tblHead'>.*\n.*</div>",content)[0]
     content=content.replace(strErase,'')
     df=read_html(content)[1]
     return df
Example #8
    def test_keep_default_na(self):
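        # keep_default_na=False preserves the literal 'N/A'/'NA' strings; keep_default_na=True parses them as NaN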
        html_data = """<table>
                        <thead>
                            <th>a</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                            <td> N/A</td>
                            </tr>
                            <tr>
                            <td> NA</td>
                            </tr>
                        </tbody>
                    </table>"""

        expected_df = DataFrame({'a': ['N/A', 'NA']})
        html_df = read_html(html_data, keep_default_na=False)[0]
        tm.assert_frame_equal(expected_df, html_df)

        expected_df = DataFrame({'a': [np.nan, np.nan]})
        html_df = read_html(html_data, keep_default_na=True)[0]
        tm.assert_frame_equal(expected_df, html_df)
Example #9
    def _option_frames_from_url(self, url):
        frames = read_html(url)
        nframes = len(frames)
        frames_req = max(self._TABLE_LOC.values())
        if nframes < frames_req:
            raise RemoteDataError("%s options tables found (%s expected)" % (nframes, frames_req))

        if not hasattr(self, 'underlying_price'):
            try:
                self.underlying_price, self.quote_time = self._get_underlying_price(url)
            except IndexError:
                self.underlying_price, self.quote_time = np.nan, np.nan

        calls = self._process_data(frames[self._TABLE_LOC['calls']], 'call')
        puts = self._process_data(frames[self._TABLE_LOC['puts']], 'put')

        return {'calls': calls, 'puts': puts}
Example #10
    def test_na_values(self):
        # GH 13461
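        # values passed via na_values should be parsed as NaN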
        html_data = """<table>
                        <thead>
                            <th>a</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                            <td> 0.763</td>
                            </tr>
                            <tr>
                            <td> 0.244</td>
                            </tr>
                        </tbody>
                    </table>"""

        expected_df = DataFrame({'a': [0.763, np.nan]})
        html_df = read_html(html_data, na_values=[0.244])[0]
        tm.assert_frame_equal(expected_df, html_df)
Example #11
    def test_converters(self):
        # GH 13461
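        # a converter applied to column 'a' keeps the parsed values as strings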
        html_data = """<table>
                        <thead>
                            <th>a</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                            <td> 0.763</td>
                            </tr>
                            <tr>
                            <td> 0.244</td>
                            </tr>
                        </tbody>
                    </table>"""

        expected_df = DataFrame({'a': ['0.763', '0.244']})
        html_df = read_html(html_data, converters={'a': str})[0]
        tm.assert_frame_equal(expected_df, html_df)
Example #12
    def grab_result(self):
        print('grabbing results...')
        #self.inspect("results_aftersleep")
        #form = self.driver.find_element_by_xpath('//*[@id="simOutput"]')
        #stattab = WebDriverWait(form, 1000).until(EC.presence_of_element_located((By.ID, "test-statsBtn")))
        #stattab = form.find_element_by_xpath('//*[@id="test-statsBtn"]')
        stattab = None
        time0 = 0
        while not stattab and time0 < 1000:
            try:
                stattab = self.driver.find_element_by_xpath('//*[@id="test-statsBtn"]')
            except:
                sleep(1)
                time0 += 1
        stattab.click()
        sleep(1)
        form = self.driver.find_element_by_xpath('//*[@id="statsTab"]')
        table = form.find_element_by_xpath('//*[@id="pnlStats"]/div/div/div/div/div/table').get_attribute('outerHTML')
        self.data = read_html(table)[0]
        col= self.data.columns[:-1]
        self.data.drop(self.data.columns[0],axis=1,inplace=True)
        self.data.columns = col
        print (self.data)
Example #13
    def _option_frames_from_url(self, url):
        frames = read_html(url)
        nframes = len(frames)
        frames_req = max(self._TABLE_LOC.values())
        if nframes < frames_req:
            raise RemoteDataError("%s options tables found (%s expected)" % (nframes, frames_req))

        if not hasattr(self, 'underlying_price'):
            try:
                self.underlying_price, self.quote_time = self._underlying_price_and_time_from_url(url)
            except IndexError:
                self.underlying_price, self.quote_time = np.nan, np.nan

        calls = frames[self._TABLE_LOC['calls']]
        puts = frames[self._TABLE_LOC['puts']]

        if len(calls) == 0 or len(puts) == 0:
            raise RemoteDataError('Received no data from Yahoo at url: %s' % url)

        calls = self._process_data(calls, 'call')
        puts = self._process_data(puts, 'put')

        return {'calls': calls, 'puts': puts}
Example #14
def table_to_dataframe(bs_table, **kwargs):
    df = read_html(str(bs_table), header=0, infer_types=True)[0]
    df.columns = map( lambda x: regex.findall(x)[0], df.columns) #strips annoying <sup> numbers
    #data cleaning
    df[df.columns[-1]] = df[df.columns[-1]].apply( lambda x:x[1:11])
    df[df.columns[-3]] = df[df.columns[-3]].apply( lambda x:x[1:11])
    for key,item in kwargs.iteritems():
        df[key] = item
    try:
      df['End of Term'] = df['Mandatory retirement']
      del df['Mandatory retirement']
    except Exception as e:
      pass
    try:
      df['End of Term'] = df['Retired']
      del df['Retired']
    except:
      pass
    try:
      del df['']
    except:
      pass
    return df
Example #15
                             "MaximalMonthOf18", "Mean5yr", "Mean12/13-17/18"])
    nextpagebutton = driver.find_element_by_xpath('//button[@id="nextLot"]')
    backyearbutton = driver.find_element_by_xpath('//span[@id="wr_backInTime"]')
    finalyearbutton = driver.find_element_by_xpath('//span[@id="wr_forwardToEnd"]')
    muteswanpage = 1
    totalmuteswanpages = int(driver.find_element_by_xpath('//select[@id="pageNo"]'
                                                          '/option[last()]').get_attribute("value"))

    while True:
        print(" - Processing page {}...".format(muteswanpage))
        finalyearbutton.click()
        table = driver.find_element_by_xpath('//table[@class="maintable"]'
                                             '/tbody[@id="wr_webs_report"]'
                                             '/..'
                                             '/..')
        table = read_html(table.get_attribute("innerHTML"))[0]
        for i in range(3):
            for j in range(5):
                backyearbutton.click()
            table = driver.find_element_by_xpath('//table[@class="maintable"]'
                                                 '/tbody[@id="wr_webs_report"]'
                                                 '/..'
                                                 '/..')
            table = read_html(table.get_attribute("innerHTML"))[0]
            muteswantable = pd.concat([table, table[table.columns[2:7]]], axis=1)
        cols = muteswantable.columns.tolist()
        cols = [cols[0]] + cols[12:] + cols[2:7] + cols[8:11]
        muteswantable = muteswantable[cols]
        muteswantable = np.object_(muteswantable)

        for row in muteswantable:
Example #16
def test_same_ordering():
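    # the lxml and bs4 flavors should return the tables in the same order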
    _skip_if_none_of(['bs4', 'lxml', 'html5lib'])
    filename = os.path.join(DATA_PATH, 'valid_markup.html')
    dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
    dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
    assert_framelist_equal(dfs_lxml, dfs_bs4)
Example #17
 def read_html(self, *args, **kwargs):
     kwargs['flavor'] = self.flavor
     return read_html(*args, **kwargs)
Example #18
# extract the table matching the given class attribute
from pandas.io.html import read_html
page = 'https://www.ft.com/content/691390ca-53d9-11ea-90ad-25e377c0ee1f?fbclid=IwAR3TfgUYCgwsZLN-ad-GnFN7lUcUEurB86SHRHVJewO6ZkL3XrwMGjxzJm4'

tables = read_html(page, attrs={"class":"o-table"})

file_name = './my_file.csv'

tables[0].to_csv(file_name, sep=',')

print("Extracted {num} tables".format(num=len(tables)))

print(tables)

#
# Example with BeautifulSoup, then conversion to a DataFrame
#

# import requests
# from bs4 import BeautifulSoup
# import pandas as pd

# website_url = requests.get('https://www.ft.com/content/691390ca-53d9-11ea-90ad-25e377c0ee1f?fbclid=IwAR3TfgUYCgwsZLN-ad-GnFN7lUcUEurB86SHRHVJewO6ZkL3XrwMGjxzJm4').text

# soup = BeautifulSoup(website_url,'lxml')

# My_table = soup.find('table',{'class':'o-table o-table--row-stripes o-table--compact o-table--responsive-overflow o-table--sortable'})

# links = My_table.findAll('a')

# events = []
Example #19
def test_same_ordering(datapath):
    filename = datapath('io', 'data', 'valid_markup.html')
    dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
    dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
    assert_framelist_equal(dfs_lxml, dfs_bs4)
Example #20
 def read_file_like(self, f, encoding):
     with open(f, 'rb') as fobj:
         return read_html(BytesIO(fobj.read()), encoding=encoding,
                          index_col=0)
Example #21
#!/usr/bin/env python3

# date: 2020.01.18
#

from pandas.io.html import read_html

url = 'https://en.wikipedia.org/wiki/List_of_Game_of_Thrones_episodes'

wikitables = read_html(
    url,
    index_col=0,
    attrs={"class": "wikitable plainrowheaders wikiepisodetable"})

print("Extracted {num} wikitables".format(num=len(wikitables)))

for i, dataframe in enumerate(wikitables):
    dataframe.to_csv('file{}.csv'.format(i))
Example #22
def test_bs4_version_fails(monkeypatch):
    import bs4
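    # pretend an outdated BeautifulSoup is installed; the bs4 flavor should then be rejected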
    monkeypatch.setattr(bs4, '__version__', '4.2')
    with tm.assert_raises_regex(ValueError, "minimum version"):
        read_html(os.path.join(DATA_PATH, "spam.html"), flavor='bs4')
Example #23
    #     break
    # else:
    #     continue
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
x=True
j = 2
i = 1
master_df = pd.DataFrame()
from selenium.common.exceptions import NoSuchElementException
while x:
    try:
        if driver.find_element_by_xpath(fr'/html/body/div[1]/section/div[{j}]/h2[{i}]').text == seven_days_ago:
            print('end')
            break

        table = driver.find_element_by_xpath(fr'/html/body/div[1]/section/div[{j}]')
        table_html = table.get_attribute('innerHTML')

        df = read_html(table_html)[i-1]
        df['DATE']=driver.find_element_by_xpath(fr'/html/body/div[1]/section/div[{j}]/h2[{i}]').text
        i+=1

    except NoSuchElementException:
        j+=1
        if i!=1:
            i=1
        table = driver.find_element_by_xpath(fr'/html/body/div[1]/section/div[{j}]')
        df = read_html(table_html)[i-1]
        df['DATE'] = driver.find_element_by_xpath(fr'/html/body/div[1]/section/div[{j}]/h2[{i}]').text
    master_df=pd.concat([df,master_df])
Example #24
from pandas.io.html import read_html
import pandas
import csv

page = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
wikitables = read_html(page, attrs={"class": "wikitable"})
te = wikitables[0].to_csv()

#print(wikitables)
#print("extracted {} number of tables".format(len(wikitables)))
#print(wikitables[0].shape)
# wikitables[0].shape gives the table's dimensions as (rows, columns); .head() shows the first rows and .tail() the last rows

with open('snp500.csv') as S:
    reader = csv.DictReader(S)
    #print(reader)
    candles = list(reader)

csv_file = open('snp500.csv', 'w')
csv_writer = csv.writer(csv_file)

csv_writer.writerow(['ticker', 'name'])
for item in candles:
    csv_writer.writerow([item['Symbol'], item['Security']])

csv_file.close()
Example #25
from bs4 import BeautifulSoup
import requests
import html5lib
from pandas.io.html import read_html
import unicodedata

# Initialize empty lists for dataframes of both stat types
traditional_stats = []
advanced_stats = []
# Loop through the two stat pages (traditional and advanced) for the relevant years
for year in range(2012, 2021):
    # Get traditional and advanced urls
    trad_url = f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
    adv_url = f"https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html"
    # Read in both stat tables
    trad_table = read_html(trad_url, attrs={"class": "stats_table"})[0]
    adv_table = read_html(adv_url, attrs={"class": "stats_table"})[0]
    # Add year column to discern season by season
    trad_table['Date'] = year
    adv_table['Date'] = year
    traditional_stats.append(trad_table)
    advanced_stats.append(adv_table)

# Initialize an empty list for the salary dataframes
salaries = []
for year in range(2012, 2021):
    # Get base web page
    salary_url = f"http://www.espn.com/nba/salaries/_/year/{year}/seasontype/1"
    page = requests.get(salary_url).text
    soup = BeautifulSoup(page, 'html5lib')
    # Get the number of pages for each year
Example #26
def test_same_ordering():
    _skip_if_none_of(["bs4", "lxml", "html5lib"])
    filename = os.path.join(DATA_PATH, "valid_markup.html")
    dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"])
    dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"])
    assert_framelist_equal(dfs_lxml, dfs_bs4)
Example #27
    makedirs(dp)
except OSError:
    pass  # dir exists

# output files
hiphen_affix_path = '%s/medaffix_with_hiphens.txt' % dp
affix_path = '%s/medaffix.txt' % dp
suffix_path = '%s/medsuffix.txt' % dp
prefix_path = '%s/medprefix.txt' % dp

# Wikipedia URL
url = 'http://en.wikipedia.org/wiki/List_of_medical_roots,_' \
      'suffixes_and_prefixes'

# parsed tables at this URL
tables = read_html(url, attrs={'class': 'wikitable'}, header=0)

# names of interesting columns in the tables
regular_keys = [
    'Affix',
    'Greek root in English',
    'Latin root in English',
    'Other root in English'
]

# former names of interesting columns
# in case they are restored in the future
ignoramus_keys = [
    'Preffix or suffix',
    'Preffix/suffix'
]
Example #28
 def run_read_html(self, *args, **kwargs):
     self.flavor = ['lxml']
     self.try_skip()
     kwargs['flavor'] = kwargs.get('flavor', self.flavor)
     return read_html(*args, **kwargs)
Example #29
 def read_html(self, *args, **kwargs):
     kwargs['flavor'] = kwargs.get('flavor', self.flavor)
     return read_html(*args, **kwargs)
Example #30
 def read_filename(self, f, encoding):
     return read_html(f, encoding=encoding, index_col=0)
Example #31
 def read_string(self, f, encoding):
     with open(f, 'rb') as fobj:
         return read_html(fobj.read(), encoding=encoding, index_col=0)
Example #32
def to_dataframe(r):
  return read_html(r.text, infer_types=False, header=0)[0]
Example #33
data = db['data']

counter = 0

#Capturing data for male dogs 
categories= ['Origin', 'Height', 'Weight', 'Color', 'Coat', 'AKC', 'FCI', 'ANKC', 'CKC', 'KC (UK)', 'NZKC', 'UKC']

with open('dogdata/urls.txt') as f:
    lst = f.read().splitlines() 

for url in lst:
    d = defaultdict(str)
    print(url)
    counter += 1
    try:
        infobox = read_html(url, index_col=0, attrs={"class":"infobox biota"})
        dog_breed = url.replace('https://en.wikipedia.org/wiki/','').replace('_(dog)', '').replace('_',' ')
        d['Breed'] = dog_breed
        success = True
    except:
        print("Error: " , sys.exc_info()[0])
        success = False
        continue
    if success:
        for cat in categories:
            try:
                if (cat == 'Color'):
                    d[cat] = infobox[0].xs(cat).values[0][0]
                elif (cat == 'Weight'): # unnecessary due to additional dataset on height and weight
                    if infobox[0].xs(cat).values[0][0] == 'Male':
                        d[cat] = infobox[0].xs(cat).values[0][1]
Example #34
    def GetAccountDataFromSinaBase(self,
                                   code,
                                   year,
                                   dataArr,
                                   table_type='zcfzb'):
        '''
        table_type :"zcfzb","lrb","llb","fhpg"
        Base function for getting account data from Sina.
        A try/except block is added to avoid exceptions.
        http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_BalanceSheet/stockid/600519/ctrl/part/displaytype/4.phtml
        http://money.finance.sina.com.cn/corp/go.php/vFD_BalanceSheet/stockid/600519/ctrl/2019/displaytype/4.phtml
        '''
        if (table_type == ''):
            table_type = 'zcfzb'
            Id = "BalanceSheetNewTable0"
            FINIANCE_SINA_URL = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_BalanceSheet/stockid/%s/ctrl/%s/displaytype/4.phtml'
            furl = FINIANCE_SINA_URL % (code, year)  # fetch the data (standard handling)
        # zcfzb id="BalanceSheetNewTable0"
        # FINIANCE_SINA_URL = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_BalanceSheet/stockid/%s/ctrl/%s/displaytype/4.phtml'
        # lrb id="ProfitStatementNewTable0"
        # FINIANCE_SINA_URL = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_ProfitStatement/stockid/%s/ctrl/%s/displaytype/4.phtml'
        # llb id="ProfitStatementNewTable0"
        # FINIANCE_SINA_URL = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_CashFlow/stockid/%s/ctrl/%s/displaytype/4.phtml'
        if (table_type == 'zcfzb'):
            Id = "BalanceSheetNewTable0"
            FINIANCE_SINA_URL = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_BalanceSheet/stockid/%s/ctrl/%s/displaytype/4.phtml'
            furl = FINIANCE_SINA_URL % (code, year)  # fetch the data (standard handling)
        if (table_type == 'lrb'):
            Id = "ProfitStatementNewTable0"
            FINIANCE_SINA_URL = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_ProfitStatement/stockid/%s/ctrl/%s/displaytype/4.phtml'
            furl = FINIANCE_SINA_URL % (code, year)  # fetch the data (standard handling)
        if (table_type == 'llb'):
            Id = "ProfitStatementNewTable0"
            FINIANCE_SINA_URL = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_CashFlow/stockid/%s/ctrl/%s/displaytype/4.phtml'
            furl = FINIANCE_SINA_URL % (code, year)  # fetch the data (standard handling)
        if (table_type == 'fhpg'):
            Id = "sharebonus_1"
            FINIANCE_SINA_URL = 'http://money.finance.sina.com.cn/corp/go.php/vISSUE_ShareBonus/stockid/%s.phtml'
            furl = FINIANCE_SINA_URL % (code)  # fetch the data (standard handling)
        getH = RandomHeader()
        headers = getH.GetHeader()
        request = urllib2.Request(furl, headers=headers)
        text = urllib2.urlopen(request, timeout=5).read()
        text = text.decode('gbk')
        html = lxml.html.parse(StringIO(text))  # isolate the target data
        # res = html.xpath("//table[@id=\"BalanceSheetNewTable0\"]")#ProfitStatementNewTable0

        res = html.xpath(("//table[@id=\"%s\"]") % Id)
        sarr = [etree.tostring(node).decode('gbk') for node in res]  # store the result
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr  # roll back one year
        # year-=1
        # check whether this is the last page, based on whether any data is present
        # read the data into a DataFrame and concatenate it

        df = read_html(sarr)[0]
        df.columns = range(0, df.shape[1])
        df = df.set_index(df.columns[0])
        dataArr = [dataArr, df]
        # dataArr = pd.concat(dataArr, axis=1, join='inner')
        dataArr = pd.concat(dataArr, axis=1)
        return dataArr
Example #35
 def test_bool_header_arg(self):
     # GH 6114
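     # header must be an int or sequence of ints, so a boolean raises TypeError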
     for arg in [True, False]:
         with pytest.raises(TypeError):
             read_html(self.spam_data, header=arg)
Example #36
def _run_read_html(*args, **kwargs):
    _skip_if_no_parser()
    return read_html(*args, **kwargs)
Example #37
def test_invalid_flavor():
    url = 'google.com'
    with pytest.raises(ValueError):
        read_html(url, 'google', flavor='not a* valid**++ flaver')
Example #38
def game(hometeam, awayteam, week, year):
	"""Download, parse, and clean the spreads & over-under tables for one game.

	The columns are pinnacle, betonline, bookmaker each with suffix _spread or
	_over_under; datetime; hometeam, awayteam, favored; week. The first three
	are the bookies and give the spreads from the point of view of the favored
	team (so they're generally nonpositive).
	"""
	with urlopen(spread_url(hometeam, awayteam, week, year)) as connection:
		spreads_page = connection.read()
	# Note that infer_types is deprecated and won't work starting in Pandas 0.14
	LOG.debug('Getting game %s', (hometeam, awayteam, week, year))
	sp = read_html(io=spreads_page.decode('utf-8'),
					 match="History", attrs={'id': 'table-000'},
					 infer_types=False, header=0,
					 skiprows=[1, 2, 3])
	if len(sp) != 1:
		raise CantFindTheRightTable
	sp = sp.pop()

	# Get the over-under page
	ou = read_html(io=over_under_url(hometeam, awayteam, week, year),
				   match="History", attrs={'cellspacing': 0},
				   infer_types=False, header=0,
				   skiprows=[1, 2, 3])
	if len(ou) != 1:
		raise CantFindTheRightTable
	ou = ou.pop()

	# Cleaning.
	for t, name, date_col in (sp, 'spread', 'Unnamed: 0'), (ou, 'over_under', '\xa0'):
		datetime = pd.to_datetime(
			t[date_col]
			.replace(r'(\d\d?/\d\d?)', r'\1/%d' % year, regex=True)
			.replace(r'(01|02)/(\d\d?)/\d{4}', r'\1/\2/%d' % (year + 1),
					 regex=True))
		del t[date_col]

		# Replace all the '--' as missing so we can convert numbers to floats.
		for column in t.keys():
			t[column] = (t[column]
						 .replace('--', 'nan')
						 .replace('(Pick)', 0)
						 .apply(float))

		# Add datetime back in after the str-to-float conversion so we don't do
		# it for the datetime.
		t['datetime'] = datetime

		# Lowercase column names for ease of programming later
		t.columns = [h.lower() for h in t.columns]

		# Give spreads/over-under their suffixes
		for col in 'pinnacle', 'betonline', 'bookmaker':
			t[col + '_' + name] = t[col]
			del t[col]

	data = sp.merge(ou, on=['datetime'], how='outer')
	assert set(data.datetime) == (set(sp.datetime) | set(ou.datetime))

	# Add this function's arguments to the table.
	data['hometeam'] = hometeam
	data['awayteam'] = awayteam
	data['week'] = week

	# Get favored team from the big "Odds: Washington by 4," that shows up at the
	# top of the page.
	soup = BeautifulSoup(spreads_page)
	subheader = soup.find('p', attrs={'class': 'h1-sub'}).find('strong')
	m = _FAVORED_RE.search(subheader.contents[0])
	if m is None or not m.group('city'):
		raise ValueError("Couldn't figure out who was favored: %r" %
						 (subheader.contents))
	city = m.group('city').replace(' ', '-').replace('.', '').lower()
	# city will be something like 'san-francisco' after the transformations
	# above. Find what team that is by looking for the links to the teams that
	# are also in that subheader.
	for link in subheader.findAll('a'):
		link = link['href']
		if city in link:
			data['favored'] = link.split('-')[-1]
			break
	else:
		raise ValueError("couldn't figure out who %s is" % city)

	return data
Example #39
 def read_html(self, *args, **kwargs):
     kwargs.setdefault('flavor', self.flavor)
     return read_html(*args, **kwargs)
Example #40
for x in list:
    try:
        matchup_df = matchup_df.append(matchup_scraper(str(x)))
    except:
        break

## Calculate season stats and record from matchup stats in R
driver.get(
    'https://www.fantrax.com/fantasy/league/8i8nwftijzzq6mwq/standings?startDate=2019-10-02&endDate=2020-04-04&hideGoBackDays=true&period=5&timeStartType=PERIOD_ONLY&timeframeType=YEAR_TO_DATE&view=SEASON_STATS&pageNumber=1'
)
time.sleep(5)
table = driver.find_element_by_xpath(
    '/html/body/app-root/div/div[1]/div/app-league-standings/div/section/league-standings-tables/div/div[2]/ultimate-table/div/section/div'
)
table_html = table.get_attribute('innerHTML')
season_df = read_html(table_html)[0]

teams = driver.find_element_by_xpath(
    '/html/body/app-root/div/div[1]/div/app-league-standings/div/section/league-standings-tables/div/div[2]/ultimate-table/div'
)
teams_html = teams.get_attribute('innerHTML')
teams = re.findall("</figure>.*?<!---->", teams_html)
# categories = re.findall('">.*?</a></th>', teams_html)
for x in range(0, len(teams)):
    teams[x] = teams[x][10:]
    teams[x] = teams[x][:-8]
# for x in range(0,len(categories)):
#     categories[x] = re.findall(';">.*?</a></th>',categories[x])
#     categories[x] = str(categories[x])[6:]
#     categories[x] = categories[x][:-11]
# season_df.columns = categories
Example #41
def test_invalid_flavor():
    url = "google.com"
    with tm.assertRaises(ValueError):
        read_html(url, "google", flavor="not a* valid**++ flaver")
Example #42
def season_games(year):
	"""Download, parse, and clean a table of games and scores for given season.

	The columns are week; hometeam; awayteam; winner; date; points, yards, and
	turnovers for the winning team; points, yards, and turnovers for the
	losing team; and season.
	"""
	LOG.debug('Getting season %d', year)
	data = read_html(io=season_games_url(year),
					  attrs={'id': 'games'},
					  infer_types=False,
					  header=0)
	if len(data) != 1:
		raise CantFindTheRightTable
	data = data.pop()

	# Cleaning.
	del data["Unnamed: 3"]
	# The code below issues a UserWarning, so we catch and ignore it.
	with warnings.catch_warnings():
		warnings.filterwarnings(action='ignore', category=UserWarning,
								module=r'pandas\.core\.frame',
								message=(r"Boolean Series key will be reindexed"
										 r" to match DataFrame index\."))
		# These rows are mid-table header rows.
		data = data[data.Week != "Week"][data.Week != "nan"]

	data['week'] = (data.Week
					.replace("WildCard", "wild-card")
					.replace("Division", "divisional")
					.replace("ConfChamp", "conference")
					.replace("SuperBowl", "super-bowl")
					.apply(
						lambda s: (int(s)
								   if all(c in '1234567890' for c in s)
								   else s)))
	del data['Week']

	data['season'] = year
	data['game_date'] = pd.to_datetime(
		data.Date
		.replace(r"$", r", %d" % year, regex=True)
		.replace(r"^(January|February) (\d+), \d+$", r"\1 \2, %d" % (year + 1),
				 regex=True))
	del data['Date']

	for column in "PtsW", "PtsL", "YdsW", "TOW", "YdsL", "TOL":
		data[column] = data[column].apply(int)

	data['WatL'] = data['Unnamed: 5'].apply(lambda x: x == '@')
	del data['Unnamed: 5']
	data['hometeam'] = (~data.WatL * data['Winner/tie'] +
						data.WatL * data['Loser/tie'])
	data['awayteam'] = (data.WatL * data['Winner/tie'] +
						~data.WatL * data['Loser/tie'])
	data['winner'] = data['Winner/tie']
	for column in 'Winner/tie', 'Loser/tie', "WatL":
		del data[column]
	for column in 'hometeam', 'awayteam', 'winner':
		data[column] = data[column].apply(lambda s: s.split()[-1].lower())

	return data
Example #43
#finalamt = usd_to_pkr(float(rate),float(gbp))
#print(str(rupees)+" USD is equvalent to "+str(finalamt)+" PKR")
# url = "https://www.lme.com/"
#
#
# page = requests.get(url)
#
# tables = read_html(page.text, attrs={"class":"ring-times"})
# print(tables[0].head())
# tables[0].to_excel("df.xlsx")

url = "https://www.lme.com/en-GB/Metals/Non-ferrous#tabIndex=0"

page = requests.get(url)

tables = read_html(page.text)
print(tables[0].head())
tables[0]['Zinc'] = tables[0]['Zinc'].apply(lambda x: x * float(rate))
tables[0].to_excel("df.xlsx")
#
# soup = bs(url.content, 'html.parser')
#
# filename = 'test.csv'
#
# csv_writer = csv.writer (open(filename, 'w'))
#
# heading = soup.find('h2')
# # table = soup.find_all("table")
#
# for tr in soup.find_all('tr'):
#     data = []
Example #44
def test_invalid_flavor():
    url = 'google.com'
    with tm.assertRaises(ValueError):
        read_html(url, 'google', flavor='not a* valid**++ flaver')
Example #45
 def read_filename(self, f, encoding):
     return read_html(f, encoding=encoding, index_col=0)
Example #46
def test_same_ordering():
    _skip_if_none_of(['bs4', 'lxml', 'html5lib'])
    filename = os.path.join(DATA_PATH, 'valid_markup.html')
    dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
    dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
    assert_framelist_equal(dfs_lxml, dfs_bs4)
Example #47
 def read_string(self, f, encoding):
     with open(f, 'rb') as fobj:
         return read_html(fobj.read(), encoding=encoding, index_col=0)
Example #48
def import_yahoo(symbol):
    equity_data = pd.DataFrame()
    if symbol == 'Brent':
        i = 0
        for i in range(0, 2):
            try:
                # driver = webdriver.Chrome(options=chrome_options)
                # driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
                # driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, chrome_options=chrome_options)
                driver = webdriver.Chrome(
                    executable_path=os.environ.get("CHROMEDRIVER_PATH"),
                    chrome_options=chrome_options)
                url = "https://markets.businessinsider.com/commodities/historical-prices/oil-price/usd?type=brent"
                driver.get(url)
                time.sleep(3)
                table = driver.find_element_by_xpath(
                    '//*[@id="historic-price-list"]/div/div[2]/table/..')
                #table = driver.find_element_by_xpath('//*[@id="historic-price-list"]/div/div[2]/table/')
                table_html = table.get_attribute('innerHTML')
                equity_data = read_html(table_html)[0]
                equity_data = equity_data.set_index(
                    pd.DatetimeIndex(equity_data['Date'])).drop(
                        ['Date'], axis=1).rename_axis('trade_date')
                equity_data = equity_data.rename(
                    columns={"Closing Price": "close"})
                equity_data = equity_data['close'].reset_index().set_index(
                    'trade_date')
                driver.quit()
            except:
                driver.quit()
                i += 1
                print(f"Still trying {2-i} more times.")

    else:
        equity_data = yf.download(
            symbol,
            start=(datetime.today() - dateutil.relativedelta.relativedelta(
                months=28)).strftime('%Y-%m-%d'),
            end=datetime.today().strftime('%Y-%m-%d')).rename_axis(
                'trade_date')
        equity_data = equity_data.rename(columns={"Close": "close"})
        equity_data = equity_data['close'].reset_index().set_index(
            'trade_date')

    if not equity_data.empty:
        # First part of the insert statement
        insert_init = """insert into equity_history
                (trade_date, ticker, close)
                values
                """
        # Add values for all days to the insert statement
        if symbol == 'BZ':
            symbol = 'Brent'
        vals = ",".join([
            """('{}', '{}', '{}')""".format(str(trade_date), symbol, row.close)
            for trade_date, row in equity_data.iterrows()
        ])

        # Handle duplicates - Avoiding errors if you've already got some data in your table
        insert_end = """ on duplicate key update
            close=close;"""

        # Put the parts together
        query = insert_init + vals + insert_end

        # Fire insert statement
        engine.execute(query)
Example #49
 def test_bool_header_arg(self):
     # GH 6114
     for arg in [True, False]:
         with tm.assertRaises(TypeError):
             read_html(self.spam_data, header=arg)
Example #50
def test_bs4_version_fails(monkeypatch, datapath):
    import bs4
    monkeypatch.setattr(bs4, '__version__', '4.2')
    with tm.assert_raises_regex(ValueError, "minimum version"):
        read_html(datapath("io", "data", "spam.html"), flavor='bs4')
Example #51
#import packages
import requests
from bs4 import BeautifulSoup
import lxml.html as lh
import pandas as pd
import openpyxl
from selenium import webdriver
from pandas.io.html import read_html
#start getting tables
#Overall table
driver = webdriver.Safari()
driver.get('https://fbref.com/en/comps/9/Premier-League-Stats')
table = driver.find_element_by_id('div_results32321_overall')
table_html = table.get_attribute('innerHTML')
lt_df = read_html(table_html)[0]
#lt_df.columns = lt_df.columns.get_level_values(1)
#driver.close()
#Home Away Table
#driver = webdriver.Safari()
#driver.get('https://fbref.com/en/comps/9/Premier-League-Stats')
table = driver.find_element_by_id('div_results32321_home_away')
table_html = table.get_attribute('innerHTML')
homeaway_df = read_html(table_html)[0]
homeaway_df.columns = homeaway_df.columns.get_level_values(1)
#driver.close()
#Squads table
#driver = webdriver.Safari()
#driver.get('https://fbref.com/en/comps/9/Premier-League-Stats')
table = driver.find_element_by_id('div_stats_standard_squads')
table_html = table.get_attribute('innerHTML')
squads_df = read_html(table_html)[0]
Example #52
 def read_file_like(self, f, encoding):
     with open(f, 'rb') as fobj:
         return read_html(BytesIO(fobj.read()),
                          encoding=encoding,
                          index_col=0)
Example #53
 def read_html(self, *args, **kwargs):
     kwargs['flavor'] = self.flavor
     return read_html(*args, **kwargs)
Example #54
import wikipedia as wiki
import string
import pandas as pd
from pandas.io.html import read_html

seedpage = wiki.page("List of United States cities by population")

url = seedpage.url

wikitables = read_html(url, attrs={"class": "wikitable sortable"})

print("Extracted {num} wikitables".format(num=len(wikitables)))
print(url)
table_df = wikitables[0]


def removecite(x):
    ''' Simple function to remove citations from the scraped table'''
    if type(x) == str:
        return x.partition('[')[0]
    else:
        return x


def getsummary(x):
    summaries = []
    for city in list(x):
        try:
            summary = wiki.page(city, auto_suggest=True, redirect=True).summary
        except (wiki.exceptions.DisambiguationError) as e:
            summary = "Summary not fetched due to disambiguation"
Example #55
 def read_html(self, *args, **kwargs):
     kwargs.setdefault('flavor', self.flavor)
     return read_html(*args, **kwargs)
Example #56
def test_bs4_version_fails(monkeypatch, datapath):
    import bs4

    monkeypatch.setattr(bs4, "__version__", "4.2")
    with pytest.raises(ImportError, match="Pandas requires version"):
        read_html(datapath("io", "data", "html", "spam.html"), flavor="bs4")
Example #57
def test_bs4_version_fails(monkeypatch, datapath):
    import bs4
    monkeypatch.setattr(bs4, '__version__', '4.2')
    with pytest.raises(ValueError, match="minimum version"):
        read_html(datapath("io", "data", "spam.html"), flavor='bs4')
Example #58
def test_same_ordering(datapath):
    filename = datapath("io", "data", "html", "valid_markup.html")
    dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"])
    dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"])
    assert_framelist_equal(dfs_lxml, dfs_bs4)
Example #59
def test_same_ordering(datapath):
    filename = datapath('io', 'data', 'valid_markup.html')
    dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
    dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
    assert_framelist_equal(dfs_lxml, dfs_bs4)
Example #60
 def read_html(self, *args, **kwargs):
     kwargs['flavor'] = kwargs.get('flavor', self.flavor)
     return read_html(*args, **kwargs)