Code Example #1
    def fillTournaments(self, myAPIServer):
        server = myAPIServer + 'data/tournament'

        page = 0
        while page <= 5400:
            try:
                xhtml = self.url_get_contents(
                    'https://www.cagematch.net/?id=26&s=' +
                    str(page)).decode('utf-8')
                page = page + 100
                p = HTMLTableParser()
                p.feed(xhtml)
                table = p.tables[0]
                table.pop(0)

                for title in table:
                    data = self.Tournament(0, title[2], 'FIFA', None).toJSON()
                    response = requests.post(server,
                                             data=data,
                                             headers=headers)
                    FillSportAndTournaments.lastIdTournament = int(
                        response.text)
                    print 'Tournament with id ' + response.text + ' was added.'
            except:
                pass
Code Example #2
File: Sample.py  Project: ajk6204/ProjectPickle
def main():
    url = 'https://finance.yahoo.com/quote/AAPL/history/'
    xhtml = url_get_contents(url).decode('utf-8')

    p = HTMLTableParser()
    p.feed(xhtml)
    pprint(p.tables)
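Note: many of the standalone examples in this list call a url_get_contents() helper that is not shown. A minimal sketch of what such a helper typically looks like, assuming it simply fetches the raw page bytes with urllib.request (individual projects may implement it differently; Example #26, for instance, uses a variant that returns a response object instead of bytes):

import urllib.request


def url_get_contents(url):
    # Fetch the raw bytes of the page; the callers in these examples
    # decode the result themselves with .decode('utf-8').
    req = urllib.request.Request(url=url)
    with urllib.request.urlopen(req) as response:
        return response.read()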
Code Example #3
def obtain_parse_wiki_stocks_sp500(url):
  """Download and parse the Wikipedia list of S&P500 
  constituents using requests and libxml.

  Returns a list of tuples for to add to MySQL."""

  # Get S&P500 website content
  req = urllib.request.Request(url)
  response = urllib.request.urlopen(req)
  data = response.read().decode('utf-8')
  
  #Instantiate the parser and feed it
  p = HTMLTableParser()
  p.feed(data)
  table_list = p.tables
  table = table_list[0][1:]

  # Obtain the symbol information for each row in the S&P500 constituent table
  symbols = []
  for row in table:
    sd = {'ticker': row[0],
        'name': row[1],
        'sector': row[3]}
    # Create a tuple (for the DB format) and append to the grand list
    symbols.append(sd['ticker'])
  return symbols
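A brief usage sketch for the function above, assuming the standard Wikipedia page for S&P 500 constituents and that urllib.request and HTMLTableParser are imported at module level:

if __name__ == '__main__':
    sp500_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    tickers = obtain_parse_wiki_stocks_sp500(sp500_url)
    print('Parsed %d tickers, e.g. %s' % (len(tickers), tickers[:5]))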
Code Example #4
    def run(self, params={}):
        p = HTMLTableParser()
        p.feed(params.get(Input.TAP_ALERT))
        data = p.tables
        clean_data = TAP(data).data

        return {Output.RESULTS: clean_data}
Code Example #5
def main():
    url = 'https://www.investagrams.com/Stock/Analysis/TechnicalAnalysis'
    xhtml = url_get_contents(url).decode('utf-8')

    p = HTMLTableParser()
    p.feed(xhtml)
    pprint(p.tables)
Code Example #6
def check_new_data():
    date_from = (datetime.utcnow() - timedelta(days=5)).strftime("%d.%m.%Y")
    date_to = datetime.utcnow().strftime("%d.%m.%Y")
    resp_body = req.smsStat(date_from, date_to, config.child_id)
    print(resp_body)
    if resp_body != 0:
        soup = BeautifulSoup(resp_body, "html.parser")
        p = HTMLTableParser()
        try:
            sms_tab = soup.find('table').find("table")
            p.feed(str(sms_tab))
        except:
            print("not found element on page")
            print("resp_body:")
            print(resp_body)
        try:
            sms_data = p.tables[0]
            print('sms_data = ', sms_data.pop(0))
        except IndexError:
            sms_data = []
            print('sorry, no data')
        for x in sms_data:
            db.ins_stat(x)
    else:
        print("Request error, skip...")
Code Example #7
def read_first_table(url):
    """ Read the given URL, parse the result, and return the first table.
    """
    xhtml = read_url(url)
    parser = HTMLTableParser()
    parser.feed(xhtml)
    return parser.tables[0]  # Use first table on the page
Code Example #8
def main():
    url = 'http://www.twitter.com'
    xhtml = url_get_contents(url).decode('utf-8')

    p = HTMLTableParser()
    p.feed(xhtml)
    pprint(p.tables)
Code Example #9
def dataInitialisation():
    xhtml = url_get_contents(
        'https://nssdc.gsfc.nasa.gov/planetary/factsheet/').decode('utf-8')
    p = HTMLTableParser()
    p.feed(xhtml)
    solarSystemData = p.tables[0]

    row_dict = {
        "mass": 1,
        "diameter": 2,
        "density": 3,
        "gravity": 4,
        "escape velocity": 5,
        "rotation period": 6,
        "length of day": 7,
        "distance from sun": 8,
        "perihelion": 9,
        "aphelion": 10,
        "orbital period": 11,
        "orbital velocity": 12,
        "orbital inclination": 13,
        "orbital eccentricity": 14,
        "obliquity to orbit": 15,
        "mean temperature": 16,
        "surface pressure": 17,
        "number of moons": 18,
        "ring system?": 19,
        "global magnetic field?": 20,
    }

    col_dict = {
        "mercury": 1,
        "venus": 2,
        "earth": 3,
        "moon": 4,
        "mars": 5,
        "jupiter": 6,
        "saturn": 7,
        "uranus": 8,
        "neptune": 9,
        "pluto": 10,
    }

    rocket_mass = {
        "pigeon 10": 549054,
        "pigeon 10 heavy": 549054,
        "big fat rocket": 3401942,
        "air captain": 14220,
        "air captain heavy": 733000,
    }

    rocket_thrust = {
        "pigeon 10": 7607000,
        "pigeon 10 heavy": 934000,
        "big fat rocket": 9900000,
        "air captain": 110000,
        "air captain heavy": 240000,
    }

    return [solarSystemData, row_dict, col_dict, rocket_mass, rocket_thrust]
Code Example #10
def access_elements(driver, proj_type, matchup_list):
    # parses the HTML and gets us the elements we need

    # get today's games
    today_games = driver.find_element_by_css_selector('.day:nth-child(1)')
    # the parser is in a separate module; this will instantiate it
    p = HTMLTableParser()
    # the parser needs the raw HTML, not the webObjects
    p.feed(today_games.get_attribute('innerHTML'))
    # iterate through the table rows
    for matchup in p.tables:
        # each row will print in the following format:
        # [['', '7 p.m. Eastern', 'Elo spread', 'Win prob.', 'Score'], [''], ['', '', 'Nuggets', '', '40%', '', ''], ['', '', 'Pacers', '-2.5', '60%', '', ''], ['']]
        # each matchup has 5 rows; we care about rows 3 & 4.
        # within each row, the name and win probability are what we need
        away_team_long = team_name_formatting(matchup[2][2])
        away_team_win_prob = string_percent_to_integer(matchup[2][4])
        home_team_long = team_name_formatting(matchup[3][2])
        home_team_win_prob = string_percent_to_integer(matchup[3][4])

        game_dict = {
            "game_key": away_team_long + home_team_long,
            "away_team": {
                "team_name": away_team_long,
                "win_prob": away_team_win_prob
            },
            "home_team": {
                "team_name": home_team_long,
                "win_prob": home_team_win_prob
            }
        }
        # add each game to the tracking list
        matchup_list.append(game_dict)

    return matchup_list
Code Example #11
def main():
    url = 'https://www.skysports.com/premier-league-table'
    xhtml = url_get_contents(url).decode('utf-8')

    #using html parser
    p = HTMLTableParser()
    p.feed(xhtml)
    pprint(p.tables)
Code Example #12
def get_open_countries():
    # `response` is assumed to be a requests.Response fetched elsewhere in the module
    parser = HTMLTableParser(data_separator=', ')
    parser.feed(response.text)
    table = parser.tables[0]
    open_date = 'I know when countries open borders! \n\n'
    for date, countries in table:
        open_date += f'{date}: {countries} \n'
    return open_date
Code Example #13
def table_response(payload):
    response = requests.post(
        "https://wish.wis.ntu.edu.sg/webexe/owa/AUS_SCHEDULE.main_display1",
        data=payload)
    parser = HTMLTableParser()
    parser.feed(response.text)
    table = parser.tables
    return (table)
Code Example #14
def main():
    url = 'http://www.rich.co.ke/rcdata/nsestocks.php'
    xhtml = url_get_contents(url).decode('utf-8')

    p = HTMLTableParser()
    p.feed(xhtml)
    #pprint(p.tables[14])
    df = pd.DataFrame(p.tables[14])
    df.to_csv('stocks.csv', index=False, encoding='utf-8')
Code Example #15
    def test_singleTable(self):
        input = self.getTestInput("single_table")

        uut = HTMLTableParser(decode_html_entities=False, data_separator=' ')
        uut.feed(input)
        actual = uut.tables

        self.assertEqual(1, len(actual))
        self.checkNumericTableValues(actual[0], 3, 10, 0)
Code Example #16
    def test_keepCharRefs(self):
        input = self.getTestInput("with_char_ref")

        uut = HTMLTableParser(decode_html_entities=False, data_separator=' ')
        uut.feed(input)
        actual = uut.tables

        self.assertEqual(1, len(actual))
        expected_values = ["&gt;", "&lt;", "&#36;", "&amp;", "&#61;"]
        self.checkTableValues(actual[0], 1, 5, expected_values)
Code Example #17
    def test_decodeCharRefs(self):
        test_input = self.getTestInput("with_char_ref")

        uut = HTMLTableParser(decode_html_entities=True, data_separator=' ')
        uut.feed(test_input)
        actual = uut.tables

        self.assertEqual(1, len(actual))
        expected_values = [">", "<", "$", "&", "="]
        self.checkTableValues(actual[0], 1, 5, expected_values)
Code Example #18
def laneBarcodeHTML(path):
    """Retrieve data from the laneBarcode.html page, the data is per barcode/sample per lane"""
    try:
        samples_dict = {}

        dict_samples = {
            'Lane': {'column': 'Lane'},
            'Project': {'column': 'Project'},
            'Sample_name': {'column': 'Sample'},
            'Barcode_sequence': {'column': 'Barcode sequence'},
            'PF_Clusters': {'column': 'PF Clusters'},
            'PCT_of_lane': {'column': '% of the lane'},
            'PCT_perfect_barcode': {'column': '% Perfect barcode'},
            'PCT_one_mismatch_barcode': {'column': '% One mismatch barcode'},
            'Yield_Mbases': {'column': 'Yield (Mbases)'},
            'PCT_PF_Clusters': {'column': '% PF Clusters'},
            'PCT_Q30_bases': {'column': '% >= Q30 bases'},
            'Mean_Quality_Score': {'column': 'Mean Quality Score'}
        }

        samplehtml = commands.getoutput('find {path}/Data/Intensities/BaseCalls/Reports/html/*/all/all/all/ -iname \'laneBarcode.html\''.format(
            path=str(path)
            ))

        with open(samplehtml, 'r') as sample:
            html = sample.read()
            tableParser = HTMLTableParser()
            tableParser.feed(html)
            tables = tableParser.tables                         # tables[1]==run tables[2]==sample

            header_samplehtml = tables[2][0]

            for col in dict_samples:
                dict_samples[col]['index'] = header_samplehtml.index(dict_samples[col]['column'])

            for sample_lane in tables[2][1:]:
                data_sample_lane = {}
                if sample_lane[header_samplehtml.index('Project')].upper() != 'DEFAULT':
                    stats = [convert_numbers(item.replace(',', '')) for item in sample_lane]

                    lane = stats[header_samplehtml.index('Lane')]
                    sample = stats[header_samplehtml.index('Sample')]
                    lane_sample = str(lane) + '--' + str(sample)

                    for col in dict_samples:
                        stat = stats[dict_samples[col]['index']]
                        data_sample_lane[col] = stat

                    samples_dict[lane_sample] = data_sample_lane

        return samples_dict

    except Exception, e:
        print(e)
Code Example #19
    def test_twoTables(self):
        test_input = self.getTestInput("two_tables")

        uut = HTMLTableParser(decode_html_entities=False, data_separator=' ')
        uut.feed(test_input)
        actual = uut.tables

        self.assertEqual(2, len(actual))

        self.checkNumericTableValues(actual[0], 3, 10, 0)
        self.checkNumericTableValues(actual[1], 3, 10, 100)
Code Example #20
def get_match_players(html_page):
    match_players = set()
    players_tables_containers = html_page.find_class('col-soupisky-home') + html_page.find_class('col-soupisky-visitor')
    for tables_container in players_tables_containers:
        for table in tables_container.findall('table'):
            table_string = tostring(table, encoding='utf-8')
            table_parser = HTMLTableParser()
            table_parser.feed(table_string.decode('utf-8'))
            for row in table_parser.tables[0][1:]:
                if row[0]:
                    match_players.add(extract_player_name(row[2]))
    return match_players
Code Example #21
def main():
    url = 'https://w3schools.com/html/html_tables.asp'
    xhtml = url_get_contents(url).decode('utf-8')

    p = HTMLTableParser()
    p.feed(xhtml)

    # Get all tables
    pprint(p.tables)

    # Get tables with id attribute
    pprint(p.named_tables)
Code Example #22
def main(ip):
    s = NeSession()
    try:
        url = "http://" + ip + ":20080/EMSRequest/VoltageStatistics"
        re = s.get(url)
    except:
        url = "https://" + ip + "/EMSRequest/VoltageStatistics"
        re = s.get(url, verify=False)
    xhtml = re.text.strip()

    p = HTMLTableParser()
    p.feed(xhtml)
    return p.tables
Code Example #23
    def test_withInternalTags(self):
        input = self.getTestInput("tags_inside_cells")

        uut = HTMLTableParser(decode_html_entities=False, data_separator='-')
        uut.feed(input)
        actual = uut.tables

        self.assertEqual(1, len(actual))
        expected_values = [
            "0-zero", "1-one", "2-two", "3-three", "4-four", "5-five", "6-six",
            "7-seven", "8-eight", "9-nine-nuevo"
        ]
        self.checkTableValues(actual[0], 1, 10, expected_values)
Code Example #24
File: tasks.py  Project: petrgru/flask-remenarna
def data_web_update():

    from webb import webb
    from aplikace.models import Product
    #    from locale import atof
    import re
    id = 0
    from html_table_parser import HTMLTableParser
    for polozka in Product.notKL():
        url = "http://www.vskprofi.cz/vyhledavani?type=sku&search=" + polozka.Obj + "&sku=OK"
        page = webb.download_page(url)
        p = HTMLTableParser()
        p.feed(page.decode('utf-8'))
        #print(p.tables)
        ar = p.tables
        try:
            data = Product.find_by_Obj(polozka.Obj)
            for i in range(6, 10):

                if re.search('technical-data', ar[0:1][0][1][i]):
                    data.TL = ar[0:1][0][1][i]
                    #print ar[0:1][0][1][i]
                if re.search('product-lists', ar[0:1][0][1][i]):
                    data.KL = ar[0:1][0][1][i]
                    #print ar[0:1][0][1][i]
                if re.search('pics', ar[0:1][0][1][i]):
                    data.Foto = ar[0:1][0][1][i]
                    #print ar[0:1][0][1][i]
            data.sklad = ar[0:1][0][1][3]
            if ar[0:1][0][1][11]:
                data.Poznamka = ar[0:1][0][1][11]
            #print data.Obj
            data.update(commit=False)
            id = id + 1
            if id % 100 == 0:
                print "aktualizuji data"
                db.session.commit()


#        for i in ar[0:1][0][1]:db.session.commit()rint
#                print(i)
#print(float(re.split(" ", ar[0:1][0][1][4])[0].replace(".","").replace(",",".")))

        except:

            print "Chyba" + str(id) + " " + polozka.Obj
            db.session.commit()
            #data_web_update.delay()

    db.session.commit()
    return True
Code Example #25
File: models.py  Project: jash6/StockGuru
    def get_live_data(self, stock):
        url = 'https://in.finance.yahoo.com/quote/' + stock + '.NS/history/'
        html = self.get_contents(url).decode('utf-8')
        parser = HTMLTableParser()
        parser.feed(html)
        livedata = pd.DataFrame(parser.tables[0],
                                columns=parser.tables[0][0])[[
                                    'Date', 'Close*'
                                ]].rename(columns={'Close*': 'Close'})
        livedata.drop(livedata.index[[0, -1]], inplace=True)
        livedata['Date'] = pd.to_datetime(livedata['Date']).dt.date
        livedata = livedata.reindex(index=livedata.index[::-1])
        livedata.dropna(inplace=True)
        return livedata
Code Example #26
File: check.py  Project: Singizin/priemka
def parse(url):
    abit = []
    xhtml = url_get_contents(url).read().decode('utf-8')
    p = HTMLTableParser()
    p.feed(xhtml)
    new = list(p.tables)
    del new[0]  # remove the site search box table (NSTU site)
    del new[0][0:3]  # remove the table header rows
    # print(new[0])
    # print(len(new[0]))
    for i in new[0]:
        #print(len(i))
        abit.append(i[1] if len(i) > 1 else i[0])
    return abit
Code Example #28
def get_payers_scored(html_page):
    score_tables = html_page.find_class('table-last-right')

    players_scored = set()
    for table in score_tables:
        for redundant_el in table.find_class('row-plus-minus'):
            redundant_el.getparent().remove(redundant_el)
        table_string = tostring(table, encoding='utf-8')
        table_parser = HTMLTableParser()
        table_parser.feed(table_string.decode('utf-8'))
        for row in table_parser.tables[0][1:]:
            if len(row) > 3:
                players_scored.add(extract_player_name(row[2]))

    return players_scored
Code Example #29
def parse_devicelist(data_str):
    """Parse the BT Home Hub 5 data format."""

    p = HTMLTableParser()
    p.feed(data_str)

    known_devices = p.tables[9]

    devices = {}

    for device in known_devices:
        if len(device) == 5 and device[2] != '':
            devices[device[2]] = device[1]

    return devices
Code Example #30
    def test_namedTables(self):
        input = self.getTestInput("named_tables")

        uut = HTMLTableParser(decode_html_entities=False, data_separator=' ')
        uut.feed(input)
        actual = uut.tables
        actual_named = uut.named_tables

        self.assertEqual(2, len(actual))
        self.assertEqual(2, len(actual_named))

        self.checkNumericTableValues(actual[0], 3, 10, 0)
        self.checkNumericTableValues(actual[1], 3, 10, 100)

        self.checkNumericTableValues(actual_named["named_table_one"], 3, 10, 0)
        self.checkNumericTableValues(actual_named["named_table_two"], 3, 10,
                                     100)
Code Example #31
    def run(self, params={}):
        p = HTMLTableParser()
        p.feed(params.get(Input.TAP_ALERT))
        data = p.tables
        clean_data = TAP(data).data

        # Get the Threat details URL which is NOT an HTML table element, but instead the <a> link of the
        #    table element
        extractor = URLExtract()
        cleaned_input_for_extractor = params.get(Input.TAP_ALERT)
        cleaned_input_for_extractor = cleaned_input_for_extractor.replace('\n', '')
        urls_from_input = extractor.find_urls(cleaned_input_for_extractor)
        threat_details_urls = list(filter(lambda u: r'threat/email' in u and r'threatinsight.proofpoint.com' in u[:40],
                                          urls_from_input))
        if threat_details_urls:
            clean_data['threat']['threat_details_url'] = threat_details_urls[0]

        return {Output.RESULTS: clean_data}
Code Example #32
    def test_nestedTable(self):
        test_input = self.getTestInput("nested_table")

        uut = HTMLTableParser(decode_html_entities=False, data_separator=' ')
        uut.feed(test_input)
        actual = uut.tables

        self.assertEqual(2, len(actual))

        nested_table = actual[0]
        self.checkNumericTableValues(nested_table, 2, 3, 0)

        outer_table = actual[1]
        expected_first_row = ['', '1', '2']
        expected_second_row = ['10', '11', '12']
        self.assertEqual(2, len(outer_table))
        self.assertListEqual(outer_table[0], expected_first_row)
        self.assertListEqual(outer_table[1], expected_second_row)
Code Example #33
    def update(self):
        _LOGGER.debug('Updating Waste collection dates using scraper')

        try:
            today = datetime.today()
            year = today.strftime('%Y')
            suffix_url = self.postcode + "/" + self.street_number + "/" + year + "/"
            url = "https://gemeente.groningen.nl/afvalwijzer/groningen/" + suffix_url
            req = urllib.request.Request(url=url)
            f = urllib.request.urlopen(req)
            xhtml = f.read().decode('utf-8')
            p = HTMLTableParser()
            p.feed(xhtml)
            waste_dict = {}
            fraction_name = ""
            if p.tables[0]:
                for table_row in p.tables[0][1:]:
                    for i in range(13):
                        if table_row[i]:
                            if i == 0:
                                fraction_name = table_row[i].split("  ")[0]
                                if "Klein chemisch afval kunt u" in fraction_name:
                                    fraction_name = "Klein chemisch afval"
                                waste_dict[fraction_name] = []
                            else:
                                for day in table_row[i].split(" "):
                                    try:
                                        waste_dict[fraction_name].append(
                                            datetime.strptime(
                                                (day.replace("*", "") + " " +
                                                 str(i) + " " + year),
                                                "%d %m %Y"))
                                    except (ValueError, TypeError):
                                        pass
                self.data = waste_dict
            else:
                _LOGGER.error(
                    'Error occurred while fetching data. Probably the postcode/street number is incorrect.'
                )
                self.data = None
        except urllib.error.URLError as exc:
            _LOGGER.error('Error occurred while fetching data: %r', exc.reason)
            self.data = None
            return False
Code Example #34
def get_players_data_by_their_stats(players_data_url, headers):
    if isinstance(players_data_url, list):
        urls_to_process = players_data_url
    else:
        urls_to_process = [players_data_url]

    result_df = None
    for url in urls_to_process:
        response = requests.get(url, headers=headers)
        doc = html.fromstring(response.text)
        data_table_elements = doc.find_class('table-stats')
        table_string = html.tostring(data_table_elements[0], encoding='utf-8').decode()
        html_table_parser = HTMLTableParser()
        html_table_parser.feed(table_string)
        data_table = html_table_parser.tables[0]
        df = pd.DataFrame(data_table[1:], columns=data_table[0])
        if result_df is None:
            result_df = df
        else:
            merge_on = ['Jméno', 'Tým', 'Z']
            result_df = result_df.merge(df, left_on=merge_on, right_on=merge_on, )
    return result_df
Code Example #35
def get_players_data_from_team_stats(team_name, team_stats_url, headers):
    html_page = get_html_page(team_stats_url, headers)

    team_players_stats = html_page.find_class("tablehead")

    players_stats = []
    players_data_columns = []
    for table in team_players_stats:
        for redundant_el in table.find_class("stathead"):
            redundant_el.getparent().remove(redundant_el)
        table_string = tostring(table, encoding="utf-8")
        table_parser = HTMLTableParser()
        table_parser.feed(table_string.decode("utf-8"))
        if table_parser.tables[0][0][0] == "PLAYER" and "SOG" in table_parser.tables[0][0]:
            players_data_columns = table_parser.tables[0][0]
            for row in table_parser.tables[0][1:]:
                row[0] = extract_player_name(row[0])
                row.append(team_name)
                players_stats.append(row)
            break
    players_data_columns.append("TEAM")
    result_df = pd.DataFrame(columns=players_data_columns, data=players_stats)
    return result_df.apply(pd.to_numeric, errors="ignore")