Example #1
def login(driver, credential_json, verbose=True):

    info = utils.load_json(credential_json)
    utils.open_url(driver, host + '/id?redirectTo=%2F', verbose=verbose)
    driver.find_element_by_id('Username').send_keys(info['username'])
    driver.find_element_by_id('Password').send_keys(info['password'])
    driver.find_element_by_id('login').click()
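
Example #1 (and the other Selenium-based examples on this page) relies on a project-local utils.open_url(driver, url, ...) helper that is not shown here. A minimal sketch of such a helper, assuming the verbose and reopen keyword arguments seen in these calls, could look like the following; it is an illustration, not the actual utils module:

# Hypothetical sketch only; the real utils.open_url used by these examples
# is not included on this page.
def open_url(driver, url, reopen=False, verbose=False):
    """Navigate a Selenium WebDriver to url, optionally forcing a reload."""
    if verbose:
        print('opening {} ...'.format(url))
    if reopen and driver.current_url == url:
        driver.refresh()
    else:
        driver.get(url)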
Example #2
def get_filter_options_url(driver, filt_name):
    # filt_name = 'SKILL LEVELS', 'ROLES', 'SUBJECTS TO LEARN'
    # 'TOOLS', 'CERTIFICATIONS', 'AUTHORS'
    # reload the search url
    utils.print_message('get options of filter {} ...'.format(filt_name))
    utils.open_url(driver, search_url, reopen=True)

    opt_url_dict = dict()
    for filt in driver.find_elements_by_xpath(
            '//li[starts-with(@class, "facet__section ")]'):
        if filt_name != filt.find_element_by_xpath('./div/h3').text:
            continue

        # expand option list
        if (filt.get_attribute('class') ==
                'facet__section l-search__facets-list--item'):
            filt.find_element_by_xpath('./div/div').click()

        # get all options
        for opt in filt.find_elements_by_xpath('.//a[@role="checkbox"]'):
            opt_name = opt.get_attribute('aria-label')
            opt_url = opt.get_attribute('href')
            opt_url_dict[opt_name] = opt_url

    utils.print_message('found urls of {} options'.format(len(opt_url_dict)))

    return opt_url_dict
Example #3
def open(self, url):
    sentiments = VoiceAnalyzer().recognize()
    if sentiments:
        max_key = max(sentiments, key=sentiments.get)
        if max_key == 'neu' or max_key == 'pos':
            utils.speak(self.response)
            utils.open_url(url)
Example #4
def get_content(driver, chpt_url, out_html_name, out_audio_name):

    utils.open_url(driver, chpt_url, verbose=True)
    elem = driver.find_element_by_xpath('//div[@id="bible_chapter_content"]')
    save_html(elem.get_attribute('outerHTML'), out_html_name)
    try:
        audio_url = driver.find_element_by_xpath('//audio').get_attribute('src')
        utils.download_file(audio_url, out_audio_name)
    except:
        pass
Example #5
def league_elo():
    """
    – Rank
    – Team
    – Rating
    – Playoffs %
    – Division %
    – World Series %
    """
    url = 'https://projects.fivethirtyeight.com/2018-mlb-predictions/'
    soup = open_url(url)

    tbody = soup.find('tbody')
    trows = tbody.find_all('tr')

    cols = ['elo_rating', 'playoff_pct', 'division_pct', 'worldseries_pct']

    for row in trows:
        team = row['data-str']
        rating = float(row.find('td', {'class': 'num rating'})['data-val'])
        pcts = [
            float(x['data-val']) for x in row.find_all('td', {'class': 'pct'})
        ]

        row_data = [rating] + pcts
        db_data = {k: v for k, v in zip(cols, row_data)}
        db_data = parse_types(db_data)

        # Clear existing elo document
        tm = convert_name(name=team, how='abbr')
        db.Teams.update({'Tm': tm}, {'$set': {'elo': []}})

        db.Teams.update({'Tm': tm}, {'$push': {'elo': db_data}})
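
Example #5 (and the other baseball-reference/fivethirtyeight scrapers below) expects open_url(url) to return a parsed BeautifulSoup document. A minimal sketch of that flavor, assuming requests and bs4 are available, might be:

# Hypothetical sketch only; the real open_url used by the scraping examples
# is not included on this page.
import requests
from bs4 import BeautifulSoup

def open_url(url):
    """Fetch url and return its HTML parsed as a BeautifulSoup object."""
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')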
Example #6
    def get_file(self, file_number=1, return_filename=False):
        """Gets a file that was pasted in, uploaded, or given by a URL.  If multiple
    files are provided, specify the number of the desired file as file_number.
    Returns None if there is no file.  If return_filename is True, returns a
    tuple: (desired_file, filename)."""
        paste_name = 'pfif_xml_' + str(file_number)
        upload_name = 'pfif_xml_file_' + str(file_number)
        url_name = 'pfif_xml_url_' + str(file_number)
        desired_file = None
        filename = None

        if self.request.POST.get(paste_name):
            desired_file = StringIO(self.request.POST[paste_name])
        elif upload_name in self.request.FILES:
            desired_file = StringIO(self.request.FILES[upload_name].read())
            filename = self.request.FILES[upload_name].name
        elif self.request.POST.get(url_name):
            url = self.request.POST[url_name]
            # make a file-like object out of the URL's xml so we can seek on it
            desired_file = StringIO(utils.open_url(url).read())
            filename = url

        if desired_file is not None:
            if return_filename and filename is not None:
                return (desired_file, filename)
            elif return_filename:
                return (desired_file, None)
            else:
                return desired_file
        else:
            if return_filename:
                return (None, None)
            else:
                return None
Example #7
def get_video_url(driver, clip_url):

    # load the player
    utils.open_url(driver, clip_url)
    utils.wait(5)

    # pause the video
    control_bar = driver.find_element_by_class_name('hidden-bar')
    play_button = driver.find_element_by_id('play-control')
    ActionChains(driver).move_to_element(control_bar).click(
        play_button).perform()

    # get video url
    video_url = driver.find_element_by_tag_name('video').get_attribute('src')

    return video_url
Example #8
def pitching_logs(team, year):
    """
    Scrape pitching logs from
    baseball-reference.com
    """
    team = convert_name(name=team, how='abbr')
    url = "http://www.baseball-reference.com/teams/tgl.cgi?team={}&t=p&year={}".format(
        team, year)

    soup = open_url(url)

    table = soup.find_all('div', {'class': 'table_outer_container'})[-1]

    # Extract column headers
    cols = [x.text for x in table.find_all('th', {'scope': 'col'})]

    # Extract body of pitching logs table
    tbody = table.find('tbody')
    trows = tbody.find_all('tr')

    # Clear existing Pitlog document
    db_array = 'Pitlog.{}'.format(year)
    db.Teams.update({'Tm': team}, {'$set': {db_array: []}})

    # Extract pitching logs and push to database
    for row in trows:
        row_data = [
            x.text for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
        ]
        db_data = {k: v for k, v in zip(cols, row_data)}
        db_data = parse_types(db_data)

        # Insert row into database
        db.Teams.update({'Tm': team}, {'$push': {db_array: db_data}})
Example #9
  def get_file(self, file_number=1, return_filename=False):
    """Gets a file that was pasted in, uploaded, or given by a URL.  If multiple
    files are provided, specify the number of the desired file as file_number.
    Returns None if there is no file.  If return_filename is True, returns a
    tuple: (desired_file, filename)."""
    paste_name = 'pfif_xml_' + str(file_number)
    upload_name = 'pfif_xml_file_' + str(file_number)
    url_name = 'pfif_xml_url_' + str(file_number)
    desired_file = None
    filename = None

    if self.request.POST.get(paste_name):
      desired_file = StringIO(self.request.POST[paste_name])
    elif upload_name in self.request.FILES:
      desired_file = StringIO(self.request.FILES[upload_name].read())
      filename = self.request.FILES[upload_name].name
    elif self.request.POST.get(url_name):
      url = self.request.POST[url_name]
      # make a file-like object out of the URL's xml so we can seek on it
      desired_file = StringIO(utils.open_url(url).read())
      filename = url

    if desired_file is not None:
      if return_filename and filename is not None:
        return (desired_file, filename)
      elif return_filename:
        return (desired_file, None)
      else:
        return desired_file
    else:
      if return_filename:
        return (None, None)
      else:
        return None
Example #10
    def __init__(self, driver, cache_dir, course_id):
        # course_id e.g. embedded-systems-programming

        self.driver = driver
        self.cache_dir = cache_dir

        course_url = host + '/library/courses/' + course_id
        utils.open_url(driver, course_url)
        skip_ads(driver)

        self.meta = CourseMeta(driver, course_url, course_id)

        self.table_of_content = None
        self.transcript = None

        self.description = None
        self.exercise_files = None
Example #11
def load_all_courses(driver, html_name, num_load=None):

    utils.open_url(driver, host + '/library/search')

    # switch to Course tab
    for elem in driver.find_elements_by_xpath('//li[@class="tab-list__item"]'):
        if elem.text == 'Courses':
            elem.click()

    # define target scraping section
    course_section = driver.find_element_by_xpath(
        '//div[@aria-selected="true"]')

    # expected number
    ncourse_expect = int(
        course_section.find_element_by_xpath(
            './/*[@class="l-search__results-page-info"]').text.split()[1])

    nload = 0
    if num_load:
        nload_max = num_load
    else:
        nload_max = (ncourse_expect // 25) + 3

    while nload < nload_max:
        courses = course_section.find_elements_by_xpath(
            './/li[@class="courses-list__item"]')
        ncourses = len(courses)
        utils.print_message('#load={}, ncourses={}'.format(nload, ncourses))

        nload += 1
        buttons = course_section.find_elements_by_xpath(
            './/a[@class="button button--outlined"]')
        if len(buttons) == 0:
            break

        buttons[0].click()
        utils.wait(3)

    # save html
    utils.save_html(driver, html_name)

    course_list = course_section.find_elements_by_xpath(
        './/li[@class="courses-list__item"]')
    utils.print_message('expect {} courses, loaded {}.'.format(
        ncourse_expect, len(course_list)))
Example #12
def get_all_courses_per_option(driver, opt_url, wait_time=5):

    utils.open_url(driver, opt_url, reopen=True, verbose=True)
    course.skip_ads(driver)
    switch_to_courses(driver, 'Courses')
    ncourse = find_number_courses(driver)
    utils.print_message('loading {} courses'.format(ncourse))
    load_all_courses(driver, wait_time=wait_time)
    course_id_list = get_course_ids(driver)

    if ncourse != len(course_id_list):
        utils.print_message(
            '*ERROR*: number of courses mismatch, expected {}, loaded {}'.
            format(ncourse, len(course_id_list)))
        raise ValueError('number of courses mismatch')

    return course_id_list
Example #13
def get_city_house(user_in_nub,city):

    # creation time
    create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
    province_dict = get_province_dict()
    city_code_dict = get_citycode_dict()
    city_name_dict = city_dict()
    # open the database connection
    db = pymysql.connect(host='172.20.206.28', port=3306, user='******', password='******', db='autodata-roomprice',
                         charset='utf8')
    cursor = db.cursor()

    # generate the city URLs
    result_sum = []
    for i in generate_allurl(user_in_nub,city):
        print(i)
        # property names and their detail-page links
        contents, urls = (get_allurl(i))

        # scrape data for each property
        for content in contents:

            results = []
            re_get = content[0]

            # data source
            source = '安居客'
            # property name
            name = content[1]
            detail, longitude, latitude = open_url(re_get)
            city_name = list(city_name_dict.get(city))[0]
            province = list(province_dict.get(city_name,''))[0]
            district = content[2]
            city_code = city_code_dict.get(city_name,'')
            province_code = city_code_dict.get(province,'')
            district_code = get_district_code(city_name,district,city_code_dict)
            for ele in detail:
                try:
                    house_type = ele[0]
                    area = str(ele[1]).replace('m', '')
                    total_price = int(ele[2])*10000
                    count = ele[3]
                    result = [name, longitude, latitude, province, city_name, district, house_type, area, total_price, source, create_time, count]
                    results.append(result)
                except:
                    pass
            print(results)
            # update_house_price_db(db,cursor,results,table = '''house_price_yyh''')
            try:
                average, metre_average, area_average = get_price(results)
                result_sum.append([name, longitude, latitude, province, city_name, district, province_code, city_code, district_code,average, metre_average, area_average, create_time,source])
            except:
                pass
    # update_community_db(db, cursor, result_sum, table='''community_yyh_tmp''')
    print(result_sum)
    # close the database connection
    db.close()
Example #14
def fangraph_splits(year):
    # 5 is for left handed batters, 6 for right handed batters
    for hand in [5, 6]:
        url = """https://www.fangraphs.com/leaderssplits.aspx?splitArr={0}\
               &strgroup=season&statgroup=1&startDate={1}-03-01\
               &endDate={1}-11-01&filter=&position=P&statType=player\
               &autoPt=true&players=&sort=19,-1&pg=0"""\
               .format(hand, year).replace(' ', '')

        soup = open_url(url)

        # Send POST request to get data in csv format
        params = {
            '__EVENTTARGET': 'SplitsLeaderboard$cmdCSV',
            '__EVENTARGUMENT': '',
            'SplitsLeaderboard$dataPlayerId': 'all',
            'SplitsLeaderboard$dataPos': 'P',
            'SplitsLeaderboard$dataSplitArr': '[{}]'.format(hand),
            'SplitsLeaderboard$dataGroup': 'season',
            'SplitsLeaderboard$dataType': '1',
            'SplitsLeaderboard$dataStart': '{}-03-01'.format(year),
            'SplitsLeaderboard$dataEnd': '{}-11-01'.format(year),
            'SplitsLeaderboard$dataSplitTeams': 'false',
            'SplitsLeaderboard$dataFilter': '[]',
            'SplitsLeaderboard$dataAutoPt': 'true',
            'SplitsLeaderboard$dataStatType': 'player',
            'SplitsLeaderboard$dataPlayers': ''
        }

        elems = ['__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION']

        # Find dynamic parameters in the page html
        more_params = [soup.find('input', {'id': elem}) for elem in elems]
        for param in more_params:
            params.update({param['id']: param['value']})

        req = requests.post(url, data=params).text
        df = pd.read_csv(StringIO(req))

        # Push one row at a time into database
        df_data = df.to_dict(orient='index')
        for key in tqdm(df_data.keys()):
            name = df_data[key]['Name']
            season = df_data[key]['Season']
            player_data = {
                k: v
                for k, v in df_data[key].items()
                if k not in ['Name', 'Season']
            }

            handstr = 'vLHH' if hand == 5 else 'vRHH'
            db_path = 'fg.{}.{}'.format(handstr, season)

            db.Players.update({'Name': name}, {'$set': {db_path: player_data}})
Example #15
def build_channels():
    url = utils.open_url('https://www.boxplus.com')
    match = re.compile(
        '<p><a href = "https://www\.boxplus\.com/live-tv-guide\?channel=(.+?)">(.+?)</a></p></td></tr>'
    ).findall(url)
    for url, show in match:
        name = url.replace('-', ' ')
        name = name.title()
        if lines == 'false': name = '[B]%s[/B] - %s' % (name, show)
        else: name = '[B]%s[CR]NOW:[/B] %s' % (name, show)
        utils.add_stream(name, url, 'play', icon % url)
Example #16
    def download_package(self, mirror, package):
        mirror_folder   = self.context.get_mirror_folder(mirror['path'])
        install_path    = self.context.package_list.get_package_file(package)
        install_dir     = os.path.join(mirror_folder, os.path.dirname(install_path))
        install_file    = os.path.join(mirror_folder, install_path)
        # utils.LOG("Install Dir: ", install_dir)
        # utils.LOG("Install File: ", install_file)
        if not os.path.isdir(install_dir):
            os.makedirs(install_dir)

        package_url = mirror['path'] + "/" + install_path
        utils.LOG("Downloading: ", package_url)

        package['progress']['completed_bytes']  = 0
        package['progress']['completed_pct']    = 0
        timebefore = time.time()
        def update_package_progress(chunk, chunk_len, chunk_time = None):
            if chunk_len == 0:
                utils.LOG("  ++++++++++++++  Package %s download complete." % package['name'])
                # and notify the post download queue that we are done...
                self.context.download_queue.task_done()
                self.context.post_download_queue.put({'mirror': mirror,
                                                      'package': package,
                                                      'time': time.time() - timebefore})
            else:
                package['progress']['completed_bytes'] += chunk_len

                package_size = self.context.package_list.get_package_size(package)
                completed_pct = float(package['progress']['completed_bytes']) / float(package_size)

                package['progress']['completed_pct'] = int(100 * completed_pct)
                utils.LOG("=== Package %s - %d out of %d downloaded..." %
                                (package['name'],
                                package['progress']['completed_bytes'],
                                package_size))

        # save url to file
        utils.open_url(package_url, constants.DEFAULT_CHUNK_SIZE,
                 utils.chunk_handler_to_file(install_file),
                 update_package_progress)
Example #17
def _daily_forecast_from_location_info(location_info,
                                       start_date=None,
                                       num_days=6,
                                       metric=False):
    if not start_date:
        start_date = datetime.date.today()

    # NOTE: the order of the query-string parameters seems to matter; so,
    # we can't use a dictionary to hold the params
    params = location_info + [("format", "24 hourly"),
                              ("startDate", start_date.strftime("%Y-%m-%d")),
                              ("numDays", str(num_days)),
                              ("Unit", "m" if metric else "e")]

    FORECAST_BY_DAY_URL = ("http://www.weather.gov/forecasts/xml"
                           "/sample_products/browser_interface"
                           "/ndfdBrowserClientByDay.php")

    resp = utils.open_url(FORECAST_BY_DAY_URL, params)
    tree = utils.parse_xml(resp)

    if tree.getroot().tag == 'error':
        raise exceptions.NOAAException("Unable to retrieve forecast")

    time_layouts = _parse_time_layouts(tree)
    min_temp_tlk, min_temps = _parse_temperatures_for_type(tree, 'minimum')
    max_temp_tlk, max_temps = _parse_temperatures_for_type(tree, 'maximum')
    conditions_tlk, conditions = _parse_conditions(tree)

    # Time layout keys have to match for us to sequence and group by them
    assert (min_temp_tlk == max_temp_tlk == conditions_tlk)

    time_layout_key = min_temp_tlk
    time_layout = time_layouts[time_layout_key]
    dates = [dt.date() for dt, _ in time_layout]

    forecast = []
    for date, min_temp_value, max_temp_value, condition in zip(
            dates, min_temps, max_temps, conditions):

        # If we're missing any data, don't create the data point
        if utils.any_none([min_temp_value, max_temp_value, condition]):
            continue

        temp_unit = 'C' if metric else 'F'
        min_temp = models.Temperature(min_temp_value, unit=temp_unit)
        max_temp = models.Temperature(max_temp_value, unit=temp_unit)
        datapoint = models.ForecastedCondition(date, min_temp, max_temp,
                                               condition)
        forecast.append(datapoint)

    return forecast
Example #18
def schedule(team):
    """
    Scrape yankees schedule with results
    from baseball-reference.com
    """
    name = convert_name(team, how='abbr')
    url = "http://www.baseball-reference.com/teams/{}/2018-schedule-scores.shtml".format(
        name)

    soup = open_url(url)
    table = soup.find('table', {'id': 'team_schedule'})

    # Extract schedule columns
    thead = table.find('thead')
    cols = [
        x.text.replace('\xa0', 'Field').replace('.', '')
        for x in thead.find_all('th')
    ]
    upcoming_cols = cols[:6] + ['Time']

    # Extract schedule data
    tbody = soup.find('tbody')
    trows = tbody.find_all('tr')

    # Throw out rows that are duplicates of column headers
    trows = [x for x in trows if 'Gm#' not in x.text]

    # Clear existing Schedule document
    db.Teams.update({'Tm': name}, {'$set': {'Schedule': []}})

    # Extract schedule data one row at a time
    for row in trows:
        row_data = [
            x.text for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
        ]

        # Past game
        if row_data[2] == 'boxscore':
            game_num = row_data[0]
            db_data = {k: v for k, v in zip(cols, row_data)}

        # Upcoming game
        elif row_data[2] == 'preview':
            row_data = row_data[:7]
            game_num = row_data[0]
            db_data = {k: v for k, v in zip(upcoming_cols, row_data)}

        db_data = parse_types(db_data)

        # Insert row into database
        db.Teams.update({'Tm': name}, {'$push': {'Schedule': db_data}})
Example #19
def _daily_forecast_from_location_info(location_info, start_date=None,
                                       num_days=6, metric=False):
    if not start_date:
        start_date = datetime.date.today()

    # NOTE: the order of the query-string parameters seems to matter; so,
    # we can't use a dictionary to hold the params
    params = location_info + [("format", "24 hourly"),
                              ("startDate", start_date.strftime("%Y-%m-%d")),
                              ("numDays", str(num_days)),
                              ("Unit", "m" if metric else "e")]

    FORECAST_BY_DAY_URL = ("http://www.weather.gov/forecasts/xml"
                           "/sample_products/browser_interface"
                           "/ndfdBrowserClientByDay.php")

    resp = utils.open_url(FORECAST_BY_DAY_URL, params)
    tree = utils.parse_xml(resp)

    if tree.getroot().tag == 'error':
        raise exceptions.NOAAException("Unable to retrieve forecast")

    time_layouts = _parse_time_layouts(tree)
    min_temp_tlk, min_temps = _parse_temperatures_for_type(tree, 'minimum')
    max_temp_tlk, max_temps = _parse_temperatures_for_type(tree, 'maximum')
    conditions_tlk, conditions = _parse_conditions(tree)

    # Time layout keys have to match for us to sequence and group by them
    assert (min_temp_tlk == max_temp_tlk == conditions_tlk)

    time_layout_key = min_temp_tlk
    time_layout = time_layouts[time_layout_key]
    dates = [dt.date() for dt, _ in time_layout]

    forecast = []
    for date, min_temp_value, max_temp_value, condition in zip(
            dates, min_temps, max_temps, conditions):

        # If we're missing any data, don't create the data point
        if utils.any_none([min_temp_value, max_temp_value, condition]):
            continue

        temp_unit = 'C' if metric else 'F'
        min_temp = models.Temperature(min_temp_value, unit=temp_unit)
        max_temp = models.Temperature(max_temp_value, unit=temp_unit)
        datapoint = models.ForecastedCondition(
                date, min_temp, max_temp, condition)
        forecast.append(datapoint)

    return forecast
Example #20
    def get_mirror_contents(self, mirror, refresh = False):
        """
        Gets the contents of (files hosted by) a mirror, refreshing or
        refetching the setup.bz2 file if necessary.
        Note that when this happens its throughput is also recorded.
        """
        mirror_folder = self.get_mirror_folder(mirror)

        # check if setup.ini at the mirror exists
        mirror_setup_ini = os.path.join(mirror_folder, "setup.ini")
        mirror_setup_exists = os.path.isfile(mirror_setup_ini)
        if refresh or (not mirror_setup_exists):
            utils.LOG("Mirror setup exists: ", mirror_setup_exists, refresh)
            utils.LOG("Downloading setup.bz2... ")
            try:
                setup_file_contents, time_taken = utils.open_url(mirror + "/setup.bz2")

                # TODO: Try downloading setup.ini file if setup.bz2 fails

                # open(mirror_folder + "/setup.bz2", "wb").write(setup_file_contents)
                # now decode it
                decoder = codecs.getdecoder("bz2")
                contents, length = decoder(setup_file_contents)
                open(mirror_setup_ini, "w").write(contents)

                # reset the mirror's throughput 
                self.mirror_list.update_mirror(mirror,
                                                bytes_loaded = length,
                                                time_spent = time_taken,
                                                health = int(length / time_taken))
            except:
                self.mirror_list.update_mirror(mirror, health = "Down")
                return None

        # Load setup.ini and return its contents
        pkgs = packages.read_package_contents(mirror_setup_ini)
        self.mirror_list.update_mirror(mirror, num_packages = len(pkgs))

        # update the package contents - ie which packages are in
        # which mirrors etc
        self.package_list.disable_saves()

        for pkg_obj in pkgs:
            pkg_name = pkg_obj['main']['name']
            if not self.package_list.contains(pkg_name):
                self.package_list.add_package(pkg_name, pkg_obj)

        self.package_list.enable_saves()

        return pkgs
Example #21
def get_all_htmls():
    '''Download all htmls of Bible in Chinese.

    Output data will be saved in ../data/chinese_cn/*.html
    '''

    driver = utils.start_driver('phantomjs', verbose=True)
    out_dir = '{}/data/chinese_cn'.format(work_dir)
    os.makedirs(out_dir, exist_ok=True)

    try:
        for i in range(1, 74):
            url = ('http://xiaozhushou.org/index.php/?m=bible&template={}'
                   .format(i))
            utils.open_url(driver, url, verbose=True)
            chpt_url_list = []
            for elem in driver.find_elements_by_xpath(
                    '//ul[@id="chapter_list"]/li/a'):
                chpt_url = elem.get_attribute('href')
                chpt_url_list.append(chpt_url)

            for chpt_url in chpt_url_list:
                book_id = str(i).zfill(3)
                chpt_id = chpt_url.split('=')[-1].zfill(3)
                out_html_name = ('{}/{}_{}_chapter.html'
                                 .format(out_dir, book_id, chpt_id))
                out_audio_name = ('{}/{}_{}_audio.mp3'
                                 .format(out_dir, book_id, chpt_id))
                get_content(driver, chpt_url, out_html_name, out_audio_name)

    except:
        print('*ERROR* something wrong')
        raise

    finally:
        utils.close_driver(driver, verbose=True)
Example #22
def forty_man(team, year):
    """
    Extract 40-man roster from
    baseball-reference.com
    """
    team = convert_name(name=team, how='abbr')
    base = "http://www.baseball-reference.com"
    url = base + "/teams/{}/{}-roster.shtml".format(team, year)
    soup = open_url(url)

    table = soup.find('table', {'id': 'the40man'})

    # Extract column headers and rename blank columns
    thead = table.find('thead')
    cols = [x.text for x in thead.find_all('th')]
    cols[3], cols[4] = 'Country', 'Pos'

    # Extract body of forty-man table
    tbody = table.find('tbody')
    trows = tbody.find_all('tr')

    # Clear existing Fortyman document
    db_array = 'Fortyman.{}'.format(year)
    db.Teams.update({'Tm': team}, {'$set': {db_array: []}})

    # Extract forty-man roster and push to database
    for row in tqdm(trows):
        bid = row.find('a')['href'].split('=')[-1]
        row_data = [
            x.text for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
        ]
        db_data = {k: v for k, v in zip(cols, row_data)}
        db_data.update({'bid': bid})
        db.Teams.update({'Tm': team}, {'$push': {db_array: db_data}})

        # Check if player exists in database
        player = db_data['Name']
        exists = dbc.player_exists(player)
        if not exists:
            try:
                print("Scraping br data for {}".format(player))
                br_player_stats(player, team)
            except:
                print("Unable to scrape br data for {}".format(player))
Example #23
def notify_lag_update(commit):
    global older_version
    global updated_version

    diff_string = utils.open_url(
        "https://github.com/ballerina-platform/ballerina-release/commit/" +
        commit + ".diff").read().decode("utf-8")

    for line in diff_string.splitlines():
        if line.startswith("-"):
            older_version.append(line[1:])
        elif line.startswith("+"):
            updated_version.append(line[1:])

    older_version = older_version[1:]
    updated_version = updated_version[1:]

    remove_statement_changes()

    create_message()
Example #24
def current_injuries(team):
    """
    Extract current injuries table
    from baseball-reference.com
    """
    current_year = datetime.date.today().strftime('%Y')

    team = convert_name(name=team, how='abbr')

    url = "http://www.baseball-reference.com/teams/{}/{}.shtml"\
                                            .format(team, current_year)
    soup = open_url(url)

    # Data is stored in html comment
    comment = soup.find_all(string=lambda text: isinstance(text, Comment))
    comment_html = [x for x in comment if 'Injuries Table' in x][-1].string

    table = BeautifulSoup(comment_html, "html.parser")

    # Extract column headers
    thead = table.find('thead')
    cols = [x.text for x in thead.find_all('th')]

    # Extract body from injuries table
    tbody = table.find('tbody')
    trows = tbody.find_all('tr')

    # Clear existing injuries document
    db.Teams.update({'Tm': team}, {'$set': {'Injuries': []}})

    # Extract injuries table and push to database
    for row in trows:
        row_data = [
            x.text for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
        ]
        db_data = {k: v for k, v in zip(cols, row_data)}
        db_data = parse_types(db_data)
        db.Teams.update({'Tm': team}, {'$push': {'Injuries': db_data}})
Example #25
def geocode_location(location, api_key=None):
    """Use Google to geocode a location string.

    For high-volume traffic, you will need to specify an API-key.
    """
    GEOCODE_URL = "http://maps.google.com/maps/geo"
    params = [("q", location), ("sensor", "false"), ("output", "json")]

    if api_key:
        params += [("key", api_key)]

    resp = utils.open_url(GEOCODE_URL, params)
    data = json.loads(resp.read())

    if data["Status"]["code"] != 200:
        raise exceptions.GeocodeException("Unable to geocode this location")

    best_match = data["Placemark"][0]
    address = best_match["address"]
    lon, lat, _ = best_match["Point"]["coordinates"]

    location = models.Location(lat, lon, address)
    return location
Example #26
def geocode_location(location, api_key=None):
    """Use Google to geocode a location string.

    For high-volume traffic, you will need to specify an API-key.
    """
    GEOCODE_URL = "http://maps.google.com/maps/geo"
    params = [('q', location), ('sensor', 'false'), ('output', 'json')]

    if api_key:
        params += [('key', api_key)]

    resp = utils.open_url(GEOCODE_URL, params)
    data = json.loads(resp.read())

    if data['Status']['code'] != 200:
        raise exceptions.GeocodeException('Unable to geocode this location')

    best_match = data['Placemark'][0]
    address = best_match['address']
    lon, lat, _ = best_match['Point']['coordinates']

    location = models.Location(lat, lon, address)
    return location
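
Examples #17, #19, #25 and #26 use yet another flavor: utils.open_url(url, params) returns a file-like HTTP response that is then read or parsed as XML/JSON. A minimal urllib-based sketch consistent with those calls, again an assumption rather than the actual utils module, is:

# Hypothetical sketch only; the real utils.open_url used by the NOAA and
# geocoding examples is not included on this page.
import urllib.parse
import urllib.request

def open_url(url, params=None):
    """Open url with an optional list of (key, value) query parameters
    and return the file-like HTTP response."""
    if params:
        url = url + '?' + urllib.parse.urlencode(params)
    return urllib.request.urlopen(url)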
Example #27
def standings():
    """
    Scrape MLB standings from baseball-reference.com
    """
    url = "http://www.baseball-reference.com/leagues/MLB-standings.shtml"
    soup = open_url(url)

    # Scrape division identifier data
    div_data, gb_data = {}, {}
    divs = ['E', 'C', 'W']
    for div in divs:
        table = soup.find_all('div', {'id': 'div_standings_{}'.format(div)})

        trows = [x.find_all('tr') for x in table]
        trows_flat = [x for y in trows for x in y]

        teams_html = [x.find('a') for x in trows_flat if x.find('a')]
        team_names = [x['href'].split('/')[2] for x in teams_html]

        gbs = [
            x.find('td', {
                'data-stat': 'games_back'
            }).text for x in trows_flat if x.find('td')
        ]

        div_dict = {k: div for k in team_names}
        div_data.update(div_dict)

        gb_dict = {k: v for k, v in zip(team_names, gbs)}
        gb_data.update(gb_dict)

    # Scrape full league standings
    comment = soup.find_all(string=lambda text: isinstance(text, Comment))
    comment_html = [x for x in comment if '<td' in x][-1].string

    comment_soup = BeautifulSoup(comment_html, "html.parser")

    # Extract table column headers
    thead = comment_soup.find('thead')
    cols = [x.text.replace('.', '') for x in thead.find_all('th')]

    # Extract table body
    tbody = comment_soup.find('tbody')
    trows = tbody.find_all('tr')

    for row in trows:
        row_data = [
            x.text for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
        ]

        # Skip last row (league averages)
        if row_data[0]:
            team = row_data[1]
            db_data = {k: v for k, v in zip(cols, row_data)}

            # Add division and gb information
            division = '{}-{}'.format(db_data['Lg'], div_data[team])
            games_behind = gb_data[team]
            db_data.update({'div': division})
            db_data.update({'gb': games_behind})

            # Store int/float when possible
            db_data = parse_types(db_data)

            # Insert row into database
            db.Teams.update({'Tm': team}, {'$set': db_data}, upsert=True)
Example #28
def fangraphs(state, year):
    """
    Scrape data from fangraphs.com
    """
    tid = 0  # Scrape all teams for now, add individual teams later if needed

    url = """http://www.fangraphs.com/leaders.aspx?pos=all&stats={0}\
             &lg=all&qual=0&type=8&season={1}\
             &month=0&season1={1}\
             &ind=0&team={2}&page=1_1000"""\
             .format(state, year, tid)\
             .replace(' ', '')

    soup = open_url(url)

    # Extract column headers
    thead = soup.find('thead')
    cols = [x.text for x in thead.find_all('th')]

    # Extract stats from table body
    tbody = soup.find_all('tbody')[-1]
    all_rows = tbody.find_all('tr')
    all_row_data = [x.find_all('td') for x in all_rows]

    for row in tqdm(all_row_data):
        row_data = [x.text for x in row]
        player = row_data[1]
        db_data = {k: v for k, v in zip(cols, row_data)}

        # Rename common keys with batting or pitching prefixes
        rank = '{}_rank'.format(state)
        db_data[rank] = db_data.pop('#')

        war = '{}_WAR'.format(state)
        db_data[war] = db_data.pop('WAR')

        games = '{}_G'.format(state)
        db_data[games] = db_data.pop('G')

        # Convert team name to abbreviation
        try:
            db_data['Team'] = convert_name(db_data['Team'])
        except:
            pass  # any need to pull team value from br here?
            # print("(fangraphs) No team listed for {}".format(player))

        # Store type as numeric if possible
        db_data = parse_types(db_data)

        # Insert row into database
        db_path = 'fg.{}.{}'.format(state, year)
        db.Players.update({'Name': player}, {'$set': {
            db_path: db_data
        }},
                          upsert=True)

        # Add current team to top level
        if year == dbc._year:
            db.Players.update({'Name': player},
                              {'$set': {
                                  'Team': db_data['Team']
                              }})
Example #29
def get_stream(url):
    url = utils.open_url('https://www.boxplus.com/live-tv-guide?channel=' +
                         url)
    match = re.compile("src: '(.+?)',").findall(url)
    for url in match:
        utils.play_stream(url)
Example #30
def check_pending_build_checks(index: int):
    module = current_level_modules[index]
    global status_completed_modules
    print("[Info] Checking the status of the timestamped build in module '" +
          module['name'] + "'")
    passing = True
    pending = False
    build_check_found = False  # This is to stop intermittent failures
    repo = github.get_repo(constants.BALLERINA_ORG_NAME + '/' + module['name'])
    pull_request = repo.get_pull(module[MODULE_CREATED_PR].number)
    sha = pull_request.merge_commit_sha

    failed_build_name, failed_build_html = [], []
    if module[MODULE_CONCLUSION] == MODULE_CONCLUSION_BUILD_PENDING:
        for build_check in repo.get_commit(sha=sha).get_check_runs():
            build_check_found = True
            # Ignore codecov checks temporarily due to bug
            if not build_check.name.startswith('codecov'):
                if build_check.status != 'completed':
                    pending = True
                    break
                elif build_check.conclusion == 'success':
                    continue
                else:
                    failed_build_name.append(build_check.name)
                    failed_build_html.append(build_check.html_url)
                    passing = False
        if build_check_found and not pending:
            if passing:
                current_level_modules[index][
                    MODULE_CONCLUSION] = MODULE_CONCLUSION_BUILD_SUCCESS
            else:
                current_level_modules[index][
                    MODULE_STATUS] = MODULE_STATUS_COMPLETED
                current_level_modules[index][
                    MODULE_CONCLUSION] = MODULE_CONCLUSION_BUILD_FAILURE
                module_name = module['name']
                print(
                    "[Error] Dependency bump PR merge build checks have failed for '"
                    + module_name + "'")
                for name, html_url in zip(failed_build_name,
                                          failed_build_html):
                    print("[" + module_name + "] Build check '" + name +
                          "' failed for " + html_url)
                status_completed_modules += 1
    else:
        # Already successful and merged
        current_level_modules[index][
            MODULE_CONCLUSION] = MODULE_CONCLUSION_BUILD_SUCCESS

    if current_level_modules[index][
            MODULE_CONCLUSION] == MODULE_CONCLUSION_BUILD_SUCCESS:
        if current_level_modules[index]['name'] == 'ballerina-distribution':
            current_level_modules[index][
                MODULE_CONCLUSION] = MODULE_CONCLUSION_BUILD_RELEASED
        else:
            try:
                packages_url = 'https://api.github.com/orgs/' + constants.BALLERINA_ORG_NAME + '/packages/maven/' \
                               + module['group_id'] + '.' + module['artifact_id'] + '/versions'
                packages_list_string = utils.open_url(packages_url).read()
                packages_list = json.loads(packages_list_string)
                latest_package = packages_list[0]['name']

                if retrigger_dependency_bump.lower() == 'true':
                    for package in packages_list:
                        sha_of_released_package = package['name'].split(
                            '-')[-1]
                        if sha_of_released_package in sha:
                            latest_package = package['name']
                            break

                current_level_modules[index][
                    MODULE_CONCLUSION] = MODULE_CONCLUSION_BUILD_RELEASED
                current_level_modules[index][
                    MODULE_TIMESTAMPED_VERSION] = latest_package
            except Exception as e:
                print(
                    "[Error] Failed to get latest timestamped version for module '"
                    + module['name'] + "'", e)
                current_level_modules[index][
                    MODULE_STATUS] = MODULE_CONCLUSION_VERSION_CANNOT_BE_IDENTIFIED
        current_level_modules[index][MODULE_STATUS] = MODULE_STATUS_COMPLETED
        status_completed_modules += 1
Example #31
def br_player_stats(name, team):
    brid = dbc.get_player_brid(name, team)
    base = 'https://www.baseball-reference.com/redirect.fcgi?player=1&mlb_ID='
    url = base + brid

    redirect = requests.get(url).url
    soup = open_url(redirect)

    # Extract Standard Batting/Pitching table
    table = soup.find('div', {'class': 'table_outer_container'})

    thead = table.find('thead')
    cols = [x.text for x in thead.find_all('th')]
    pit_or_bat = table.find('caption').text

    tbody = table.find('tbody')
    trows = tbody.find_all('tr')

    # Push to Players collection
    for row in trows:
        if row.find('th', {'data-stat': 'year_ID'}).text:
            row_data = [
                x.text
                for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
            ]
            db_data = {k: v for k, v in zip(cols, row_data)}
            db_data = parse_types(db_data)

            # Skip blank rows and don't collect minor league data
            if not row_data[0] or db_data['Lg'] not in ['AL', 'NL']:
                continue

            db_array = 'br.{}.{}'.format(pit_or_bat, db_data['Year'])
            db.Players.update({'Name': name},
                              {'$set': {
                                  'brID': brid,
                                  db_array: db_data
                              }},
                              upsert=True)

    # Extract Player Value Table - Stored in html comment
    comment = soup.find_all(string=lambda text: isinstance(text, Comment))
    comment_html = [x for x in comment if 'Player Value' in x]

    for c in comment_html:
        table = BeautifulSoup(c.string, "html.parser")

        table_name = table.find('caption').text.replace('--', ' ').split()
        title = '{} {}'.format(table_name[2], table_name[1])

        thead = table.find('thead')
        cols = [x.text for x in thead.find_all('th')]

        tbody = table.find('tbody')
        trows = tbody.find_all('tr')

        for row in trows:
            row_data = [
                x.text
                for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
            ]
            db_data = {k: v for k, v in zip(cols, row_data)}
            db_data = parse_types(db_data)

            # Rename stats to match fg data
            renames = {'BA': 'AVG'}
            for stat in renames.keys():
                db_data[renames[stat]] = db_data[stat]
                db_data.pop(stat)

            # Skip blank rows and don't collect minor league data
            if not row_data[0] or db_data['Lg'] not in ['AL', 'NL']:
                continue

            db_array = 'br.{}.{}'.format(title, db_data['Year'])
            db.Players.update({'brID': brid}, {'$set': {
                db_array: db_data
            }},
                              upsert=True)
Example #32
def load_p(dir_or_url):
    name = dir_or_url + '/params'
    return load_dict(open_url(name))
Example #33
def load_model_params(load_from):
    filename = "trained_params.npz"
    f = open_url(load_from + '/' + filename)
    return numpy.load(f)
Example #34
import csv
import sys
import os
from selenium import webdriver
from utils import pause, load_config, get_table, login, open_url

CONFIG = load_config("configuration.yml")
email_address = CONFIG["CREDENTIALS"]["USERNAME"]
password = CONFIG["CREDENTIALS"]["PASSWORD"]
url = CONFIG["URL"]

driver = webdriver.Chrome()

open_url(driver, url)
login(driver, email_address, password)
open_url(driver, url)
get_table(driver)
Example #35
def boxscores(date, dbc=dbc):
    """
    Extract all boxscores
    """

    # Delete games with postponed status to avoid conflict
    dbc.delete_duplicate_game_docs()

    year = datetime.date.today().strftime('%Y')

    if date == 'all':
        url = 'https://www.baseball-reference.com/leagues/MLB/{}-schedule.shtml'.format(
            year)

        soup = open_url(url)

        # Find links for each game this season
        all_game_urls = [
            x['href'] for x in soup.find_all('a', href=True)
            if x.text == 'Boxscore'
        ]

        dates = dbc.get_missing_array_dates('summary')

        # Format dates to match date in url
        datesf = [x.replace('-', '') for x in dates]

        # Filter games by missing dates
        game_urls = [
            game for game in all_game_urls
            if any(date in game for date in datesf)
        ]
    else:
        y, m, d = date.split('-')

        url = "http://www.baseball-reference.com/boxes/?year={}\
               &month={}\
               &day={}"\
               .format(y,m,d)\
               .replace(' ', '')

        soup = open_url(url)

        game_urls = [
            x.a['href']
            for x in soup.find_all('td', {'class': 'right gamelink'})
        ]

    # Collect boxscore stats on each game
    for game in tqdm(game_urls):
        url = 'http://www.baseball-reference.com' + game
        soup = open_url(url)

        html_date = ''.join(soup.find('h1').text.split(',')[-2:]).strip()
        date = str(datetime.datetime.strptime(html_date, '%B %d %Y').date())

        tdiv = soup.find_all('div', {'itemprop': 'performer'})
        away = tdiv[0].find('a', {'itemprop': 'name'})['href'].split('/')[2]
        home = tdiv[1].find('a', {'itemprop': 'name'})['href'].split('/')[2]

        away = convert_name(away, how='abbr')
        home = convert_name(home, how='abbr')
        teams = (away, home)

        # Find mlb game id and resolve double headers
        url_id = int(game.split('.')[0][-1])
        games = list(dbc.get_team_game_preview(away, date))
        gnums = [
            int(game['preview'][0]['gameData']['game']['gameNumber'])
            for game in games
        ]

        # !!! remove try/except here since postponed games are now removed?
        try:
            idx = gnums.index(url_id) if url_id > 0 else 0
        except:
            idx = 0
        gid = games[idx]['gid']

        # Extract summary stats
        summary = soup.find('table', {'class': 'linescore'})

        thead = summary.find('thead')
        cols = [x.text for x in thead.find_all('th')][1:]
        cols[0] = 'Team'

        tbody = summary.find('tbody')
        trows = tbody.find_all('tr')

        # Push summary stats to database
        db.Games.update({'gid': gid}, {'$set': {'summary': []}})

        for row in trows:
            row_data = [x.text for x in row.find_all('td')][1:]
            db_data = {k: v for k, v in zip(cols, row_data)}
            db.Games.update({'gid': gid}, {'$push': {'summary': db_data}})

        # Extract batting box score
        comment = soup.find_all(string=lambda text: isinstance(text, Comment))
        bat_tables = [x for x in comment if '>Batting</th>' in x]

        for table in zip(teams, bat_tables):
            team = table[0]
            bat = BeautifulSoup(table[1], "html.parser")

            # Extract column headers
            thead = bat.find('thead')
            cols = [x for x in thead.find('tr').text.split('\n') if x]

            # Extract Team Totals
            tfoot = bat.find('tfoot')
            row_data = [
                x.text
                for x in tfoot.find_all(lambda tag: tag.has_attr('data-stat'))
            ]
            db_data = {k: v for k, v in zip(cols, row_data)}
            db_data = parse_types(db_data)
            db_array = '{}.batting'.format(team)
            db.Games.update({'gid': gid}, {'$set': {db_array: [db_data]}})

            # Extract stats on individual batters
            tbody = bat.find('tbody')
            trows = tbody.find_all('tr')
            for row in trows:
                try:
                    player = row.find('a').text
                except:
                    continue
                stats = [
                    x.text for x in row.find_all(
                        lambda tag: tag.has_attr('data-stat'))
                ]
                stats[0] = player
                db_data = {k: v for k, v in zip(cols, stats)}
                db_data = parse_types(db_data)
                db_array = '{}.batting'.format(team)
                db.Games.update({'gid': gid}, {'$push': {db_array: db_data}})

        # Extract pitching box score
        pit_tables = [x for x in comment if '>Pitching</th>' in x][0]
        pit = BeautifulSoup(pit_tables, "html.parser")

        # Extract column headers
        thead = pit.find('thead')
        cols = [x for x in thead.find('tr').text.split('\n') if x]

        # Extract Team Totals
        tfoots = pit.find_all('tfoot')
        for foot in zip(teams, tfoots):
            team = foot[0].replace('.', '')
            row_data = [
                x.text for x in foot[1].find_all(
                    lambda tag: tag.has_attr('data-stat'))
            ]
            db_data = {k: v for k, v in zip(cols, row_data)}
            db_data = parse_types(db_data)
            db_array = '{}.pitching'.format(team)
            db.Games.update({'gid': gid}, {'$set': {db_array: [db_data]}})

        # Extract stats on individual pitchers
        tbodies = pit.find_all('tbody')
        for tbody in zip(teams, tbodies):
            team = tbody[0].replace('.', '')
            trows = tbody[1].find_all('tr')
            for row in trows:
                player = row.find('th').text.split(',')[0]
                stats = [x.text for x in row.find_all('td')]
                stats.insert(0, player)
                db_data = {k: v for k, v in zip(cols, stats)}
                db_data = parse_types(db_data)
                db_array = '{}.pitching'.format(team)
                db.Games.update({'gid': gid}, {'$push': {db_array: db_data}})