Code example #1
def get_article_for_packerswire(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('div', {'class': 'articleBody'}).find('p')
        return _arranged(lines)
    except Exception:
        return ''
Code example #2
def get_index_components(query: str = 'https://stooq.pl/q/i/?s=mwig40'):
    '''
    Return the components (listed companies) of a stock index on the
    Polish Stock Exchange (GPW), e.g. mWIG40.

    The 'query' argument defaults to the mWIG40 page on stooq.pl;
    pointing it at another index (for example WIG20) should work the
    same way.
    '''
    htmlContents = []
    companies = []
    html = get(query)
    soup = Soup(html)

    data_table = soup.find('tbody')

    for i in data_table:
        if i.find('font') == None:
            continue
        htmlContents.append(i.find('font'))

    for element in htmlContents[0]:
        if element.find('a') == None:
            continue

        companies.append(re.findall(">(.*)</a>", str(element.find('a')))[0])

    return companies
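The regex on the serialized <a> tag can usually be avoided, since gazpacho exposes a node's text directly. A minimal sketch of the same lookup using find(..., mode='all') and .text (the helper name is made up, and it assumes the stooq.pl table still lists one company link per row; extra links would need filtering):

from gazpacho import get, Soup

def get_index_components_simple(query: str = 'https://stooq.pl/q/i/?s=mwig40'):
    # Sketch only: collect every anchor inside the index table body and return its text.
    soup = Soup(get(query))
    table = soup.find('tbody', mode='first')
    links = table.find('a', mode='all')
    return [link.text for link in links]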
Code example #3
def get_article_for_dev_to(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('div', {'id': 'article-body'}).find('p')
        return _arranged(lines)
    except Exception:
        return ''
Code example #4
def test_get_headers():
    url = "https://httpbin.org/headers"
    UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:69.0) Gecko/20100101 Firefox/69.0"
    headers = {"User-Agent": UA}
    content = get(url, headers=headers)
    user_agent = json.loads(content)["headers"]["User-Agent"]
    assert user_agent == UA
Code example #5
def get_game_stats(url):
    url += "/stats"
    html = get(url)
    soup = Soup(html)
    rows = soup.find("div", {"class": "BoxScore__statLine"})
    data = [parse_stat_row(row) for row in rows]
    return data
Code example #6
def get_article(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('p')
        return _arranged(lines)
    except Exception:
        return ''
Code example #7
def comprobar_estado(usuario, sl):
    url_user = url + usuario + '/?hl=es'
    try:
        html = get(url_user)
    except:
        return None

    html_s = Soup(html)
    a = html_s.find('meta', attrs={'property': 'og:description'})
    dic = a.attrs
    n = dic['content'].replace('-', ',')
    # fix the thousands separators: turn ',' between digits into '.'
    new_n = n[0]
    for i in range(1, len(n)):
        if i + 1 < len(n) and n[i - 1].isdigit() and n[i] == ',' and n[i + 1].isdigit():
            new_n += '.'
        else:
            new_n += n[i]
    n = new_n
    n = n.split(',')
    data = (x.strip() for x in n)
    # ('431 Followers', '872 Following', '294 Posts', 'See Instagram photos and videos from JP (@juanpedro)')

    seguidores, seguidos, publicaciones, usuario = data
    *_, usuario = usuario.split(' ')
    # '@juanpedro'
    if '(' in usuario:
        usuario = usuario[1:-1]

    if sl:
        nombre, tipo = paser_selenium(url_user)
    else:
        nombre, tipo = 'name?', 'Selenium is not installed, impossible to get the account type...'

    return usuario, nombre, seguidores, seguidos, publicaciones, tipo
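For reference, a hedged usage sketch: comprobar_estado relies on a module-level url (the Instagram base URL) and returns a 6-tuple, or None if the profile page cannot be fetched. The profile name below is made up.

# Hypothetical call; assumes url = 'https://www.instagram.com/' at module level.
resultado = comprobar_estado('juanpedro', sl=False)
if resultado is not None:
    usuario, nombre, seguidores, seguidos, publicaciones, tipo = resultado
    print(usuario, seguidores)  # e.g. '@juanpedro' '431 Followers'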
Code example #8
def test_get_headers():
    url = 'https://httpbin.org/headers'
    UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:69.0) Gecko/20100101 Firefox/69.0'
    headers = {'User-Agent': UA}
    content = get(url, headers=headers)
    user_agent = json.loads(content)['headers']['User-Agent']
    assert user_agent == UA
Code example #9
def get_currencies_rates(url: str) -> dict:
    """Return dictionary with exchange rates"""

    result: Dict[str, dict] = {}

    response = None

    try:
        response = get(url)
    except (HTTPError, URLError) as err:
        logging.exception(err)

    if response:
        soup = Soup(response)
        currencies = [
            cur.find("a") for cur in soup.find("div", {"class": NAME})
        ]
        buy = soup.find("div", {"class": BUY})
        buy_values = [value.find("div", {"class": NUM}).text for value in buy]
        sale = soup.find("div", {"class": SALE})
        sale_values = [
            value.find("div", {
                "class": NUM
            }).text for value in sale
        ]

        for cur, buy_num, sale_num in zip(currencies, buy_values, sale_values):
            cur = cur.text.split()[-1]
            result[cur] = {"buy": float(buy_num), "sale": float(sale_num)}

    return result
Code example #10
File: projections.py Project: rush175/sticky
def scrape_position_cbs(position):
    url = f'https://www.cbssports.com/fantasy/hockey/stats/{position}/2019/season/projections/'
    html = get(url)
    soup = Soup(html)
    table = soup.find('table', {'class': 'TableBase-table'})
    pdf = pd.read_html(str(table))[0]
    return pdf
Code example #11
File: scraper.py Project: rasPanda/GA-Project4
def scrape():
    url = request.args.get('url')
    html = get(url)
    soup = Soup(html)

    name = (soup.find('meta', attrs={'property': "og:title"}, mode='first'))
    if name != None:
        name = name.attrs['content']
    else:
        name = (soup.find('title', mode='first')).text

    description = soup.find('meta',
                            attrs={'property': "og:description"},
                            mode='first')
    if description != None:
        description = description.attrs['content']
    else:
        description = soup.find('meta',
                                attrs={'name': "description"},
                                mode='first')
        if description != None:
            description = description.attrs['content']

    image = soup.find('meta', attrs={'property': "og:image"}, mode='first')
    if image != None:
        image = image.attrs['content']
    else:
        image = soup.find('meta', attrs={'name': "image"}, mode='first')
        if image != None:
            image = image.attrs['content']

    price = soup.find('meta',
                      attrs={'property': "og:price:amount"},
                      mode='first')
    if price != None:
        price = price.attrs['content']
    else:
        price = soup.find('meta', attrs={'name': "price"}, mode='first')
        if price != None:
            price = price.attrs['content']

    vendor = soup.find('meta',
                       attrs={'property': "og:site_name"},
                       mode='first')
    if vendor != None:
        vendor = vendor.attrs['content']
    else:
        vendor = soup.find('meta', attrs={'name': "site_name"}, mode='first')
        if vendor != None:
            vendor = vendor.attrs['content']

    return {
        "name": name if name != None else '',
        "description": description if description != None else '',
        "image": image if image != None else '',
        "price": price if price != None else '',
        "vendor": vendor if vendor != None else '',
        "dest_url": url
    }, 200
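The five look-ups above repeat the same pattern: try the og: meta property, fall back to a name= meta tag, else leave the value empty. A minimal sketch of a helper that captures that pattern with the same gazpacho Soup object (the helper name is made up, not part of the project):

def meta_content(soup, og_property, fallback_name):
    # Try <meta property="og:...">, then <meta name="...">; return '' if neither is present.
    for attrs in ({'property': og_property}, {'name': fallback_name}):
        tag = soup.find('meta', attrs=attrs, mode='first')
        if tag is not None:
            return tag.attrs.get('content', '')
    return ''

# e.g. description = meta_content(soup, 'og:description', 'description')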
Code example #12
def get_article_for_packers(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('div',
                          {'class': 'nfl-c-body-part nfl-c-body-part--text'})
        return _arranged(lines)
    except Exception:
        return ''
Code example #13
def existe_usuario(usuario):
    url_user = url + usuario + '/?hl=es'
    try:
        html = get(url_user)
    except:
        return None

    return html
Code example #14
File: bookbot.py Project: maxhumber/AWS
def fetch_sale():
    url = "https://scrape.world/books"
    html = get(url)
    soup = Soup(html)
    books_raw = soup.find("div", {"class": "book-"})
    books = [parse(book) for book in books_raw]
    on_sale = [name for name, price in books if price == 0.99]
    return "\n".join(on_sale)
Code example #15
File: projections.py Project: rush175/sticky
def scrape_position_numberfire(position):
    url = f'https://www.numberfire.com/nhl/fantasy/yearly-projections/{position}'
    html = get(url)
    soup = Soup(html)
    tables = soup.find('table', {'class': 'projection-table'})
    names = pd.read_html(str(tables[0]))[0]
    data = pd.read_html(str(tables[1]))[0]
    df = pd.concat([names, data], axis=1)
    return df
Code example #16
def scrape():
    from gazpacho import get, Soup
    url = 'https://front.njpwworld.com/search/latest?page=465'
    soup = Soup(get(url))
    movie_areas = soup.find('div', {'class': 'movieArea'})
    links = list(map(lambda m: m.find('a'), movie_areas))
    for link in links:
        movie_id = link[0].attrs['href'].replace('/p/', '')
        url = f'{endpoint_movie}{movie_id}'
        res = requests.post(url)
Code example #17
    def nonLoggingSearch(self, name):
        base_url = 'https://m.facebook.com'
        nameAndSurname = name.split(' ')
        firstName = nameAndSurname[0]
        lastName = nameAndSurname[1]
        url = base_url + '/public/' + firstName + '+' + lastName
        cont = get(url, headers={':authority:': 'www.facebook.com', ':method:': 'GET',
                                 ':scheme:': 'https',
                                 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                                 'accept-encoding': 'gzip, deflate, br',
                                 'accept-language': 'pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7', 'cache-control': 'max-age=0',
                                 'sec-fetch-mode': 'navigate', 'sec-fetch-site': 'same-origin', 'sec-fetch-user': '******',
                                 'upgrade-insecure-requests': '1',
                                 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
                                 'viewport-width': '1536'})
        content = requests.get(url).text
        soup = bs4.BeautifulSoup(content, "html.parser")
        linesWithData = []
        hrefs = []
        links = []
        with open('fbcontent.txt', 'wb') as file:
            file.write(bytes(cont, encoding='utf-8'))
        with open('fbcontent.txt', 'rb') as file:
            for line in file:
                line = str(line, encoding='utf-8')
                if line.find('hidden_elem') != -1 and line.startswith('<div class="hidden_elem">'):
                    linesWithData.append(line)
                elif line.find('href="/') != -1:
                    linesWithData.append(line)

        tags = []
        for elem in linesWithData:
            for i in range(len(elem)):
                if elem[i] == '<':
                    tag = ''
                    tag = tag + elem[i]
                    j = i
                    while elem[j] != '>':
                        tag = tag + elem[j]
                        j = j + 1
                    if elem[j] == '>':
                        tag = tag + elem[j]
                        tags.append(tag)
        for tag in tags:
            if tag.find('href') != -1:
                hrefs.append(tag)
        for href in hrefs:
            split = href.split(' ')
            for part in split:
                if part.find('href') != -1 and part.find('https://') == -1 and part.find('=https') == -1 and (part.lower().find(nameAndSurname[0].lower()) != -1 or part.lower().find(nameAndSurname[1].lower()) != -1):
                    corrected_link = self.getUrlFromHref(part)
                    if corrected_link is not None and corrected_link.endswith('/photos') is False:
                        links.append(base_url + corrected_link)
        links = set(links)
        return links
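The BeautifulSoup object built at the top of the method is never used; the character-by-character tag scan and the temporary fbcontent.txt file could be replaced by it. A minimal sketch under that assumption (not the original author's approach):

import bs4

def extract_profile_links(content, name_and_surname, base_url='https://m.facebook.com'):
    # Sketch: collect candidate profile hrefs straight from the parsed HTML.
    soup = bs4.BeautifulSoup(content, 'html.parser')
    links = set()
    for a in soup.find_all('a', href=True):
        href = a['href']
        if (href.startswith('/') and not href.endswith('/photos')
                and any(part.lower() in href.lower() for part in name_and_surname)):
            links.add(base_url + href)
    return links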
Code example #18
def get_boxscore_urls(date):
    if isinstance(date, pd.Timestamp):
        date = date.strftime("%Y-%m-%d")
    url = f"{base}/nba/events/date/{date}"
    html = get(url)
    soup = Soup(html)
    games = soup.find("div", {
        'class': "Layout__content"
    }).find('a', mode='all')
    urls = [base + game.attrs['href'] for game in games]
    return urls
Code example #19
def get_game_stats(url):
    url += "/stats"
    html = get(url)
    soup = Soup(html)
    rows = soup.find("div", {"class": "BoxScore__statLine"})
    data = []
    for row in rows:
        try:
            data.append(parse_stat_row(row))
        except:
            pass
    return data
Code example #20
File: projections.py Project: rush175/sticky
def scrape_daily_faceoff():
    URL = 'https://www.dailyfaceoff.com/fantasy-hockey-projections/'
    html = get(URL)
    soup = Soup(html)
    df = pd.DataFrame()
    for id in ['igsv', 'igsv-1']:
        table = soup.find(
            'table',
            {'id': f'{id}-1N8XNZpOIb8-6WcOPANqSHRyHBXlwZ6X_1vgGyDbETm4'})
        df = df.append(pd.read_html(str(table))[0])
    df = df.reset_index(drop=True)
    return df
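Note that DataFrame.append was removed in pandas 2.0; on current pandas the same loop can collect the tables and concatenate once. A sketch that keeps the original id values and table id:

frames = []
for id in ['igsv', 'igsv-1']:
    table = soup.find(
        'table',
        {'id': f'{id}-1N8XNZpOIb8-6WcOPANqSHRyHBXlwZ6X_1vgGyDbETm4'})
    frames.append(pd.read_html(str(table))[0])
df = pd.concat(frames).reset_index(drop=True)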
Code example #21
File: projections.py Project: rush175/sticky
def yahoo_draft_rankings():
    URL = 'https://www.fantasypros.com/nhl/adp/overall.php'
    html = get(URL)
    soup = Soup(html)
    df = pd.read_html(str(soup.find('table')))[0]
    df[['first', 'last', 'team']] = df['Player Team'].str.split(' ',
                                                                n=2,
                                                                expand=True)
    df['name'] = df['first'] + ' ' + df['last']
    df.columns = [c.lower() for c in df.columns]
    df = df[['name', 'yahoo']]
    return df
Code example #22
def make_soup(date):
    if not isinstance(date, pd.Timestamp):
        date = pd.Timestamp(date)
    params = {
        "StationID": 31688,
        "Year": date.year,
        "Month": date.month,
        "Day": date.day
    }
    html = get(url, params)
    soup = Soup(html)
    return soup
Code example #23
def get_places(episode):

    url = episode
    html = get(url)
    soup = Soup(html)
    table = soup.find('div', {'class': 'entry-content'})
    links = table.find('a')
    place = [i.text for i in links][:-6]
    places = []
    for i in place:
        if 'map' not in i:
            places.append(i)
    return places
Code example #24
def scrape_script(episode):
    url = 'https://seinfeldscripts.com/' + str(episode)
    html = get(url)
    soup = Soup(html)
    table = soup.find('div', {'id': 'content'})
    script = table.find('p')
    scrip = [i.remove_tags() for i in script]
    lines = same_line(scrip)
    scri = [i.replace('\n', '') for i in lines]
    spaces = [re.sub(' +', ' ', i) for i in scri]
    lines = same_line(spaces)
    bracks = [re.sub(r'\[.*?\]', '', i) for i in lines]
    return bracks
Code example #25
def download(player_id):
    url = f'https://www.hockey-reference.com/players/{player_id[0]}/{player_id}/gamelog/2020'
    html = get(url)
    soup = Soup(html)
    table = soup.find('table', {'id': "gamelog"})
    df = pd.read_html(str(table))[0]
    df.columns = ['_'.join(col) for col in df.columns]
    df['name'] = soup.find('h1').text
    df['player_id'] = player_id
    meta = soup.find('div', {
        'id': 'meta'
    }).find('p', mode='first').remove_tags()
    df['position'] = meta.split(': ')[1].split(' •')[0]
    return df
Code example #26
File: scrape.py Project: strawberrysocialist/DE4DS
def download_player(player_id):
    url = f"https://www.hockey-reference.com/players/{player_id[0]}/{player_id}/gamelog/2020"
    html = get(url)
    soup = Soup(html)
    table = soup.find("table", {"id": "gamelog"})
    df = pd.read_html(str(table))[0]
    df.columns = ["_".join(col) for col in df.columns]
    df["name"] = soup.find("h1").text
    df["player_id"] = player_id
    meta = soup.find("div", {
        "id": "meta"
    }).find("p", mode="first").remove_tags()
    df["position"] = meta.split(": ")[1].split(" •")[0]
    return df
Code example #27
def mick_rijmwoordenboek(word: str, n_words: int):
    url = f"https://rijmwoordenboek.nl/rijm/{word}"
    html = get(url)
    soup = Soup(html)

    results = soup.find("div", {
        "id": "rhymeResultsWords"
    }).html.split("<br />")

    # clean up
    results = [r.replace("\n", "").replace(" ", "") for r in results]

    # filter html and empty strings
    results = [r for r in results if ("<" not in r) and (len(r) > 0)]

    return random.sample(results, min(len(results), n_words))
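A hedged usage sketch (the word is made up; rijmwoordenboek.nl must be reachable):

# Returns at most 5 randomly chosen Dutch rhymes for the given word.
rhymes = mick_rijmwoordenboek("fiets", n_words=5)
print(rhymes)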
Code example #28
File: projections.py Project: rush175/sticky
def capfriendly():
    df = pd.DataFrame()
    for page in range(1, 10 + 1):
        url = f'https://www.capfriendly.com/browse/active/2020/salary&hide=team,clauses,position,handed,expiry-status,caphit,skater-stats,goalie-stats&p={page}'
        html = get(url)
        soup = Soup(html)
        pdf = pd.read_html(str(soup.find('table')))[0]
        df = df.append(pdf)
        time.sleep(0.5)
    df['PLAYER'] = df['PLAYER'].apply(
        lambda x: re.split(r"\d{1}|\d{2}|\d{3}", x)[-1].replace('. ', ''))
    df['SALARY'] = df['SALARY'].apply(
        lambda x: x.replace('$', '').replace(',', ''))
    df['SALARY'] = df['SALARY'].apply(float)
    df.columns = ['name', 'age', 'salary']
    df = df.reset_index(drop=True)
    return df
Code example #29
File: match.py Project: scrambldchannel/pycricinfo
    def json(self) -> dict:
        """
        The JSON file for this match
        """
        if self.json_file:
            with open(self.json_file, "r") as f:
                return json.loads(f.read())
        else:
            try:
                return get(self.json_url)
            except HTTPError as e:
                if e.code == 404:
                    raise PageNotFoundException(
                        e.code,
                        f"Match {self.id} not found. Check that the id is correct.",
                    )
                # urllib.error.HTTPError has no .message attribute; .msg holds the reason text
                raise PyCricinfoException(e.code, e.msg)
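Two behaviours of gazpacho's get are assumed here: it raises urllib.error.HTTPError on non-2xx responses (which the except clause relies on), and for JSON endpoints it appears to return the decoded dict directly, matching the -> dict annotation above. A hedged sketch of that error-handling pattern in isolation (httpbin is used purely as an illustration):

from urllib.error import HTTPError
from gazpacho import get

try:
    data = get("https://httpbin.org/json")
except HTTPError as e:
    print(e.code, e.msg)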
Code example #30
File: scrape.py Project: strawberrysocialist/DE4DS
def download_player_ids():
    players = []
    for letter in tqdm(string.ascii_lowercase):
        if letter == 'x':
            continue
        url = f'https://www.hockey-reference.com/players/{letter}/'
        html = get(url)
        soup = Soup(html)
        strong = soup.find('strong')
        for s in strong:
            try:
                player = s.find('a').attrs['href'].split('.')[0].split('/')[-1]
                players.append(player)
            except:
                pass
        time.sleep(1)
    return players