Example #1
def parse_home(self):
    session = HTMLSession()
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.mount('https://', HTTPAdapter(max_retries=3))
    print("parsing: " + self.homepage + "/read-the-story/")
    try:
        with session.get(self.homepage + "/read-the-story/", timeout=(5, 10)) as buf:
            chapters = buf.html.find('#chapters', first=True)
            if chapters is None:
                return
            chapter_list = chapters.find('.chapter__box')
            for chapter in chapter_list:
                url = chapter.links.pop()
                # Strip the "Chapter N" prefix to get the bare chapter name.
                name = re.sub(r'Chapter \d*', '', chapter.full_text.strip()).strip()
                index = re.search(r'Chapter \d*', chapter.full_text.strip()).group()
                self.chapters.append(BookChapter(name, index, url))
    except Exception as e:
        print(e)
    finally:
        # Always log completion and release the session, even on the early return.
        print("finish: " + self.homepage + "/read-the-story/")
        session.close()
Example #2
def getNowPlaying():
    session = HTMLSession()
    try:
        r = session.get(
            'http://composer.nprstations.org/widgets/iframe/now.html?station=50e451b6a93e91ee0a00028e'
        )
        r.html.render()
    except Exception:
        print("Error encountered with page render.\n")
        session.close()
        return {'song': "", 'artist': "", 'program': ""}

    s = r.html.find('li.whatson-songTitle', first=True)
    a = r.html.find('li.whatson-songArtist', first=True)
    p = r.html.find('a.whatson-programName', first=True)

    # Fall back to empty strings for any field that is missing from the page.
    song = s.text if s else ""
    artist = a.text if a else ""
    program = p.text if p else ""

    session.close()

    return {'song': song, 'artist': artist, 'program': program}
Example #3
    def _get_html_page(self, url):
        '''_get_html_page: loads the web page that holds the data to scrape.
            args: url (string)
            return: html_page (requests.Response or None, see the Python requests docs)

        (1) Instantiate the object that fetches the HTML code.
        (2) Try up to 3 times to load the page at the given URL, in case of network problems.
        (3) If the load succeeds, keep the page. Otherwise, treat it as missing.
        '''
        # (1)
        html_page = None
        session = HTMLSession()
        i = 0
        # (2)
        while html_page is None and i < 3:
            if i > 0:
                print("\t retrying ...")
            html_page = session.get(url)
            i += 1
            # (3)
            if html_page.status_code == 200:
                html_page.html.render(sleep=self._waiting, keep_page=True, scrolldown=1)
            else:
                html_page = None

        session.close()

        return html_page
Example #4
def scrape_urls(urls):
    relpath = os.path.dirname(__file__)
    outname = os.path.join(relpath, 'data/vodinfos.csv')
    with open(outname, 'w+') as outfile:
        outfile.write('vodID,Streamer,Category,Views,Length\n')  # header
        for link in urls:
            session = HTMLSession()
            jpage = session.get(link)
            jpage.html.render()

            vodid = link.split('/')[-1]
            info = jpage.html.text
            finfo = info.split('\n')
            streamer = finfo[0].split(' ', 1)[0]
            cat, views, length = None, None, None
            for i, line in enumerate(finfo):
                if line.startswith('Category'):
                    cat = finfo[i + 1]
                elif line.startswith('Total Views'):
                    views = finfo[i - 1].replace(',', '')
                elif line.startswith('00:00:00'):
                    length = finfo[i + 1]
                if cat is not None and views is not None and length is not None:
                    break

            output = f'{vodid},{streamer},{cat},{views},{length}\n'
            print(output)
            outfile.write(output)
            session.close()
            time.sleep(10)  # prevents rate limit
Example #5
def get_direct_link(link):
    session = HTMLSession()
    session.headers["Accept-Language"] = "zh-CN"
    # HEAD does not follow redirects by default, so the target URL is exposed
    # in the Location header; fall back to the original link if there is none.
    response = session.head(link)
    url = response.headers.get("Location", link)
    session.close()
    return url
Example #6
def mercari(request):
    driver_set()

    file = finders.find('Fake/log/mercari.csv')
    csv_file = open(file, 'w')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Product', 'Link'])

    session = HTMLSession()
    r = session.get(
        'https://www.mercari.com/search/?keyword=clocky%20alarm%20clock')
    r.html.render(timeout=20)

    damn = ["$39.99", "$45.99"]
    article = r.html.find('div.Grid2__Col-mpt2p4-0')
    for i in article:
        aa = i.find('a', first=True)
        title = aa.attrs['alt']
        link = aa.absolute_links
        price = i.find('span', first=True).text
        if price not in damn:
            for li in link:
                csv_writer.writerow([title, li])

    csv_file.close()
    session.close()

    return render(request, 'Fake/mercari.html')
Example #7
def getParkingData():
    session = HTMLSession()

    url = 'https://parkingapps.ucr.edu/spaces/'

    r = session.get(url)

    r.html.render(sleep=1, keep_page=True, scrolldown=1)

    lots = r.html.find('.col-sm-6')
    parking_lots = []

    for i in range(0, len(lots) - 1):
        # Info for each lot is returned as a list of strings.
        # All info for a single lot is one string with fields separated by newlines,
        # e.g. 'Lot 24-\n3:40pm\nCanyon Crest Drive\nFree Spaces\n405\nOccupancy\n0%\n'
        split_string = lots[i].text.split('\n')
        parking_lot = {
            'Parking Lot': split_string[0].replace('-', ''),
            'Time': split_string[1],
            'Free Spaces': split_string[4],
            'Occupancy': split_string[6],
        }
        print(parking_lot)

        parking_lots.append(parking_lot)

    session.close()

    return parking_lots
Example #8
def getResult(parametro, anno):

    filename = parametro + 'tutte' + anno + '.csv'
    filename = os.path.join(path_result, filename)

    url = "https://www.arpa.veneto.it/bollettini/storico/Mappa_" + anno + "_" + parametro + ".htm?t=RG"
    print(url)
    main_page = request.urlopen(url)
    main_page_html = main_page.read()
    main_page.close()
    soup = BeautifulSoup(main_page_html, 'html.parser')

    # build the links and save the corresponding HTML
    mappa = soup.find(id='STAZIONI')
    links = [link['href'] for link in mappa.find_all('area')]
    # print(links)
    html_list = []
    session = HTMLSession()
    for link in links:
        link_result = url_base + link

        # browse to the data page
        result = session.get(link_result)
        html_list.append(result.html.html)
    # close the session once all pages have been fetched, not inside the loop
    session.close()

    # parse and write the results to file
    final_parsing(html_list, anno, parametro, 'tutte', filename)
Example #9
def alibaba(request):
    driver_set()

    file = finders.find('Fake/log/alibaba.csv')
    csv_file = open(file, 'w')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Product', 'Link'])

    session = HTMLSession()
    r = session.get(
        'https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText=clocky+alarm+clock+on+wheels'
    )
    r.html.render(timeout=20)

    damn = ["$39.99-$45.99", "$39.99", "$45.99"]
    article = r.html.find('div.organic-gallery-offer-inner')
    for i in article:
        haha = i.find('a.elements-title-normal', first=True)
        title = haha.attrs['title']
        link = haha.attrs['href']
        price = i.find('p.elements-offer-price-normal', first=True)
        price = price.attrs['title']
        if price not in damn:
            csv_writer.writerow([title, link])

    csv_file.close()
    session.close()

    return render(request, 'Fake/alibaba.html')
Example #10
def list_events():
    """Get events of summoners wars."""
    s = HTMLSession()
    r = s.get(
        'https://forum.com2us.com/forum/main-forum/summoner-s-war/events-ab')
    s.close()
    return "\n".join([x.text for x in r.html.find("a.topic-title")[:5]])
Example #11
def get_page(url, recursion=0, retry_after=5):
    if recursion > 2:
        raise GetPageException("Get page method failed too many times.")

    try:
        # Creating a new session
        session = HTMLSession()

        # Loading the page into a variable
        page = session.get(url).html
        session.close()

        return page

    except (MaxRetries, ConnectionError):
        # Wait and retry, keeping the caller's retry_after value.
        sleep(retry_after)
        return get_page(url, recursion + 1, retry_after)

    except Exception as err:  # We just skip the page in this iteration and log the error
        with open("simple_get_page_log.txt", "a", encoding="utf-8") as lf:
            print("[%s] Error message: %s" %
                  (dt.now().strftime("%Y-%m-%d, %H:%M:%S"), err),
                  end="\n%s\n" % ("-" * 20),
                  file=lf)
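A minimal usage sketch for get_page above; the URL is hypothetical, and GetPageException is the exception the function raises once all attempts fail.

try:
    page = get_page("https://example.com")
    if page is not None:
        title = page.find("title", first=True)
        print(title.text if title else "no <title> found")
except GetPageException as exc:
    print("Giving up:", exc)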
Example #12
def parsing_tradingview(parsing_asset, parsing_exchange):
    url = "https://www.tradingview.com/symbols/" + parsing_exchange + '-' + parsing_asset
    print(url)

    session = HTMLSession()
    r = session.get(url)
    # Execute the page's JavaScript so the quote value is present in the HTML.
    r.html.render(timeout=30)

    selector = 'span.tv-symbol-header-quote__value.tv-symbol-header-quote__value--large.js-symbol-last'
    price = r.html.find(selector)[0].text
    r.close()
    session.close()

    try:
        price = float(price)
    except ValueError:
        price = 0
    return price
Example #13
def get_dept_in_csv(dept, wordlist, csv_filename, transiscope, trans_wordlist):
    twitter_api = twt.get_twitter_api(twitter_auth)
    session_list = HTMLSession()
    liste_communes = scrp.get_dept(session_list, dept)
    session_list.close()
    for commune in liste_communes:
        get_commune_csv(csv_filename, commune, dept, wordlist, trans_wordlist, transiscope, twitter_api)
Example #14
def scraping_for_amazon_asins(asins: list) -> pd.DataFrame:
    columns = ["timestamp", "asin", "price", "title"]
    rows = []

    s = HTMLSession()

    for asin in asins:
        url = "https://www.amazon.de/dp/" + asin
        print(url)

        r = s.get(url=url)
        r.html.render(sleep=2)

        price = r.html.find("#priceblock_ourprice")[0].text
        print(price)

        title = r.html.find("#productTitle")[0].text
        print(title)

        # collect the new row; the DataFrame is built once at the end
        # (DataFrame.append was removed in pandas 2.0)
        rows.append({
            "timestamp": str(datetime.now()),
            "asin": asin,
            "price": price,
            "title": title
        })

    s.close()

    return pd.DataFrame(rows, columns=columns)
Example #15
    def _login(self):

        from requests_html import HTMLSession

        login_url = r'https://my.rightmove.co.uk/login'
        page_url = r'https://www.rightmove.co.uk/user/shortlist.html'
        email = '*****@*****.**'
        password = '******'
        data = {'email': email, 'password': password, 'keepMeLoggedIn': 'true'}
        headers = {
            'Content-type': 'application/json',
            'Accept': 'application/json'
        }

        session = HTMLSession()
        # Log in by POSTing the credentials (the endpoint is assumed to accept a JSON body),
        # then fetch and render the shortlist page with the authenticated session.
        session.post(url=login_url, json=data, headers=headers)
        r = session.get(url=page_url)
        r.html.render()
        tags = r.html.find('div')

        for t in tags:
            print(t)
        print()
        session.close()
Example #16
def dataExtractor(url):
    keys = [
        'Membership Level', 'Organization', 'First Name', 'Last Name',
        'Address', 'City', 'Province', 'Country', 'Phone', 'Website',
        'Type Of Service', 'Postal Code', 'Description', 'Data_url', 'Email'
    ]
    dictionary = dict.fromkeys(keys)
    session = HTMLSession()

    source = session.get(url)
    source.html.render(sleep=1, timeout=20)
    div_block = source.html.xpath(
        '//*[@class="fieldContainer simpleTextContainer"]')

    #elements = data_block.html.xpath('//*[@class="fieldSubContainer labeledTextContainer"]')

    for div in div_block:
        split_data = div.text.splitlines()
        label = split_data[0].title()
        data = ' '.join([str(elem) for elem in split_data[1:]])

        #print('LABEl:{}'.format(label))
        #print('DATA:{}'.format(data))

        dictionary[label] = data

    dictionary['Data_url'] = url

    session.close()

    return dictionary
Example #17
def get_top_crypto():
    '''Gets the top 100 Cryptocurrencies by Market Cap'''

    session = HTMLSession()

    resp = session.get(
        "https://finance.yahoo.com/cryptocurrencies?offset=0&count=100")

    tables = pd.read_html(resp.html.raw_html)

    df = tables[0].copy()


    df["% Change"] = df["% Change"].map(lambda x: float(x.strip("%").\
                                                          strip("+").\
                                                          replace(",", "")))
    del df["52 Week Range"]
    del df["1 Day Chart"]

    fields_to_change = [x for x in df.columns.tolist() if "Volume" in x \
                        or x == "Market Cap" or x == "Circulating Supply"]

    for field in fields_to_change:

        if type(df[field][0]) == str:
            df[field] = df[field].str.strip("B").map(force_float)
            df[field] = df[field].map(lambda x: x
                                      if type(x) == str else x * 1000000000)

            df[field] = df[field].map(lambda x: x if type(x) == float else
                                      force_float(x.strip("M")) * 1000000)

    session.close()

    return df
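A minimal usage sketch for get_top_crypto; it assumes the function and the force_float helper it calls live in the same module as above, and it prints the column names rather than assuming what Yahoo Finance currently serves.

if __name__ == "__main__":
    top = get_top_crypto()
    # Inspect the scraped table before relying on specific column names.
    print(top.columns.tolist())
    print(top.head())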
Example #18
def _raw_get_daily_info(site):

    session = HTMLSession()

    resp = session.get(site)

    tables = pd.read_html(resp.html.raw_html)

    df = tables[0].copy()

    df.columns = tables[0].columns

    del df["52 Week Range"]

    df["% Change"] = df["% Change"].map(
        lambda x: float(x.strip("%+").replace(",", "")))


    fields_to_change = [x for x in df.columns.tolist() if "Vol" in x \
                        or x == "Market Cap"]

    for field in fields_to_change:

        if type(df[field][0]) == str:
            df[field] = df[field].str.strip("B").map(force_float)
            df[field] = df[field].map(lambda x: x
                                      if type(x) == str else x * 1000000000)

            df[field] = df[field].map(lambda x: x if type(x) == float else
                                      force_float(x.strip("M")) * 1000000)

    session.close()

    return df
Example #19
def pageChangeCheck(df, skipUrl=0):
    """Has the page changed? Returns a tuple of Boolean and r.html object"""
    fetchUrl = f"{baseUrl}{paramUrl}{skipUrl}"
    # skipUrl += 10

    session = HTMLSession()
    r = session.get(fetchUrl)
    logStr = f"Fetch:  {r} - {fetchUrl}"
    logger.info(logStr)
    r.html.render(keep_page=True)
    logStr = f"Render: {r} - {fetchUrl}"
    logger.info(logStr)

    # raise Exception("Halt and Catch Fire")

    session.close()

    # Unused: reads the headline number of readings stored by the site.
    # obj = r.html.find('span.h1.reforma-medium.xs-me-10.dark-blue-txt.ng-binding')
    # countOfReadings = obj[0].text
    obj2 = r.html.find("bdi")
    source = obj2[0].html
    soup = BeautifulSoup(source, "lxml")
    first_date = soup.find("bdi")
    dDate = datetime.strptime(first_date.next, webDateFormat)

    # The page has changed if the first reading's date is not already in the index.
    changed = dDate not in df.index

    return changed, r
Example #20
def getProvinciaResult(parametro, provincia, anno):
    regioniDict = getDictProvince(parametro, provincia, anno)
    stazioni_list = []
    filename = parametro + provincia + anno + '.csv'
    filename = os.path.join(path_result, filename)

    if provincia == 'tutte':
        provincia = ""
        valuesList = regioniDict.values()
        for codice_list in valuesList:
            for codice in codice_list:
                stazioni_list.append(codice)
    else:
        stazioni_list = regioniDict[provincia]
    html_list = []

    # build the links and save the corresponding HTML
    session = HTMLSession()
    for e in stazioni_list:
        link = anno + '/' + e + '_' + anno + '_' + parametro + '.htm'
        link_result = url_base + link

        # browse to the data page
        result = session.get(link_result)
        html_list.append(result.html.html)
    # close the session once all pages have been fetched, not inside the loop
    session.close()

    # parse and write the results to file
    final_parsing(html_list, anno, parametro, provincia, filename)
Example #21
def get_all_images(url):
    """
    Returns all image URLs on a single `url`
    """
    # initialize the session
    session = HTMLSession()
    # make the HTTP request and retrieve response
    response = session.get(url)
    # execute Javascript
    response.html.render()
    # construct the soup parser
    soup = bs(response.html.html, "html.parser")
    urls = []
    for img in tqdm(soup.find_all("img"), "Extracting images"):
        img_url = img.attrs.get("src") or img.attrs.get("data-src")
        if not img_url:
            # if img does not contain src attribute, just skip
            continue
        # make the URL absolute by joining domain with the URL that is just extracted
        img_url = urljoin(url, img_url)
        # remove URLs like '/hsts-pixel.gif?c=3.2.5'
        try:
            pos = img_url.index("?")
            img_url = img_url[:pos]
        except ValueError:
            pass
        # finally, if the url is valid
        if is_valid(img_url):
            urls.append(img_url)
    # close the session to end browser process
    session.close()
    return urls
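A minimal usage sketch that feeds the URLs returned by get_all_images into a plain requests download loop; the target page and output folder are hypothetical, and is_valid is assumed to be defined as in the original script.

import os
import requests

# Hypothetical target page and output directory.
page_url = "https://example.com/gallery"
out_dir = "downloaded_images"
os.makedirs(out_dir, exist_ok=True)

for img_url in get_all_images(page_url):
    # Derive a file name from the URL, falling back to a generic name.
    filename = os.path.join(out_dir, img_url.split("/")[-1] or "image")
    resp = requests.get(img_url, timeout=30)
    with open(filename, "wb") as fh:
        fh.write(resp.content)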
Example #22
def run():
    global pages
    try:
        while True:
            # Pick a random page of the keyspace and check it for funded wallets.
            pageNum = random.randrange(1, 904625697166532776746648320380374280100293470930272690489102837043110636675)
            fullurl = 'https://keys.lol/ethereum/' + str(pageNum)
            session = HTMLSession()
            webUrl = session.get(fullurl)
            webUrl.html.render()
            data = webUrl.html.html
            result = re.findall(r"[+-]?\d+\.\d+", str(data))
            for i in result:
                if str(i) + " eth" in str(data):
                    if float(i) > 0:
                        with open('ValidWalletsETH.txt', 'a') as appendFile:
                            appendFile.write('{} eth\n'.format(str(i)))
                            appendFile.write('{}\n'.format(fullurl))
            pages = pages + 1
            sys.stdout.write("\rPages read: {}".format(str(pages)))
            sys.stdout.flush()
            session.close()
            webUrl.close()
    except Exception:
        # Restart on any error (network hiccups, render failures, ...).
        run()
Example #23
def mangalist():
    options = {'type': 'latest', 'category': 'all', 'state': 'all'}
    url = base + '/manga_list'
    session = HTMLSession()
    data = session.get(url, params=options)
    sel = 'body > div.container > div.main-wrapper > div.leftCol.listCol > div > div.panel_page_number > div.group_page > a.page_blue.page_last'
    list_range = data.html.find(sel, first=True).search('Last({})')[0]
    manga_data = {}
    for item in range(2):
        try:
            data = session.get(url, params=options)
            data.html.render()
        except Exception as e:
            print(e)
            print('Going to sleep')
            time.sleep(5)
        sel = 'body > div.container > div.main-wrapper > div.leftCol.listCol > div'
        manga_list = data.html.find(sel, first=True)
        for manga in manga_list.find('div.list-truyen-item-wrap'):
            detail = manga.find('a', first=True)
            manga_name = detail.attrs['title']
            print('Finding the details of ' + manga_name)
            # store the link and icon under the manga's own entry
            manga_data[manga_name] = {
                'link': detail.attrs['href'],
                'icon': detail.find('img', first=True).attrs['src'],
            }
    session.close()
    scraperwiki.sql.save(manga_data, table_name='manga_data')
Example #24
def _download_house(url_house):
    session = HTMLSession()
    response = session.get(URL_BASE + url_house)
    # Execute the page's JavaScript, then parse the rendered HTML.
    response.html.render()
    soup = BeautifulSoup(response.html.html, features="html.parser")
    session.close()
    return soup
Example #25
def _parse_table(url):
    session = HTMLSession()
    r = session.get(url)

    rows = r.html.find("div[data-test='fin-row']")

    info = [row.text.split("\n") for row in rows]
    clean = [[inner.replace(",", "") for inner in outer] for outer in info]

    indexes = [[ix for ix, elt in enumerate(row) if re.search("[a-z]", elt)] for row in clean]

    fixed = []
    for ix_list, nums in zip(indexes, clean):
        if len(ix_list) == 1:
            fixed.append(nums)
        else:
            actual_ix = ix_list[1:]

            to_add = [nums[actual_ix[i]:actual_ix[i + 1]] for
                      i in range(len(actual_ix) - 1)]

            # for ix in range(len(to_add)):
            #    to_add[ix][0] = nums[0] + "-" + to_add[ix][0]        

            fixed.extend(to_add)

    table = pd.DataFrame(fixed).drop_duplicates().reset_index(drop=True)

    headers = [span.text for span in r.html.find("div[class='D(tbhg)'] span")]

    table.columns = headers

    session.close()

    return table
Example #26
def get_imicrobe_acc_metadata(pacc):
    """
    Function to get list of iMicrobe sample accession numbers from a particular
    project. Takes project accession number `pacc` and returns a list of iMicrobe
    accession numbers.
    """
    # Check accession format
    pacc = pacc.lower()
    if pacc.startswith("p"):
        pacc = pacc[1:]
    elif pacc.startswith("s"):
        return [pacc]
    else:
        raise Exception("iMicrobe accession numbers should be prefixed with 'p' (project) or 's' (sample)")

    # Grab sample info
    session = HTMLSession()
    r = session.get('https://www.imicrobe.us/#/projects/'+pacc)
    r.html.render(sleep = 1)

    sample_list = []
    for l in r.html.element("a"):
        # items() returns the element's (attribute, value) pairs; the first
        # attribute on the sample links holds the "#/samples/..." href.
        i = l.items()
        try:
            if i[0][1].startswith("#/samples/"):
                sample_list.append(i[0][1][10:]) # add sample ID only
        except IndexError:
            continue
    session.close()

    # Format and return sample accession numbers
    return ["s"+ sID for sID in sample_list]
Example #27
class Quoter:
    def __init__(self):
        self.start_session()
        self.loggedin = False

    def start_session(self):
        self.session = HTMLSession()
        self.active = True

    def close_session(self):
        self.session.close()
        self.active = False

    async def part(self, pn: str):
        info = quoter_queue.enqueue(atparts.get_part, self.session, pn,
                                    self.loggedin, True)
        job = Job.fetch(info.id, connection=redis_conn)
        job_status = job.get_status()
        # Poll the job until it finishes, yielding to the event loop between
        # checks (assumes asyncio is imported at module level).
        while job_status != 'finished':
            if job_status == 'failed':
                return None
            await asyncio.sleep(0.1)
            job = Job.fetch(info.id, connection=redis_conn)
            job_status = job.get_status()
        info = job.result

        if info:
            info = info._asdict()
            info['vendor'] = 'Air Tractor'
        else:
            info = {'pn': pn, 'desc': 'Not Found'}
        return info
Example #28
def test_browser_session():
    """ Test that a browser instance is created and properly closed when the session is closed.
        Note: the session.close method needs to be tested together with browser creation,
            since otherwise the test would leave the browser running. """
    session = HTMLSession()
    assert isinstance(session.browser, Browser)
    assert hasattr(session, "loop")
    session.close()
Example #29
def scrap_calendar_page(self):
    print('Getting calendar page')
    session = HTMLSession()
    r = session.get('http://titania.saeima.lv/LIVS13/SaeimaLIVS2_DK.nsf/DK?ReadForm&calendar=1')
    print('Rendering calendar page')
    r.html.render(timeout=80000)  # this call executes the js in the page
    self.scrapped_calendar_page = r
    session.close()

def get_img_link(link):
    session = HTMLSession()
    res = session.get(link)
    res.html.render(timeout=30 * 100, keep_page=True)
    images = res.html.find("img.fotorama__img")
    image_link = images[0].attrs['src']
    session.close()
    return image_link