def StartScraping(self):
    # Log in to the Amazon Associates account
    username = self.txtusername.text()
    password = self.txtpassword.text()
    web = Browser()
    web.go_to('https://affiliate-program.amazon.com/home/promohub/promocodes?ac-ms-src=nav&type=mpc&active_date_range=0')
    web.type(username, into='Username', id='ap_email')
    web.type(password, into='Password', id='ap_password')
    web.click(classname='a-button-inner')
    # Logged in; give the page (and any captcha/2FA prompt) time to settle
    time.sleep(60)
    # end login

    # Read the CSV file: open the input file in read mode and the output file in write mode
    with open('product-data-input.csv', 'r') as read_obj, \
            open('product-data-output.csv', 'w', newline='') as write_obj:
        csv_reader = reader(read_obj)   # csv.reader over the input file object
        csv_writer = writer(write_obj)  # csv.writer over the output file object
        line_count = 0
        # Read each row of the input csv file as a list
        for row in csv_reader:
            if line_count == 0:
                # Header row: append the new column name
                row.append("Full URL")
                csv_writer.writerow(row)
                line_count += 1
            else:
                self.lblcurrent.setText(str(line_count))
                web.go_to(row[4])
                # time.sleep(5)
                content = web.get_page_source()
                soup = BeautifulSoup(content, 'html.parser')
                samples = soup.find_all("a", {"class": "a-size-base-plus a-link-normal titleLink"})
                link = ""
                for tag in samples:
                    # Follow the first product link only
                    link = "https://www.amazon.com" + tag['href']
                    web.go_to(link)
                    break
                time.sleep(5)
                web.click("Text", 'a')
                time.sleep(3)
                web.click("Full Link", 'span')
                content = web.get_page_source()
                soup = BeautifulSoup(content, 'html.parser')
                samples = soup.find_all("textarea", {"class": "amzn-ss-text-fulllink-textarea"})
                for textarea in samples:
                    row.append(textarea.text)
                # Add the updated row to the output file
                csv_writer.writerow(row)
                line_count += 1
def get_profile(user_data, out: str):
    """
    Gets a user profile from eBird.

    Uses webbot to log in to ebird.org and searches a sample checklist
    page for the user's profile address.

    :param user_data: pair of (checklist URL, user name), extracted from the users dataframe
    :param out: path to the output text file
    :return: None
    """
    user_name = user_data[1].split('/')[0].strip()
    checklist_url = user_data[0]
    web = Browser(showWindow=False)
    web.go_to("https://secure.birds.cornell.edu/cassso/login")
    web.type('birds_of_a_feather', into='Username')
    web.type('y&2m#9B3B2NGGzp', into='Password')
    web.press(web.Key.ENTER)
    web.go_to(checklist_url)
    source = web.get_page_source()
    soup = BeautifulSoup(source, 'lxml')
    try:
        url = soup.find('a', {'title': f'Profile page for {user_name}'})['href']
        profile = f"https://ebird.org/{url}"
        with open(out, 'a') as src:
            src.write(f"{checklist_url}_{profile}\n")
    except TypeError:
        # soup.find returned None: the page has no profile link for this user
        with open(out, 'a') as src:
            src.write(f"{checklist_url}_None\n")
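# A minimal usage sketch for get_profile (not part of the original source).
# It assumes the users dataframe has 'checklist_url' and 'user_name' columns;
# both column names and the input file are hypothetical.
import pandas as pd

users = pd.read_csv('users.csv')  # hypothetical input file
for row in users[['checklist_url', 'user_name']].itertuples(index=False):
    get_profile((row.checklist_url, row.user_name), out='profiles.txt')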
def getPage():
    global queries
    global soup
    print("Downloading the page")
    web = Browser()
    web.go_to(url_login)
    web.type(email, into='E-mail')
    web.type(password, into='Password')
    web.click('ACCEDI', classname='submit_access')
    time.sleep(delay)
    web.click('OFFERTE DI LAVORO')
    time.sleep(delay)
    page = web.get_page_source()
    web.close_current_tab()
    soup = BeautifulSoup(str(page), 'html.parser')
    print("Looking for the listings box")
    soup = soup.find("div", id="js-grid-blog-posts")
    print("Filtering the results by region")
    global regioni_cercate
    for reg in regioni_cercate:
        print("Filtering by: " + reg)
        filter(soup, reg)
    print("Finished")
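# The filter(...) helper called above is user-defined and not shown in the
# source (note that the name shadows Python's builtin filter). A minimal
# sketch, under the assumption that each listing in the js-grid-blog-posts
# div is a link whose text mentions the region being searched for:
def filter(listings_soup, region):
    for post in listings_soup.find_all("a"):
        if region.lower() in post.get_text().lower():
            print(post.get('href'))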
def run(self):
    """@brief A thread that reads stats from the LB2120 4G modem"""
    web = Browser()
    web.go_to('http://{}/index.html'.format(self._options.address))
    web.click(id='session_password')
    web.type(self._password)
    #web.type('QtV6Dq4s')
    web.click('Sign In')
    web.click(id='session_password')
    startTime = time()
    while True:
        web.go_to("http://{}/index.html#settings/network/status".format(
            self._options.address))
        content = web.get_page_source()
        now = time()
        elapsedTime = now - startTime
        startTime = now
        self._pollSeconds = elapsedTime
        soup = BeautifulSoup(content, 'html.parser')
        item = soup.body.find('dd', attrs={'class': 'm_wwan_signalStrength_rsrp'})
        self._lock.acquire()
        self._rxp = float(item.string)
        self._uio.debug("4G RXP (dBm): {}".format(self._rxp))
        self._lock.release()
        sleep(LB2120.POLL_DELAY_SECONDS)
def webot(user, passwd):
    web = Browser(False)  # showWindow=False: run without a visible window
    web.go_to("https://midas.unioeste.br/login/#/")
    time.sleep(5)
    web.type(user, id="login-username")
    web.type(passwd, id="login-password")
    web.press(web.Key.ENTER)
    time.sleep(5)
    web.click('Academus')
    time.sleep(5)
    web.click('Matrículas')
    time.sleep(3)
    data = web.get_page_source()
    web.close_current_tab()
    return data
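# A minimal usage sketch for webot (not part of the original source); the
# credential prompting shown here is an assumption.
from getpass import getpass

if __name__ == "__main__":
    html = webot(input("User: "), getpass("Password: "))
    page = BeautifulSoup(html, 'html.parser')  # parse the enrolment page
    print(page.title.string if page.title else "no title found")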
def getJapaTalkCalendar(userName, password, useCache=False):
    if useCache and os.path.exists(CACHED_FILE_FN):
        with io.open(CACHED_FILE_FN, 'r', encoding="utf-8") as fd:
            pageSource = fd.read()
    else:
        web = Browser()
        web.go_to('https://www.japatalk.com/login_form.php')
        web.click(id="wID")
        web.type(userName)
        web.click(id="wPasswd")
        web.type(password)
        web.click(classname="btn-next")
        #web.click(classname="from-cal")
        web.go_to('https://www.japatalk.com/reservation_calendar.php')
        pageSource = web.get_page_source()
        if useCache:
            with io.open(CACHED_FILE_FN, 'w', encoding="utf-8") as fd:
                fd.write(pageSource)
    return pageSource
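# A minimal usage sketch (not part of the original source). CACHED_FILE_FN is
# referenced by the function but its value is not shown in the source, so the
# path below is hypothetical.
CACHED_FILE_FN = 'japatalk_calendar.html'  # hypothetical cache path

source = getJapaTalkCalendar('myUser', 'myPassword', useCache=True)
calendar = BeautifulSoup(source, 'html.parser')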
def download_file_rj(music_or_video, file_type, regex_file, ch_actions):
    context.bot.send_message(chat_id=chat_id, text="Please wait a moment...")
    if res != "inv":
        web = Browser()
        web.go_to(url)
        s = web.get_page_source()
        web.close_current_tab()
        soup = BeautifulSoup(s, 'html.parser')
        # find the mp3 link
        file_name = str(re.findall(fr"{regex_file}", str(soup)))
        file_name = file_name.replace("['", "")
        file_name = file_name.replace("']", "")
        file_url = f"https://host2.rj-mw1.com/{file_type}{file_name}.mp{music_or_video}"
        req = urllib.request.Request(file_url)
        with urllib.request.urlopen(req) as response:
            the_file_url_page = str(response.read())
        if the_file_url_page != "b'Not found'":
            wget.download(file_url, f'{file_name}.mp{music_or_video}')
        else:
            try:
                os.remove(f"{file_name}.mp{music_or_video}")
            except OSError:
                pass
            # fall back to the mirror host
            file_url = f"https://host1.rj-mw1.com/{file_type}{file_name}.mp{music_or_video}"
            wget.download(file_url, f'{file_name}.mp{music_or_video}')
        # fix the caption: replace dashes with spaces
        file_caption = str(file_name).replace("-", " ")
        if str(file_name) == "[]":
            context.bot.send_chat_action(chat_id, ChatAction.TYPING)
            context.bot.send_message(chat_id=chat_id,
                                     text="The link is invalid.\n\nPlease send the link of the song or music video from Radio Javan.")
        else:
            if ch_actions == "music":
                context.bot.send_chat_action(chat_id, ChatAction.UPLOAD_AUDIO)
                context.bot.send_audio(chat_id=chat_id,
                                       audio=open(f"./{file_name}.mp{music_or_video}", "rb"),
                                       caption=f"{file_caption}")
            elif ch_actions == "video":
                context.bot.send_chat_action(chat_id, ChatAction.UPLOAD_VIDEO)
                context.bot.send_video(chat_id=chat_id,
                                       video=open(f"./{file_name}.mp{music_or_video}", "rb"),
                                       caption=f"{file_caption}")
            if os.path.exists(f"{file_name}.mp{music_or_video}"):
                os.remove(f"{file_name}.mp{music_or_video}")
import time
from webbot import Browser

user = input("user ")
passw = input("password ")

web = Browser()
link = "https://lz95.instructure.com/login/canvas"
password_field_id = "pseudonym_session_password"  # renamed: was 'id', which shadows the builtin
web.go_to(link)
web.type(user)                    # the username field has focus on page load
web.click(id=password_field_id)   # move focus to the password field
web.type(passw)
web.press(web.Key.ENTER)

sauce = web.get_page_source()
print(sauce.count("Course card color region"))
l = sauce.split("Course card color region")
n = sauce.count("Course card color region")
links = []
for a in range(n):
    links.append(l[a + 1].split('href="')[1].split('"')[0])
s = "https://lz95.instructure.com/"
print(links)
for link in links:
    site = s + link
    web.go_to(site)
web.quit()
print("finished")
time.sleep(1)
web.go_to(target)
id = 'sc-alm-buy-box-ptc-button-VUZHIFdob2xlIEZvb2Rz'
web.click(id=id, tag='input')
#id = 'a-autoid-0'
#web.click(id=id, tag='input')
pyautogui.click(1007, 257)  # hard-coded screen coordinates
#print(pyautogui.position())
id = 'subsContinueButton'
web.click(id=id, tag='input')
time.sleep(15.0)
html = web.get_page_source()

findSpot = False
while not findSpot:
    #if 'No delivery' in html:
    if not available(html):
        print('Not Available yet')
        pyautogui.click(154, 77)  # hard-coded screen coordinates
        time.sleep(15.0)
        html = web.get_page_source()
    else:
        client = boto3.client('sns')
        msg = "A slot is available, go get it"
        pn = "+1"
        client.publish(PhoneNumber=pn, Message=msg)
        print(msg)
        findSpot = True  # stop polling once a slot is found
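# available() is called above but not defined in this snippet. A minimal
# sketch, under the assumption (suggested by the commented-out check) that
# the page shows a "No delivery" banner while no slot is open:
def available(html):
    return 'No delivery' not in html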
async def update_database():
    my_file = 'classes.csv'
    # Delete any existing file before scraping a fresh copy
    if os.path.exists(my_file):
        os.remove(my_file)
        print(f'An existing {my_file} was found and has been deleted.')
    else:
        print(f'The file {my_file} does not exist. Creating the file now...')

    web = Browser()
    web.go_to('http://aisis.ateneo.edu/')
    web.click('click here')
    mirror = web.get_current_url().split('j_aisis')

    # enter credentials into the site
    ID_NUM = os.getenv('ID_NUM')
    PASSWORD = os.getenv('PASSWORD')
    web.type(ID_NUM, into='userName')
    web.type(PASSWORD, into='password')
    web.click('Sign in')
    # successfully signed into AISIS

    # web.click('CLASS SCHEDULE')
    web.go_to(f'{mirror[0]}j_aisis/J_VCSC.do')

    # html parsing
    page_soup = soup(web.get_page_source(), "html.parser")

    filename = 'subjects.csv'
    f = open(filename, 'w')
    headers = 'subject_code,section,course_title,units,time,room,instructor,max_no,lang,level,free_slots,remarks,s,p\n'
    f.write(headers)

    # grab the department <option> elements from the deptCode dropdown
    departments = page_soup.findAll(
        lambda t: t.name == 'option' and t.parent.attrs.get('name') == 'deptCode'
    )

    subjects = []
    subject_info = []
    i = 1
    for dept in departments:
        web.click('Display Class Schedule')
        page_soup = soup(web.get_page_source(), "html.parser")
        raw_data = page_soup.findAll('td', {'class': 'text02'})  # cells of the subjects table
        j = 0  # column counter
        for data in raw_data:
            if j == 14:
                # a complete 14-column row has been read; start the next one
                j = 0
                subjects.append(subject_info)
                subject_info = []
                f.write('\n')
            f.write(data.text.replace(',', '|') + ',')
            subject_info.append(data.text)
            j += 1
        f.write('\n')
        if i < len(departments):
            web.click(departments[i].text)
            i += 1
    f.close()

    databasefile = 'subjects.csv'
    # rearranges the csv file for more optimal searching
    db = pd.read_csv(databasefile, skip_blank_lines=True,
                     names=['subject_code', 'section', 'course_title', 'units',
                            'time', 'room', 'instructor', 'max_no', 'lang',
                            'level', 'free_slots', 'remarks', 's', 'p'])
    # removes spaces in subject codes
    db['subject_code'] = db['subject_code'].apply(remove_space)
    db.sort_values(by=['subject_code'], ascending=True).to_csv(databasefile, index=False)
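# remove_space() is applied above but not defined in this snippet. A one-line
# sketch of the assumed behaviour (strip whitespace out of a subject code):
def remove_space(code):
    return str(code).replace(' ', '')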
from webbot import Browser
import re

web = Browser()
num = str(16044 // 2)  # integer division: the challenge starts at 8022
url = 'http://www.pythonchallenge.com/pc/def/linkedlist.php?nothing='
web.go_to(url + num)
pattern = re.compile(r'and the next nothing is (\d+)')
while True:
    tmp = web.get_page_source()
    match = pattern.search(tmp)
    if match is None:
        # stop when the page no longer points at a next number
        break
    new = match.group(1)
    web.go_to(url + new)
import smtplib
import requests
import bs4
from webbot import Browser
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

web = Browser()  # open a new window
web.go_to('https://www.tucan.tu-darmstadt.de')  # go to the url
web.type('your username', into='usrname', id='field_user')  # enter username
web.click('NEXT', tag='span')  # tab to the password field
web.type('your password', into='pass', id='field_pass')  # enter password
web.click('Anmelden')  # log in
web.click('Prüfungen')  # go to the Prüfungen tab
web.click('Leistungsspiegel')  # go to the Leistungsspiegel tab
htmlsource = web.get_page_source()  # download the page source

myList = []   # a list to collect elements
cmpList = []  # a list to compare myList against, to check for changes
available = False
with open('tucan.txt') as file:  # read the previously saved .txt file
    data = file.read()

soup = bs4.BeautifulSoup(htmlsource, features="lxml")  # parse the page source
# find all <td> tags; they contain all the information from the Leistungsspiegel
everything = soup.find_all("td")
def extractData(url):
    try:
        print("TIMESTAMP", url[28:-43])
        web = Browser()
        web.go_to(url)
        time.sleep(7)
        data = web.get_page_source()
        soup = BeautifulSoup(data, features="lxml")
        year = soup.find("li", class_="ranking").a.contents
        print("YEAR", year[0][:4])
        table = soup.select("#list-table-body")
        # Scroll until all 30 rows have lazy-loaded, giving up after 8 attempts
        scroll_counter = 8
        while len(table[0].findAll('tr')) < 30:
            web.scrolly(1000)
            time.sleep(2)
            data = web.get_page_source()
            soup = BeautifulSoup(data, features="lxml")
            table = soup.select("#list-table-body")
            scroll_counter -= 1
            if scroll_counter == 0:
                web.quit()
                return []
        web.quit()

        rows = table[0].findAll('tr')
        ret_list = []
        for r in rows:
            cols = r.findAll('td')
            if 'ad' in cols[0].get("class"):
                continue  # skip advertisement rows
            row_data = {}
            row_data['year'] = year[0][:4]
            row_data['timestamp'] = url[28:-43]
            row_data['rank'] = cols[1].contents[0][1:]                   # current rank
            row_data['name'] = cols[2].a.contents[0]                     # team name
            row_data['val'] = clean_numbers(cols[3].contents[0])         # current value
            row_data['oneyear'] = clean_percentage(cols[4].contents[0])  # one-year value change (%)
            row_data['debt'] = clean_percentage(cols[5].contents[0])     # debt value
            row_data['revenue'] = clean_numbers(cols[6].contents[0])     # revenue
            row_data['income'] = clean_numbers(cols[7].contents[0])      # income
            ret_list.append(row_data)
        return ret_list
    except Exception:
        web.quit()
        return []
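# clean_numbers() and clean_percentage() are used above but not defined in
# this snippet. Minimal sketches of the assumed behaviour (strip currency
# symbols, thousands separators, and percent signs from the table cells):
def clean_numbers(text):
    return str(text).replace('$', '').replace(',', '').strip()

def clean_percentage(text):
    return str(text).replace('%', '').strip()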
from bs4 import BeautifulSoup
import requests
from webbot import Browser

debug = True
quoteString = 'XOM'
webLink = 'https://finance.yahoo.com/quote/' + quoteString + '/sustainability'
browser = Browser(debug)
browser.go_to(webLink)
source = browser.get_page_source()
#source = requests.get(webLink).text
soup = BeautifulSoup(source, features="html.parser")
print(soup.prettify())
sus = soup.find_all("div", {"data-reactid": '20'})
print(sus)
from urllib import request
import time
import re
import os
from bs4 import BeautifulSoup, Tag, CData
from webbot import Browser

while True:
    web = Browser()
    web.go_to("https://my.unsw.edu.au/active/studentClassEnrol/courses.xml")
    web.click('Sign on')
    web.type('z5151465', into='USER ID')
    web.type("Sword450869241_", into='Password', id='passwordFieldId')
    web.click('Agree and sign on')
    web.click('Sign on')
    web.click('My Student Profile')
    web.click('Update Your Enrolment')
    web.click('Update Enrolment')
    web.click('Term 2 2019')
    content = web.get_page_source()
    if "Course GSOE9340 is full." in content:
        os.system('say "WOW WOW WOW."')
    web.quit()
    time.sleep(5)
# NOTE: the login portion of this snippet was redacted ('******') in the
# source; only the prompt and the scraping loop below are recoverable.
print('Enter your sti password:')
# ... redacted: the script reads the password and signs in to stinet ...
for link in soup.find_all('a'):
    if "/ICS/Academics/CIS" in link.get('href'):
        coursework_pages.append(link.get('href'))

# first attempt at getting assignments
for page in coursework_pages:
    web.go_to(root_url + page + "Coursework.jnz")
    soup = BeautifulSoup(web.get_page_source(), "html5lib")
    for data in soup.select('.sidebar-link-title a'):
        print(data.text)
    print(soup.find(id='pg0_V__dueNext__rptDueNext_ctl00__hypAssign'))
    print(soup.find(id='pg0_V__dueNext__rptDueNext_ctl01__hypAssign'))

# pyforms web (yes)
# - - - - - - - - - - Set-up - - - - - - - - - -
time_start = time.time()
time_load_search = 3
time_load_article = 2
date_today = dt.date.today()

# data frame & browser
db = pd.DataFrame(columns=["anime_title", "anime_link", "anime_op_list", "anime_ed_list"])
link_base = 'https://myanimelist.net/topanime.php?type=airing'

# - - - - - - - - - - Webbot Navigation - - - - - - - - - -
web = Browser()
web.go_to(link_base)
time.sleep(time_load_search)
content = web.get_page_source()  # pull the page's source code
soup = BeautifulSoup(content, 'html.parser')  # tell soup the content is HTML
# The "tr" tag and "ranking-list" class come from the page itself;
# each match is one ranked anime, which is exactly what we are after.
ranking_lists = soup.find_all("tr", {"class": "ranking-list"})

for anime in ranking_lists:
    # title, link
    anime_title = anime.find("div", {"class": "di-ib"}).getText()
    anime_link = anime.find("div", {"class": "di-ib"}).find("a")["href"]  # href holds the link target
    # - - - anime page
repo = g.get_repo(repository)
prTitle = repo.get_pull(int(prNum))
fullPath = 'https://github.com/' + repository + '/pull/' + prNum
web = Browser()
#web.driver.set_window_position(-10000, 0)
web.go_to('https://github.com')
web.click('Sign in')
web.type(user, into='Username or email address')
web.click('NEXT', tag='span')
web.type(password, into='Password')
web.click('Sign in', tag='span')
print(prNum)
web.go_to(fullPath)
a = web.get_page_source()
b = str(a)
if b.find("Subscribe to our newsletter") != -1:
    # this means it's a 404; theoretically this marker text could change
    notify(message='PR %s does not exist' % prNum)
    sys.exit(0)
while True:
    web.go_to(fullPath)
    a = web.get_page_source()
    b = str(a)
    web.driver.minimize_window()
    if b.find("Some checks haven’t completed yet") != -1:
def run(self):
    """@brief A thread that reads stats from the LB2120 4G modem"""
    web = Browser()
    web.go_to('http://{}/index.html'.format(self._options.address))
    web.click(id='session_password')
    web.type(self._password)
    #web.type('QtV6Dq4s')
    web.click('Sign In')
    web.click(id='session_password')
    startTime = time()
    lastTotalData = -1
    lastDataRX = -1
    lastDataTX = -1
    self.running = True
    while self.running:
        try:
            web.go_to("http://{}/api/model.json?internalapi=1&x=11228".format(
                self._options.address))
            content = web.get_page_source()
            now = time()
            elapsedTime = now - startTime
            startTime = now
            # Remove the html wrapper the browser adds around the JSON response
            jsonContent = content.replace(
                "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head></head><body><pre style=\"word-wrap: break-word; white-space: pre-wrap;\">",
                "")
            jsonContent = jsonContent.replace("</pre></body></html>", "")
            # Convert the json text to a dict
            data = json.loads(jsonContent)
            # Grab the values associated with throughput
            dataRX = int(data['wwan']['dataTransferredRx'])
            dataTX = int(data['wwan']['dataTransferredTx'])
            tempC = float(data['general']['devTemperature'])
            devTempCritical = data['power']['deviceTempCritical']
            if lastDataRX != -1:
                if dataRX < lastDataRX:
                    print("<<<<<<<<<< dataRX: {} < {}".format(dataRX, lastDataRX))
                if dataTX < lastDataTX:
                    print("<<<<<<<<<< dataTX: {} < {}".format(dataTX, lastDataTX))
                deltaDataRX = dataRX - lastDataRX
                deltaDataTX = dataTX - lastDataTX
                downLoadBps = (deltaDataRX / elapsedTime) * 8
                upLoadBps = (deltaDataTX / elapsedTime) * 8
                downLoadMBps = float(downLoadBps) / 1E6
                upLoadMBps = float(upLoadBps) / 1E6
                lb2120Stats = LB2120Stats()
                lb2120Stats.downMbps = downLoadMBps
                lb2120Stats.upMbps = upLoadMBps
                lb2120Stats.tempC = tempC
                lb2120Stats.tempCritical = devTempCritical
                lb2120Stats.sampleTime = datetime.datetime.now()
                self._queue.put(lb2120Stats)
            # Save the last results for use next time around
            lastDataRX = dataRX
            lastDataTX = dataTX
        except Exception:
            lines = traceback.format_exc().split('\n')
            for l in lines:
                self._uio.error(l)
        sleep(self._options.psec)
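# A minimal sketch of a consumer for the stats produced above (not part of
# the original source). It assumes self._queue is a queue.Queue shared with
# the reader thread; the surrounding class is not shown in the source.
import queue

def report_stats(stats_queue):
    while True:
        try:
            stats = stats_queue.get(timeout=5)  # block until a sample arrives
        except queue.Empty:
            continue
        print("{}: down {:.2f} Mbps, up {:.2f} Mbps, {} C".format(
            stats.sampleTime, stats.downMbps, stats.upMbps, stats.tempC))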
import csv

# credentials: these are not real credentials, as this code is published
username = "******"
password = "******"

# log into the admin page
web = Browser()
web.go_to('https://www.zumostraining.co.uk/Login.aspx')
web.type(username)
web.click(id='ContentPlaceHolder1_txtPassword')
web.type(password)
web.press(web.Key.ENTER)
# class="btn btn-primary btn-block top-margin-sm text-left"

# navigate to the titles page, create soup and select values
web.go_to('https://www.zumostraining.co.uk/Zumos_Admin/Titles.aspx')
soup = BeautifulSoup(web.get_page_source(), 'lxml')
values = soup.find_all('input', {'class': 'btn btn-primary btn-block top-margin-sm text-left'})
# print(values)  # test - success

links = ['nothing'] * len(values)
i = 0
print("beginning the loop into every button...")
for value in values:  # loop over every button
    web.click(id=value['id'])
    # Needed so the browser reaches the new page before the source is read.
    # It would be a nice improvement to have webbot do this itself, but there
    # is no such function.
    time.sleep(1)
    rawpage = web.get_page_source()
    # print(rawpage)
    titlePage = BeautifulSoup(rawpage, 'lxml')
def input_url(update: Update, context: CallbackContext):
    while True:
        chat_id = update.message.chat_id
        # input url
        url = update.message.text

        # --------------------
        # work out which kind of Radio Javan link this is
        url_check_regex = re.findall(r"(www\.radiojavan\.com/mp3s/mp3/)", url)
        url_check_regex_app = re.findall(r"(rj\.app/m/)", url)
        url_check_regex_podcast_app = re.findall(r"rj\.app/p/", url)
        url_check_regex_podcast = re.findall(r"www\.radiojavan\.com/podcasts/podcast/", url)
        url_check_regex_video = re.findall(r"www\.radiojavan\.com/videos/video/", url)
        url_check_regex_video_app = re.findall(r"rj\.app/v/", url)
        url_check_regex_playlist = re.findall(r"www\.radiojavan\.com/playlists/playlist/", url)
        url_check_regex_playlist_app = re.findall(r"rj\.app/pm/", url)
        list_url = [
            url_check_regex, url_check_regex_app, url_check_regex_podcast,
            url_check_regex_podcast_app, url_check_regex_video,
            url_check_regex_video_app, url_check_regex_playlist,
            url_check_regex_playlist_app
        ]
        what_is_link_type = ""
        count = 0
        for check_url_link_list in list_url:
            if check_url_link_list:
                what_is_link_type = check_url_link_list
            else:
                count += 1
        res = ""
        if count == len(list_url):  # no pattern matched: not a Radio Javan link
            context.bot.send_chat_action(chat_id, ChatAction.TYPING)
            context.bot.send_message(chat_id=chat_id,
                                     text="The link is invalid.\n\nPlease send the link of the song or music video from Radio Javan.")
            res = "inv"
            break

        # download locations per media type
        files_url = {
            "music": "media/mp3/mp3-256/",
            "podcast": "media/podcast/mp3-192/",
            "video_lq": "media/music_video/lq/",
            "video_hd": "media/music_video/hd/",
            "video_hq": "media/music_video/hq/",
            "video_4k": "media/music_video/4k/"
        }
        regex_music_and_video = {
            "music": r"RJ\.currentMP3Perm = '(.*)';",
            "video": r"RJ\.videoPermlink = '(.*)';"
        }

        def download_file_rj(music_or_video, file_type, regex_file, ch_actions):
            context.bot.send_message(chat_id=chat_id, text="Please wait a moment...")
            if res != "inv":
                web = Browser()
                web.go_to(url)
                s = web.get_page_source()
                web.close_current_tab()
                soup = BeautifulSoup(s, 'html.parser')
                # find the mp3 link
                file_name = str(re.findall(fr"{regex_file}", str(soup)))
                file_name = file_name.replace("['", "")
                file_name = file_name.replace("']", "")
                file_url = f"https://host2.rj-mw1.com/{file_type}{file_name}.mp{music_or_video}"
                req = urllib.request.Request(file_url)
                with urllib.request.urlopen(req) as response:
                    the_file_url_page = str(response.read())
                if the_file_url_page != "b'Not found'":
                    wget.download(file_url, f'{file_name}.mp{music_or_video}')
                else:
                    try:
                        os.remove(f"{file_name}.mp{music_or_video}")
                    except OSError:
                        pass
                    # fall back to the mirror host
                    file_url = f"https://host1.rj-mw1.com/{file_type}{file_name}.mp{music_or_video}"
                    wget.download(file_url, f'{file_name}.mp{music_or_video}')
                # fix the caption: replace dashes with spaces
                file_caption = str(file_name).replace("-", " ")
                if str(file_name) == "[]":
                    context.bot.send_chat_action(chat_id, ChatAction.TYPING)
                    context.bot.send_message(chat_id=chat_id,
                                             text="The link is invalid.\n\nPlease send the link of the song or music video from Radio Javan.")
                else:
                    if ch_actions == "music":
                        context.bot.send_chat_action(chat_id, ChatAction.UPLOAD_AUDIO)
                        context.bot.send_audio(chat_id=chat_id,
                                               audio=open(f"./{file_name}.mp{music_or_video}", "rb"),
                                               caption=f"{file_caption}")
                    elif ch_actions == "video":
                        context.bot.send_chat_action(chat_id, ChatAction.UPLOAD_VIDEO)
                        context.bot.send_video(chat_id=chat_id,
                                               video=open(f"./{file_name}.mp{music_or_video}", "rb"),
                                               caption=f"{file_caption}")
                    if os.path.exists(f"{file_name}.mp{music_or_video}"):
                        os.remove(f"{file_name}.mp{music_or_video}")

        if what_is_link_type == url_check_regex_podcast:
            context.bot.send_message(chat_id=chat_id,
                                     text="Because Telegram limits the file size bots may send, podcasts cannot be delivered...")
        elif what_is_link_type == url_check_regex_podcast_app:
            context.bot.send_message(chat_id=chat_id,
                                     text="Because Telegram limits the file size bots may send, podcasts cannot be delivered...")
        elif what_is_link_type == url_check_regex:
            download_file_rj("3", files_url["music"], regex_music_and_video["music"], "music")
        elif what_is_link_type == url_check_regex_app:
            download_file_rj("3", files_url["music"], regex_music_and_video["music"], "music")
        elif what_is_link_type == url_check_regex_playlist:
            web = Browser()
            web.go_to(url)
            play_list_source_page = web.get_page_source()
            web.close_current_tab()
            soup_playlist = BeautifulSoup(play_list_source_page, "html.parser")
            list_artists_playlist = soup_playlist.findAll("span", {"class": "artist"})
            list_songs_playlist = soup_playlist.findAll("span", {"class": "song"})
            playlist_count = 0
            context.bot.send_message(chat_id=chat_id,
                                     text="🔴 Note 🔴:\n\nSome songs in the playlist may arrive incomplete, or not arrive at all and show the 'invalid link' message. If needed, once the playlist download finishes and the :) message appears, send the link of any missing or broken song from the Radio Javan app or website. 🌷🌹")
            for artists in list_artists_playlist:
                re_artists = re.findall(r"(?=>).*(?=<)", str(artists))
                re_songs = re.findall(r"(?=>).*(?=<)", str(list_songs_playlist[playlist_count]))
                url = f"www.radiojavan.com/mp3s/mp3/{re_artists[0]}-{re_songs[0]}"
                playlist_count += 1
                url = url.replace(" ", "-")
                url = url.replace("['>", "")
                url = url.replace("']", "")
                url = url.replace(">", "")
                download_file_rj("3", files_url["music"], regex_music_and_video["music"], "music")
        elif what_is_link_type == url_check_regex_playlist_app:
            web = Browser()
            web.go_to(url)
            play_list_source_page = web.get_page_source()
            web.close_current_tab()
            soup_playlist = BeautifulSoup(play_list_source_page, "html.parser")
            list_artists_playlist = soup_playlist.findAll("span", {"class": "artist"})
            list_songs_playlist = soup_playlist.findAll("span", {"class": "song"})
            playlist_count = 0
            context.bot.send_message(chat_id=chat_id,
                                     text="🔴 Note 🔴:\n\nSome songs in the playlist may arrive incomplete, or not arrive at all and show the 'invalid link' message. If needed, once the playlist download finishes and the :) message appears, send the link of any missing or broken song from the Radio Javan app or website. 🌷🌹")
            for artists in list_artists_playlist:
                re_artists = re.findall(r"(?=>).*(?=<)", str(artists))
                re_songs = re.findall(r"(?=>).*(?=<)", str(list_songs_playlist[playlist_count]))
                url = f"www.radiojavan.com/mp3s/mp3/{re_artists[0]}-{re_songs[0]}"
                playlist_count += 1
                url = url.replace(" ", "-")
                url = url.replace("['>", "")
                url = url.replace("']", "")
                url = url.replace(">", "")
                download_file_rj("3", files_url["music"], regex_music_and_video["music"], "music")
        elif what_is_link_type == url_check_regex_video_app:
            try:
                context.bot.send_message(chat_id=chat_id,
                                         text="Unfortunately, because Telegram limits the upload size for bots, the music video will only be uploaded in 480p LQ quality.\n\nPlease wait for the music video...")
                download_file_rj("4", files_url["video_lq"], regex_music_and_video["video"], "video")
            except Exception:
                pass
        elif what_is_link_type == url_check_regex_video:
            try:
                context.bot.send_message(chat_id=chat_id,
                                         text="Unfortunately, because Telegram limits the upload size for bots, the music video will only be uploaded in 480p LQ quality.\n\nPlease wait for the music video...")
                download_file_rj("4", files_url["video_lq"], regex_music_and_video["video"], "video")
            except Exception:
                pass
        context.bot.send_message(chat_id=chat_id, text=":)")
        break
def coursework(user):
    root_url = 'https://stinet.southeasttech.edu'
    coursework_pages = []
    coursework = []
    userName = user.get('user')
    password = user.get('password')
    web = Browser()
    web.fullscreen_window()
    web.go_to(root_url)

    # login
    web.type(userName, id="userName")
    web.type(password, id="password")
    web.click(id="siteNavBar_btnLogin")

    # get the html and hand it to the parser
    soup = BeautifulSoup(web.get_page_source(), "html5lib")
    mycourses = soup.find(id='myCourses')
    for link in mycourses.find_all('a'):
        if "/ICS/Academics/" in link.get('href'):
            coursework_pages.append(link.get('href'))

    # first attempt at getting assignments
    for page in coursework_pages:
        className = None
        web.go_to(root_url + page + "Coursework.jnz")
        soup = BeautifulSoup(web.get_page_source(), "html5lib")
        for data in soup.select('.sidebar-link-title a'):
            className = data.text
        assignment1 = try_find(soup, 'pg0_V__dueNext__rptDueNext_ctl00__hypAssign')
        assignment2 = try_find(soup, 'pg0_V__dueNext__rptDueNext_ctl01__hypAssign')
        course = {
            'courseName': className,
            'courseUrl': root_url + page + "Coursework.jnz",
        }
        if assignment1 is not None:
            course['assignment1'] = assignment1.text
            course['assignment1Desc'] = assignment1.get('aria-label')
            course['assignment1Link'] = root_url + assignment1.get('href')
            course['assignment1Progress'] = soup.find(
                id='pg0_V__dueNext__rptDueNext_ctl00__lblInfo').text
        else:
            course['assignment1'] = ''
            course['assignment1Desc'] = ''
            course['assignment1Link'] = ''
            course['assignment1Progress'] = ''
        if assignment2 is not None:
            course['assignment2'] = assignment2.text
            course['assignment2Desc'] = assignment2.get('aria-label')
            course['assignment2Link'] = root_url + assignment2.get('href')
            course['assignment2Progress'] = soup.find(
                id='pg0_V__dueNext__rptDueNext_ctl01__lblInfo').text
        else:
            course['assignment2'] = ''
            course['assignment2Desc'] = ''
            course['assignment2Link'] = ''
            course['assignment2Progress'] = ''
        coursework.append(course.copy())
    web.close_current_tab()  # fixed typo: was close_current_tag()
    return coursework
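# try_find() is used above but not defined in this snippet. A minimal sketch
# of the assumed behaviour (a find-by-id that swallows lookup errors and
# returns None when the element is missing):
def try_find(soup, element_id):
    try:
        return soup.find(id=element_id)
    except AttributeError:
        return None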