def mars_Images():
    from splinter import Browser
    from bs4 import BeautifulSoup
    import time
    image_dict = {}
    browser = Browser('chrome', headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    time.sleep(3)
    browser.click_link_by_id("full_image")
    elem = browser.find_link_by_partial_href("PIA")
    image_url = elem['href']
    browser.quit()
    browser2 = Browser('chrome', headless=False)
    url2 = image_url
    browser2.visit(url2) 
    elem = browser2.find_link_by_partial_href("/spaceimages/images")
    featured_image_url = elem['href']
    
    
    image_dict["featured_image_url"] = featured_image_url
    
    return image_dict
Example #2
def scrape(pokemon):
    url = f'https://bulbapedia.bulbagarden.net/wiki/{pokemon}_(Pok%C3%A9mon)'

    #Open browser
    browser = Browser('chrome')
    browser.visit(url)

    #Turn webpage into html
    html = browser.html
    soup = bs(html, 'lxml')

    #Find the picture of the Pokemon and click on it until it's just the .png file
    browser.execute_script("window.scrollTo(0, 400);")
    links_found = browser.find_link_by_partial_href(f'{pokemon}.png').click()
    time.sleep(2)
    browser.execute_script("window.scrollTo(0, 400);")
    image = browser.find_by_id('file').click()
    time.sleep(2)
    pokemon_url = browser.url

    #Store it in a dictionary
    pokemon_image = {'name': pokemon,
                     'url': pokemon_url}
    browser.quit()

    return pokemon_image

    big_ol_pokemon_list = csv['Name']
def scrape(pokemon):
    import pandas as pd
    from bs4 import BeautifulSoup as bs
    import requests
    from splinter import Browser
    import time

    #Latest Headline / Paragraph
    url = f'https://bulbapedia.bulbagarden.net/wiki/{pokemon}_(Pok%C3%A9mon)'

    #Open browser
    browser = Browser('chrome', headless=True)
    browser.visit(url)

    #Turn webpage into html
    html = browser.html
    soup = bs(html, 'lxml')

    #Find the picture of the Pokemon and click on it until it's just the .png file
    links_found = browser.find_link_by_partial_href(f'{pokemon}.png').click()
    image = browser.find_by_id('file').click()
    pokemon_url = browser.url

    #Store it in a dictionary
    pokemon_image = {'URL': pokemon_url}
    browser.quit()

    return pokemon_image
	def find_hemisperes(name):
	    browser = Browser('chrome', **executable_path, headless=False)
	    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
	    browser.visit(url)
	    browser.click_link_by_partial_text(name)
	    links_found = browser.find_link_by_partial_href(name.split()[0].lower())
	    url = links_found['href']
	    dic = {"title": f"{name} Hemisphere", "img_url": url}
	    hemisphere_image_urls.append(dic)
	    browser.quit()
def scrape():
    #setup
    response = {}   
    
    executable_path = {'executable_path': os.path.join("C:/","Users","kling","UNCC Data Analytics","chromedriver.exe")}
    
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument("--test-type") 
    browser = Browser('chrome', **executable_path, headless=False)
    #retrieve text news about mars
    browser.visit(nasa_url)
    time.sleep(5)
    #get first article and follow link
    article_link = browser.find_link_by_partial_href('/news/')[0].click()
    html = browser.html
    soup = bs(html, "html.parser")
    news_title = soup.find("title").text
    news_title = news_title.strip('\n')
    #Get all paragraphs from article, strip tags and add them together into one block of text
    news_p = soup.find_all("p")
    paragraph= news_p[1]
    
    response['title'] = news_title
    response['paragraph'] = paragraph

    #get featured image
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(jpl_url)
    link = driver.find_element_by_partial_link_text("FULL IMAGE").click()
    time.sleep(2)
    images = driver.find_elements_by_class_name('fancybox-image')
    for image in images:
        image_url = image.get_attribute('src')
        print(image_url)

    response['featured_img'] = image_url
    # img=requests.get(image_url)#fetch image
    # with open('featured_image.jpg','wb') as writer:#open for writing in binary mode
    #     writer.write(img.content)#write the image

    #Retrieve weather data
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(twitter_url)
    tweet = driver.find_element_by_css_selector('p.tweet-text').text
    response['weather'] = tweet

    #Get Mars facts
    facts = pd.read_html(facts_url)[0]
    response['facts'] = facts.to_html()

    return(response)
Example #6
def scrape(username, password):
    appartments = []

    browser = Browser('chrome', headless=True)
    login(browser, username, password)

    browser.click_link_by_text('Lgh')
    links = browser.find_link_by_partial_href(
        'https://nya.boplats.se/objekt/1hand/')

    for l in links:
        appartments.append(extract_table_info(browser, l))

    for a in appartments:
        add_details(browser, a)

    return appartments
Example #7
def getRoutes(start, end):
    browser = Browser(driver_name="firefox")
    browser.visit('https://www.hopstop.com/search?xfr=cityscape')
    print(browser.url)
    browser.fill('address1', str(start))
    browser.fill('address2', str(end))
    browser.find_by_name('get_dirs').click()
    print(browser.url)
    if browser.is_text_present('Did you mean?'):
        browser.click_link_by_href("#")
        if browser.is_text_present('Did you mean?'):
            browser.click_link_by_href("#")
    browser.click_link_by_href("#")
    links = browser.find_link_by_partial_href("/station?stid")
    results = []
    for link in links:
        results.append(link.value)
    return results
Example #8
def getRoutes(start, end):
    browser = Browser(driver_name="firefox")
    browser.visit("https://www.hopstop.com/search?xfr=cityscape")
    print(browser.url)
    browser.fill("address1", str(start))
    browser.fill("address2", str(end))
    browser.find_by_name("get_dirs").click()
    print(browser.url)
    if browser.is_text_present("Did you mean?"):
        browser.click_link_by_href("#")
        if browser.is_text_present("Did you mean?"):
            browser.click_link_by_href("#")
    browser.click_link_by_href("#")
    links = browser.find_link_by_partial_href("/station?stid")
    results = []
    for link in links:
        results.append(link.value)
    return results
def JPL_image():

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

    browser.visit(url_jpl)
    #for button in buttons:
    browser.find_link_by_partial_text('FULL IMAGE').click()
    browser.is_element_not_present_by_id('images', wait_time=2)
    browser.find_link_by_partial_text('more info').click()

    link = browser.find_link_by_partial_href('largesize')

    image_url = link.html.split("=")[-1].lstrip('"')
    image_url = image_url.rstrip('">')

    featured_image_url = 'https://www.jpl.nasa.gov' + image_url

    return featured_image_url
def scrape(author):
    print("\nRETRIEVING DATA FOR:", author, "\n")
    print("\nINITIALIZING CRAWLER\n")
    # Visit URL
    browser = Browser(
        driver_name='chrome',
        headless=True)  #headless=False will show the browser navigation
    url = "https://scholar.google.com.br/"
    browser.visit(url)
    browser.fill('q', author)

    # Find and click the 'search' button
    button = browser.find_by_name('btnG')
    time.sleep(1)  # needs to sleep for the button to become active
    button.click()

    # If the profile doesn't exist, stop.
    profile_check = browser.html
    if "feather-72.png" not in profile_check:
        print(
            "\nERROR: PROFILE DOES NOT EXIST. PLEASE CHECK YOUR QUERY OR TYPE ANOTHER NAME.\n"
        )
        return

    # Find and click the first link (if profile exists).
    button = browser.find_link_by_partial_href('citations?user=')
    button.click()
    time.sleep(1)

    # Click the 'Show more' button until it becomes disabled (all papers loaded).
    button = browser.find_by_id('gsc_bpf_more')
    check_button = browser.evaluate_script(
        'document.getElementById("gsc_bpf_more").disabled')
    while check_button == False:
        time.sleep(1)
        check_button = browser.evaluate_script(
            'document.getElementById("gsc_bpf_more").disabled')
        button.click()

    #get html
    soup = BeautifulSoup(browser.html, 'html.parser')
    soup.findAll("td", {"class": "gsc_a_t"})

    print("\nBUILDING PAPERS DICTIONARY.\n")
    papers = []
    table = soup.find("table", id="gsc_a_t")
    for tr in table.find_all('tr')[2:]:
        for td in tr.find_all("td", {"class": "gsc_a_t"}):
            paper = {}
            text = re.sub(
                "[\'\"]", "",
                tr.find("a", {
                    "class": "gsc_a_at"
                }).get_text()).strip()  # avoid SQL syntax errors
            paper['title'] = text
            authors = tr.find("div", {
                "class": "gs_gray"
            }).get_text().split(',')[:5]
            authors = [a for a in authors if a != "..."
                       ]  # in some cases, the 4th author might be ...
            authors = [
                a.strip().upper() for a in authors
            ]  # strip leading spaces from some names and normalize case
            authors = [re.sub("[\'\"]", "", a)
                       for a in authors]  # avoid SQL syntax errors
            paper['authors'] = authors
            papers.append(paper)
    return papers
Example #11
def scrape():
    print("scrape_mars    scrape rtn")

    #tk moved imports to here
    #Imports
    from splinter import Browser
    from bs4 import BeautifulSoup as bs
    import pandas as pd
    import requests
    import time
    import re

    #tk moved to hetk
    #def init_browser():
    executable_path = {"executable_path": "chromedriver.exe"}
    #tk return Browser("chrome", **executable_path, headless=True)
    browser = Browser("chrome", **executable_path, headless=False)

    #tk browser = init_browser()
    mars_data_scrape = {}

    mars_news = 'https://mars.nasa.gov/news/'
    browser.visit(mars_news)
    time.sleep(2)
    html = browser.html
    news_soup = bs(html, 'html.parser')

    #Data Scrape
    print("#Data Scrape")

    news_title = news_soup.find('div', class_='content_title').get_text()
    news_p = news_soup.find('div', class_='article_teaser_body').get_text()
    time.sleep(2)

    mars_data_scrape["data1"] = news_title
    mars_data_scrape["data2"] = news_p

    #Paths
    print("#Paths")

    #executable_path = {"executable_path": "chromedriver"}
    #browser = Browser("chrome", **executable_path, headless=True)
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl_url)

    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(2)
    browser.click_link_by_partial_text('more info')
    time.sleep(2)
    browser.click_link_by_partial_text('.jpg')

    #Soup
    print("#Soup")

    html = browser.html
    jpl_soup = bs(html, 'html.parser')

    featured_img_url = jpl_soup.find('img').get('src')

    mars_data_scrape["image"] = featured_img_url

    #Weather
    print("#Weather")

    weather_url = 'https://twitter.com/marswxreport?lang=en'
    html = requests.get(weather_url)
    beautiful_soup = bs(html.text, 'html.parser')

    #tk mars_weather = weather_soup.find_all(string=re.compile("Sol"),
    #tk class_ = "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")[0].text

    #tk mars_data_scrape["weather"] = mars_weather

    #SpaceFacts
    print("#SpaceFacts")

    mars_facts_url = 'https://space-facts.com/mars/'
    table_df = pd.read_html(mars_facts_url)[0]
    table_df.columns = ["description", "value"]
    table_df = table_df.set_index('description', drop=True)
    mars_data_scrape["table"] = table_df.to_html()

    # In[35]:
    print("#In35")

    #executable_path = {"executable_path": "chromedriver.exe"}
    #browser = Browser("chrome", **executable_path, headless=True)
    hem_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hem_url)

    html = browser.html
    hem_soup = bs(html, 'html.parser')

    #Final
    print("#Final")

    hem_img_urls = []
    hem_dict = {
        'title': [],
        'img_url': [],
    }

    x = hem_soup.find_all('h3')

    for i in x:
        t = i.get_text()
        title = t.strip('Enhanced')
        browser.click_link_by_partial_text(t)
        hem_url = browser.find_link_by_partial_href('download')['href']
        hem_dict = {'title': title, 'img_url': hem_url}
        hem_img_urls.append(hem_dict)
        browser.back()

    mars_data_scrape["hemispheres"] = hem_img_urls

    #tk added print
    print(mars_data_scrape)

    return mars_data_scrape
			element = browser.find_by_name('avc').first
			element.select('4')

			browser.fill('sic', '7759')
			browser.fill('totalSale', '10')

			igArray = ['99','98','95','90','87','85','83','80','77','75','73','70','65','60','40','30']

			for x in range(16):
				element = browser.find_by_name('obligorIgCode').first
				element.select(igArray[x])
				browser.find_by_name('UpdateButton').first.click()
				pdData()
				browser.driver.save_screenshot(typeaName + countryList[conIndex] + '_' + igArray[x] +'.png')

			browser.find_link_by_partial_href('/CNETCORP/cpmScenarios.do').first.click()

		else:
			# create the new scenario
			if cType == 0:
				typeaName = 'Existing_Scotia_Public_'
			elif cType == 1:
				typeaName = 'Existing_Scotia_Private_'
			else:
				typeaName = 'Non_Scotia_Public_'

			browser.find_by_name('addScenario').first.click()
			browser.fill('scName', typeaName + countryList[conIndex])
			browser.type('scEffDate', '\b\b\b\b\b\b\b\b\b\b')
			browser.type('scEffDate', '2014-10-31')
			browser.find_by_name('update').first.click()
def scrape():
    #Set up path for browser
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    #Open URL
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    #Grab latest headline
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    news_title = soup.find('div',
                           class_='image_and_description_container').find(
                               'div', class_='content_title').find('a').text
    news_p = soup.find(class_='article_teaser_body').text

    #Navigate to JPL Mars featuered image
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

    browser.visit(jpl_url)
    browser.find_by_id('full_image').click()

    #Navigate to image page
    browser.find_link_by_partial_text('more info').click()

    #Get to fullsize image
    browser.find_link_by_partial_href('/spaceimages/images').click()

    #Scrape image URL
    featured_image_url = browser.url

    #Get Mars facts with Pandas
    facts_url = 'https://space-facts.com/mars/'

    tables = pd.read_html(facts_url)

    #Slice off other tables
    df = tables[0]

    df = df.rename(columns={0: 'Description', 1: 'Mars'})

    df = df.reset_index(drop=True)

    #Convert to HTML
    facts_table = df.to_html()

    #Get Mars hemisphere pictures
    hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    browser.visit(hemi_url)
    soup = BeautifulSoup(browser.html, 'html.parser')

    img_title = []
    title = soup.find_all('h3')
    for x in title:
        img_title.append(x.text)

    images = []
    counter = 0
    for x in img_title:
        browser.find_by_css('img.thumb')[counter].click()
        images.append(browser.find_by_text('Sample')['href'])
        counter = counter + 1
        browser.back()

    hemisphere_image_urls = []
    counter = 0
    for x in title:
        hemisphere_image_urls.append({
            'title': img_title[counter],
            'img_url': images[counter]
        })
        counter = counter + 1

    browser.quit()

    return {
        'headline': news_title,
        'article_detail': news_p,
        'feat_img': featured_image_url,
        'table': facts_table,
        'hemisphere_imgs': hemisphere_image_urls
    }
Example #14
def get_data():
    # 1 Nasa news *** USING BROWSER = SPLINTER ***
    browser = Browser('chrome')
    url = "https://mars.nasa.gov/news/"
    #visit url
    browser.visit(url)
    # HTML object
    mars_html = browser.html
    # Parse HTML
    soup = bs(mars_html, "html.parser")
    # Collect News Title and Paragraph
    news_title = soup.find("div", class_="content_title").text.strip()
    print(news_title)
    news_paragraph = soup.find('div', class_="article_teaser_body").text
    print(news_paragraph)

    # Close the browser after scraping
    browser.quit()

    #2- JPL Mars Space Images - Featured Image
    browser = Browser('chrome')
    image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    #go to url
    browser.visit(image_url)
    #navigate to link
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(2)
    browser.click_link_by_partial_text('more info')
    image_html = browser.html
    image_soup = bs(image_html, "html.parser")
    image_path = image_soup.find('figure', class_='lede').a['href']
    featured_image_url = "https://www.jpl.nasa.gov/" + image_path
    print(featured_image_url)
    # Close
    browser.quit()

    #3- Mars Weather
    browser = Browser('chrome')
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    weather_soup = bs(html, 'html.parser')
    mars_weather = weather_soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text.strip()
    print(mars_weather)
    # Close
    browser.quit()

    #5-Mars Hemispheres
    #create dictionaries
    hemisphere_img_urls = []
    hemisphere_dicts = {"title": [], "img_url": []}
    # url
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser = Browser("chrome")
    browser.visit(url)
    home_page = browser.html
    #HTML & Parsing
    hemispheres_soup = bs(home_page, "html.parser")
    results = hemispheres_soup.find_all("h3")
    # Use loop
    for result in results:
        title = result.text
        print(title)
        #title
        title = title[:-9]
        print(title)
        browser.click_link_by_partial_text(title)
        img_url = browser.find_link_by_partial_href("download")["href"]
        print(img_url)
        hemisphere_dicts = {"title": title, "img_url": img_url}
        hemisphere_img_urls.append(hemisphere_dicts)
        browser.visit(url)
    # Close
    browser.quit()

    mars_data = {
        "title": title,
        "content": news_p,
        "featured_image_url": featured_image_url,
        "latest_weather": mars_weather,
        "image_data": hemisphere_img_urls,
    }
    existing = mars_collection.find_one()
    if existing:
        mars_data['_id'] = existing['_id']
        mars_collection.save(mars_data)
    else:
        mars_collection.save(mars_data)
    return mars_data
Example #15
def bundle(argv):
	username = ""
	password = ""
	all = False
	overwrite = False
	skip = False
	try:
		opts, args = getopt.getopt(argv, "hu:p:aso", ["username=", "password="])
	except getopt.GetoptError:
		print_help()
		exit(2)
	for opt, arg in opts:
		if opt == "-h":
			print_help()
			exit()
		elif opt == "-a":
			download_all_warning()
			all = True
		elif opt == "-o":
			overwrite = True
		elif opt == "-s":
			skip = True
		elif opt in ("-u", "--username"):
			username = arg
		elif opt in ("-p", "--password"):
			password = arg
	del argv, args, opts
	
	if not username:
		username = input("Username (email): ")
		if not username:
			print("Empty, exiting.")
			exit(2)
	if not password:
		password = input("Password: "******"Empty, exiting.")
			exit(2)
	
	browser = Browser()
	print("Logging in...")
	browser.visit('https://bundleofholding.com/user/login')
	browser.fill('users_email', username)
	browser.fill('password', password)
	browser.find_by_name('submit').click()

	if (len(browser.find_by_css("div.logged-in")) > 0):
	#if browser.is_text_present("Wizard's Cabinet"):
		print("Getting lists...")
		browser.visit('https://bundleofholding.com/download/list')
	else:
		print("Failed to log in.")
		if (input("Quit browser? */n ") != "n"):
			browser.quit()
		exit()

	bListBox = browser.find_by_id('overview')
	bListList = bListBox.find_by_tag('a')
	bundles = []
	for e in bListList:
		bundles.append( (e.value, e['href']) )

	del bListBox, bListList

	bundle_count = len(bundles)
	item = 0
	vault = {}
	for b in bundles:
		item += 1
		print("\tFile list {0} of {1}.".format(item, bundle_count))
		vault[b[0]] = []
		browser.visit(b[1])
		bLinks = browser.find_link_by_partial_href('file_id')
		# todo: get file list with file sizes if possible
		# problem: not all pages have "core-bundle" element; older ones are uglier
		# xpath span/a?
		for e in bLinks:
			vault[b[0]].append( (e.value, e['href']) )
			
	del item, bLinks

	print("\n\n")

# Chose to make command line parameter only to help reinforce FAQ.
#	if not all:
#		download_all_warning()
#		
#		totalfiles = 0
#		for bundle, files in vault.items():
#			totalfiles += len(files)
#
#		if (input("There are {0} bundles with a total of {1} files. Download all? y/* ".format(len(vault), totalfiles) ) == "y"):
#			all = True
#		del totalfiles
				
	rx = re.compile("[^\w _()'-]+")
	cookies = browser.cookies.all()
	currentBundle = 1
	for bundle, files in vault.items():
		length = len(files)
		print("({2}/{3}) {0} has {1} files.".format(bundle, length, currentBundle, bundle_count) )
		# todo: accept input (text file?) of accept list rather than just 'all', or maybe command line for one bundle to fetch
		if (all or input("\tDownload? y/* ") == "y"):
			print("\t...Downloading {0}".format(bundle))
			p = rx.sub("", bundle)
			os.makedirs(p, exist_ok=True)
			currentFile = 1 
			for f in files:
				#fn = rx.sub("", f[0]) # Or, assume remote's fine.
				print("\t\t({0}/{1}) - {2}".format(currentFile, length, f[0], end=""))
				fn = p + "/" + f[0]
				if (os.path.isfile(fn)):
					if (not skip and (overwrite or input("\tExists. Overwrite? y/* ") == "y")):
						print("Overwrite.")
						pass
					else:
						print("Skip.")
						continue
				r = requests.get(f[1], cookies = cookies, stream=True)
				#idiom taken from a stack overflow result
				with open(fn, 'wb') as fd:
					for chunk in r.iter_content(1000000):
						if chunk:
							fd.write(chunk)
							fd.flush()
							print('.', end='')
							sys.stdout.flush()
				print()
				currentFile+=1
		currentBundle+=1
	del rx, cookies
	
	print("\n")
	browser.quit()
	exit()
Example #16
def scrape():
    # browser = init_browser()

    mars_info = {}

    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = f'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    news_title = soup.find("div", class_="content_title").text
    news_date = soup.find("div", class_="list_date").text
    news_p = soup.find("div", class_="article_teaser_body").text

    # Dictionary entry from MARS NEWS
    mars_info['news_paragraph'] = news_p
    mars_info['news_title'] = news_title
    mars_info['news_date'] = news_date

    # Visit the url for JPL Featured Space Image
    url2 = (f"https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
    browser.visit(url2)
    time.sleep(1)
    html2 = browser.html
    soup = BeautifulSoup(html2, 'html.parser')
    image = soup.find("img", class_="thumb")["src"]

    # Make sure to find the image url to the full size `.jpg` image.
    img_jpl = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars" + image
    mars_info['img_jpl'] = img_jpl
    print(img_jpl)

    # visit the mars weather report twitter and scrape the latest tweet
    urlt = (f'https://twitter.com/marswxreport?lang=en')
    browser.visit(urlt)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    mars_weather = soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    # create dictionary entry
    mars_info['mars_weather'] = mars_weather

    # visit space facts and scrap the mars facts table
    url_fact = (f"https://space-facts.com/mars/")
    mars_facts = pd.read_html(url_fact)
    mars_df = mars_facts[0]
    mars_df.columns = ['Description', 'Value']
    mars_df.set_index('Description', inplace=True)
    # dictionary entry
    mars_info['mars_facts'] = mars_df

    # scrape images of Mars' hemispheres from the USGS site
    urlmars = (
        f'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    )
    browser.visit(urlmars)
    time.sleep(1)
    htmlm = browser.html
    soup = BeautifulSoup(htmlm, 'html.parser')
    # loop through and collect entries
    img_urls = []
    img_dict = {
        'Title': [],
        'Image URL': [],
    }

    results = soup.find_all('h3')
    for r in results:
        text = r.get_text()
        title = text.strip('Enhanced')
        browser.click_link_by_partial_text(text)
        img_url = browser.find_link_by_partial_href('download')['href']
        img_dict = {'title': title, 'img_url': img_url}
        img_urls.append(img_dict)
        browser.back()
    # create dictionary entries
    mars_info['img_dict'] = img_urls
    print(img_urls)
    return mars_info
Example #17
def scrape():
    # In[3]:
   

    #1.1 MARS NEWS------------------------------
    # get latest news from nasa mars exploration page at https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest
    mars_news_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

    # set up a Browser to get access to js stuff
    executable_path = {"executable_path": "/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)


    # In[4]:


    # visit the website
    browser.visit(mars_news_url)


    # In[5]:


    nasa_news = browser.html
    soup_nasa_news = bs(nasa_news, 'html.parser')
    nasa_news_title = soup_nasa_news.find('div', class_='content_title').text.strip()
    #nasa_news_teaser = soup_nasa_news.find('div', class_="artlce_teaser_body").text.strip()
    nasa_news_teaser = soup_nasa_news.find('div', class_='article_teaser_body').text
    # .find('li', class_='slide').find('div', class_='list_text')

    # print(nasa_news_title)
    # print(nasa_news_teaser)


    # In[6]:


    # 1.2 JPL Mars space images
    # Visit the url for JPL Featured Space Image https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars.
    # Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string to a variable called featured_image_url.
    # Make sure to find the image url to the full size .jpg image.
    # Make sure to save a complete url string for this image.
    nasa_image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(nasa_image_url)


    # In[7]:


    button = browser.find_by_id('full_image')
    button.click()


    # In[8]:


    button1 = browser.find_by_text('more info     ')
    button1.click()


    # In[9]:


    featured_image_url = browser.find_link_by_partial_href('spaceimages/images')
    #jpl_image = browser.html
    #soup_jpl_image = bs(jpl_image, 'html.parser')
    #soup_jpl_image
    featured_image_url = featured_image_url['href']


    # In[10]:


    # Mars Weather
    # Visit the Mars Weather twitter account https://twitter.com/marswxreport?lang=en and scrape the latest Mars weather tweet from the page. 
    # Save the tweet text for the weather report as a variable called mars_weather.
    mars_weather_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(mars_weather_url)


    # In[14]:


    html = browser.html
    parsed_tweet = bs(html, 'html.parser')
    mars_weather = parsed_tweet.find('p', class_='tweet-text').text
    # print(mars_weather)


    # In[ ]:





    # In[15]:


    # Mars Facts
    # Visit the Mars Facts webpage https://space-facts.com/mars/ and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
    # Use Pandas to convert the data to a HTML table string.
    mars_facts_url = 'https://space-facts.com/mars/'
    browser.visit(mars_facts_url)


    # In[17]:


    mars_df = pd.read_html(mars_facts_url)
    # print(mars_df)


    # In[18]:


    mars_df[1]


    # In[19]:


    mars_facts_df = mars_df[1]
    mars_facts_df = mars_facts_df.to_html()
    mars_facts_df


    # In[35]:


    #Mars Hemispheres
    # Visit the USGS Astrogeology site https://space-facts.com/mars/ to obtain high resolution images for each of Mar's hemispheres.
    # You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
    # Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys img_url and title.
    # Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.
    base_hem_html = 'https://astrogeology.usgs.gov/' # used later
    mars_hem_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_hem_url)


    # In[36]:


    html = browser.html
    hemisphere_parsed = bs(html,"html.parser")


    # In[37]:


    browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
    #wait
    # i feel like there should be a "wait" command or something
    time.sleep(1)
    html = browser.html
    page_parsed = bs(html, 'html.parser')


    # In[40]:


    cerberus_image = page_parsed.find('img', class_='wide-image').get('src')
    cerberus_img_html = base_hem_html + cerberus_image
    cerberus_title = page_parsed.find('h2', class_='title').text
    # print(cerberus_img_html)
    # print(cerberus_title)


    # In[45]:


    # rinse-repeat Schiaparelli
    browser.visit(mars_hem_url)
    time.sleep(1)
    html = browser.html
    hemisphere_parsed = bs(html,"html.parser")


    # In[46]:


    browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced')
    time.sleep(1)
    html = browser.html
    page_parsed = bs(html, 'html.parser')


    # In[47]:


    schiaparelli_image = page_parsed.find('img', class_='wide-image').get('src')

    schiaparelli_img_html = base_hem_html + schiaparelli_image
    schiaparelli_title = page_parsed.find('h2', class_='title').text
    # print(schiaparelli_img_html)
    # print(schiaparelli_title)


    # In[48]:


    # rinse-repeat Syrtis
    browser.visit(mars_hem_url)
    time.sleep(1)
    html = browser.html
    hemisphere_parsed = bs(html,"html.parser")


    # In[50]:


    browser.click_link_by_partial_text('Syrtis Major Hemisphere Enhanced')
    time.sleep(1)
    html = browser.html
    page_parsed = bs(html, 'html.parser')


    # In[51]:


    syrtis_image = page_parsed.find('img', class_='wide-image').get('src')

    syrtis_img_html = base_hem_html + syrtis_image
    syrtis_title = page_parsed.find('h2', class_='title').text
    # print(syrtis_img_html)
    # print(syrtis_title)


    # In[52]:


    # rinse-repeat Valles
    browser.visit(mars_hem_url)
    time.sleep(1)
    html = browser.html
    hemisphere_parsed = bs(html,"html.parser")


    # In[54]:


    browser.click_link_by_partial_text('Valles Marineris Hemisphere Enhanced')
    time.sleep(1)
    html = browser.html
    page_parsed = bs(html, 'html.parser')


    # In[55]:


    valles_image = page_parsed.find('img', class_='wide-image').get('src')

    valles_img_html = base_hem_html + valles_image
    valles_title = page_parsed.find('h2', class_='title').text
    # print(valles_img_html)
    # print(valles_title)


    # In[57]:


    # bring it all together in a dict
    hs_title_img_final = [
        {"title": cerberus_title, "img_src": cerberus_img_html},
        {"title": schiaparelli_title, "img_src": schiaparelli_img_html},
        {"title": syrtis_title, "img_src": syrtis_img_html},
        {"title": valles_title, "img_src": valles_img_html}
    ]
    # print(hs_title_img_final)


    # In[39]:


    #I could probably loop the above section for all hemispheres, but I can't think of how to do it at the moment

    # hs_titles = []
    # hs_urls = []

    # img_title_loc = hemisphere_parsed.find_all('a', class_='h3')

    # for x in img_title_loc:
    #     hs_title.append(hemisphere_parsed.find('h3').text)
    #     hs_urls.append(base_hem_html + hemisphere_parsed.find('a', class_='href')
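
    # A loop-based sketch of that idea (an added illustration, not from the original
    # notebook): it reuses browser, mars_hem_url, base_hem_html, bs and time defined
    # above and collects the same four entries under a separate name so it does not
    # clash with hs_title_img_final assembled cell by cell.
    hs_title_img_loop = []
    browser.visit(mars_hem_url)
    time.sleep(1)
    index_parsed = bs(browser.html, 'html.parser')
    for heading in index_parsed.find_all('h3'):
        browser.click_link_by_partial_text(heading.get_text())
        time.sleep(1)
        detail_parsed = bs(browser.html, 'html.parser')
        hs_title_img_loop.append({
            'title': detail_parsed.find('h2', class_='title').text,
            'img_src': base_hem_html + detail_parsed.find('img', class_='wide-image').get('src')
        })
        browser.visit(mars_hem_url)
        time.sleep(1)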





    # make dictionary out of all collected data for later use in flask app
    mars_info={"nasa_news_title": nasa_news_title,
            "nasa_news_teaser": nasa_news_teaser,
            "featured_image_url":featured_image_url,
            "mars_weather_url":mars_weather_url,
            "mars_weather":mars_weather,
            "mars_facts_df":mars_facts_df,
            "hs_title_img_final":hs_title_img_final    
            }
    browser.quit()
    return mars_info
Example #18
def scrape():
    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)
    soup = bs(response.text, 'lxml')
    title = soup.find('div', class_='content_title')
    article_title = title.text
    paragraph = soup.find('div', class_='rollover_description_inner')
    article_paragraph = paragraph.text
    browser = Browser('chrome')
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(5)
    browser.find_by_id('full_image').click()
    time.sleep(5)
    browser.find_link_by_partial_href('spaceimages/details.php?').click()
    time.sleep(5)
    browser.find_link_by_partial_href('hires.jpg').click()
    featured_image_url = browser.url
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    twitter = browser.html
    soup = bs(twitter, 'lxml')
    i = 0
    tweets = soup.find_all('p', class_='TweetTextSize')
    tweet = tweets[i].text.split('pic.twitter.com')[0]
    while tweet.startswith('InSight sol 1') == False:
        i += 1
        tweet = tweets[i].text.split('pic.twitter.com')[0]
    mars_weather = tweet
    url = 'https://space-facts.com/mars/'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    html_string = soup.find('table')
    html_table_string = pd.read_html(str(html_string))
    html_table_string = str(html_table_string)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemispheres = [
        'Cerberus Hemisphere', 'Schiaparelli Hemisphere',
        'Syrtis Major Hemisphere', 'Valles Marineris Hemisphere'
    ]
    hemisphere_images = []
    i = 0
    for hemisphere in hemispheres:
        browser.find_link_by_partial_text(' Hemisphere')[i].click()
        time.sleep(2)
        browser.find_link_by_text("Original")
        time.sleep(2)
        url = browser.url
        #url = url + '.tif'
        #url = url[:4] + url[5:]
        #url = url.replace('search/map', 'download')
        hemisphere_dict = {'url': url, 'title': hemisphere}
        hemisphere_images.append(hemisphere_dict)
        browser.back()
        time.sleep(2)
        i += 1
    mars_dict = dict()
    mars_dict = {
        'NASA Mars News': {
            'Article Title': article_title,
            'Article Paragraph': article_paragraph
        },
        'JPL Featured Image': featured_image_url,
        'Mars Weather': mars_weather,
        'Mars Facts': html_table_string,
        'Mars Hemispheres': hemisphere_images
    }

    return mars_dict
Example #19
def Mars_Hemispheres():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemisphere_url)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    img_header = soup.find_all("h3")

    title_list = []
    imgs_url_list = []

    for i in img_header:

        try:
            title = i.get_text()
            browser.click_link_by_partial_text(title)
            imgs_url = browser.find_link_by_partial_href('download')['href']
            title_list.append(title)
            imgs_url_list.append(imgs_url)
            browser.visit(hemisphere_url)

            print('-----------')
            print(title)
            print(imgs_url)

        # Click the 'Next' button on each page
        # try:
        #     browser.click_link_by_partial_text('next')

        except:
            print("Scraping Complete")

    hemisphere_dict = [{"title": title_list[i], "img_url": imgs_url_list[i]} for i in range(len(title_list))]
    
    return(hemisphere_dict)

def scrape():
    mars_w = {}

    mars_w["weather"] = Mars_Weather()

    return mars_w

####################################################################################

# Create an instance of our Flask app.
app = Flask(__name__)

# Create connection variable
conn = 'mongodb://*****:*****'

# Set up the pymongo client and database (credentials and database name are masked
# in the source; 'team_db' below is an assumed placeholder)
client = pymongo.MongoClient(conn)
db = client.team_db


@app.route('/')
def index():
    # Store the entire team collection in a list
    teams = list(db.team.find())
    print(teams)

    # Return the template with the teams list passed in
    return render_template('index.html', teams=teams)


if __name__ == "__main__":
    app.run(debug=True)

##############################################################

app = Flask(__name__)

# Use flask_pymongo to set up mongo connection
app.config["MONGO_URI"] = "mongodb://*****:*****@app.route("/")
def index():
    listings = mongo.db.listings.find_one()
    return render_template("index.html", listings=listings)


@app.route("/scrape")
def scraper():
    listings = mongo.db.listings
    listings_data = scrape_craigslist.scrape()
    listings.update({}, listings_data, upsert=True)
    return redirect("/", code=302)


if __name__ == "__main__":
    app.run(debug=True)
Example #20
### Key to cracking this problem was use below code
###link_text = soup.find(class_="description").find('h3').get_text()
###browser.click_link_by_partial_text(link_text)
### Thank Dylan for helping me to crack it....
### Then I tested it with below to get the link from next page
### browser.find_link_by_partial_href('download')['href']
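# A minimal, self-contained sketch of that approach (assumptions: chromedriver is on
# the PATH and the older splinter API used throughout these examples is available;
# the sketch_* names below are illustrative, not from the original notebook):
from splinter import Browser
from bs4 import BeautifulSoup

sketch_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
sketch_browser = Browser('chrome', headless=True)
sketch_browser.visit(sketch_url)

sketch_results = []
for heading in BeautifulSoup(sketch_browser.html, 'html.parser').find_all('h3'):
    link_text = heading.get_text()
    sketch_browser.click_link_by_partial_text(link_text)  # open the hemisphere's detail page
    download_href = sketch_browser.find_link_by_partial_href('download')['href']  # full-size image link
    sketch_results.append({'title': link_text, 'img_url': download_href})
    sketch_browser.visit(sketch_url)  # return to the results list
sketch_browser.quit()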

# In[1037]:

hemisphere_image_urls = []
temp_dict = {'title': [], 'img_url': []}
capture_text = soup.find_all('h3')
for i in capture_text:
    y = i.get_text()
    val1 = y.strip('Enhanced')
    browser.click_link_by_partial_text(y)
    val2 = browser.find_link_by_partial_href('download')['href']
    temp_dict = {'title': val1, 'img_url': val2}
    hemisphere_image_urls.append(temp_dict)
    #img_url.append(browser.find_link_by_partial_href('download')['href'])
    browser.visit(url)

# In[1038]:

hemisphere_image_urls

# In[1039]:

browser.quit()
def scrape():
    final_dct = {}

    # Get latest news title and text
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = bs(html, 'html.parser')
    # print(soup.prettify())
    content_title_div = soup.find('div', class_='list_text')
    news_title = content_title_div.find('div', class_='content_title').a.text
    news_p = content_title_div.find('div', 'article_teaser_body').text
    final_dct['news_title'] = news_title
    final_dct['news_p'] = news_p
    
    # Get featured img url
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(url)
    browser.click_link_by_id("full_image")
    time.sleep(10)
    html = browser.html
    soup = bs(html, 'html.parser')
    img = soup.find('img', class_='fancybox-image')
    img_src = img['src']
    featured_image_url = 'https://www.jpl.nasa.gov' + img_src
    final_dct["featured_img_url"] = featured_image_url

    # Get Mars data table
    url = "https://space-facts.com/mars/"
    d = pd.read_html(url)
    df = pd.DataFrame({})
    df['Attribute'] = d[1]['Mars - Earth Comparison']
    df['Mars'] = d[1]['Mars']
    table_html = df.to_html()
    final_dct['table_html'] = table_html


    # Get hemisphere title and img url
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    links = browser.find_link_by_partial_href('enhanced')
    final_links = []
    for i in links:
        if final_links.count(i['href']) == 0:
            final_links.append(i['href'])
        else:
            pass
    hemisphere_img_urls = []
    main_url = 'https://astrogeology.usgs.gov'
    for i in final_links:
        browser.visit(i)
        html = browser.html
        soup = bs(html, 'html.parser')
        img = soup.find('img', class_='wide-image')
        img_src = img['src']
        img_url = main_url + img_src
        hemisphere_title = soup.find('h2', class_='title').text
        final_title = hemisphere_title.rsplit(' ', 1)[0]
        hemisphere_img_urls.append({
            "title": final_title,
            "img_url": img_url
        })
    final_dct['hemisphere_img_urls'] = hemisphere_img_urls

    return final_dct
Example #22
import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser
import requests
import time

hemisphere_dictionary = []
hemisphere_data = {"Image": [] , "URL": []}

USGS_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser = Browser("chrome", headless = False)
browser.visit(USGS_url)
time.sleep(3)

home = browser.html
USGSsoup = bs(home, "html.parser")
headings = USGSsoup.find_all("h3")

for heading in headings:
    title = heading.text
    print(title)
    browser.click_link_by_partial_text(title)
    time.sleep(3)
    img_url = browser.find_link_by_partial_href("download")["href"]
    print(img_url)
    hemisphere_data = {"Image": title, "URL": img_url}
    hemisphere_dictionary.append(hemisphere_data)
    time.sleep(3)
    browser.visit(USGS_url)

print(hemisphere_dictionary)
def query_iPfam( pdb_structures_query ):

	#
	# open browser
	#

	br = Browser()

	url = 'http://www.ipfam.org/search/keyword'

	br.visit(url)

	#  
	# Search pdb structures vs. interactions
	#

	# make a search qeury with all the pdb structures
	br.find_by_css("#keywords")[0].fill(pdb_structures_query)

	br.find_by_css("input.button").click()

	# all structure interactions
	br.find_by_css(".lozenge > ul:nth-child(2) > li:nth-child(3) > input:nth-child(1)").click()

	# all ligand interactions
	# ...

	# click "show all"
	br.find_by_css("input.button:nth-child(3)").click()

	# show 100 entries
	br.find_by_css("#pdb_matches_table_length > label:nth-child(1) > select:nth-child(1)").first.select("-1")

	# grab all structure's and their interactions links
	count = 0

	pdb_to_url = []

	while True:

		count += 1

		try: 
			pdb_id = br.find_by_css("#pdb_matches_table > tbody:nth-child(2) > tr:nth-child("+str(count)+") > td:nth-child(1) > a:nth-child(1)").first.text
			pdb_url = br.find_by_css("#pdb_matches_table > tbody:nth-child(2) > tr:nth-child("+str(count)+") > td:nth-child(1) > a:nth-child(1)").first['href']
			pdb_to_url.append((pdb_id,pdb_url))
		except exceptions.ElementDoesNotExist: 
			break

	#
	# obtain interactions per pdb
	#
	print "obtaining interactions for each pdb structure..."

	pdb_to_interactions = {}

	interaction_to_url = {}

	for pdb, url in pdb_to_url:

		print "pdb structure: "+pdb

		br.visit(url)

		interaction_status = br.find_by_css("div.lozenge:nth-child(1) > dl:nth-child(3) > dd:nth-child(2) > p:nth-child(1) > label:nth-child(2)").first.text

		n_family_interactions = int(interaction_status.replace("Family (","").replace(")",""))

		if n_family_interactions > 0:
			print "\t\t"+str(n_family_interactions)+" interactions found"

			br.find_by_value("fam_int").first.click() # click family interactions

			family_interactions = br.find_link_by_partial_href("/fam_int/") # @todo: test if this is a correct matcher

			for interaction in family_interactions:

				interaction_url = interaction['href']
				a, b = interaction_url.split("/fam_int/")
				a_pfam_id = a.split("/family/")[1]
				b_pfam_id = b.split("/sequence")[0]

				interaction_neat = (a_pfam_id,b_pfam_id)

				print "\t\t\titeraction: "+interaction_neat[0]+"-to-"+interaction_neat[1]+" url: "+interaction['href'] # e.g. RVP-to-RVP

				interaction_to_url[interaction_neat] = interaction['href']

				if pdb_to_interactions.has_key(pdb):
					pdb_to_interactions[pdb].append(interaction_neat)
				else:
					pdb_to_interactions[pdb] = [interaction_neat]
		else:
			print "\t\t"+str(n_family_interactions)+" interactions found"

			pdb_to_interactions[pdb] = []

	# # save interactions data
	# pickle.dump( pdb_to_interactions, open( "./data/pdb_to_interactions.p", "wb" ) )
	# pickle.dump( interaction_to_url,  open( "./data/interaction_to_url.p", "wb" ) )
	#
	# determine which pdb protein structures interact
	# 	Note: problem, we do not know which of the interacting pfams belong to the native protein
	#

	return pdb_to_interactions, interaction_to_url
hemisphere_image_urls = []

hem_dict = {}

# Parse the resulting html with soup
html = browser.html
hem_soup = soup(html, 'html.parser')

# Write code to retrieve the image urls and titles for each hemisphere.
# Find all titles
titles = hem_soup.find_all('h3')

for i in titles:
    t = i.get_text()
    title = t.strip()
    browser.click_link_by_partial_text(t)

    href = browser.find_link_by_partial_href('_enhanced.tif/full.jpg')['href']
    img_url = f'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars/{href}'

    hem_dict = {'title': title, 'img_url': img_url}
    hemisphere_image_urls.append(hem_dict)

    browser.visit(url)

# Print the list that holds the dictionary of each image url and title.
hemisphere_image_urls

# Quit the browser
browser.quit()
Example #25
# In[12]:


browser.find_by_id('full_image').click()


# In[14]:


browser.find_by_text('more info     ').click()


# In[15]:


feature_img_link = browser.find_link_by_partial_href('photojournal.jpl.nasa.gov/jpeg')
print(feature_img_link)


# In[16]:


feature_img_link=feature_img_link['href']
print(feature_img_link)


# In[17]:


# Retrieve page with the requests module
response_mars_weather = requests.get(url_Mars_weather)
				browser.find_by_name('addScenario').first.click()
				browser.fill('scName', countryTypeList[conIndex]+typeaName+igType)
				browser.type('scEffDate', '\b\b\b\b\b\b\b\b\b\b')
				browser.type('scEffDate', '2015-10-31')
				browser.find_by_name('update').first.click()

				browser.find_link_by_text('Obligor').first.click()

				# choose the companyType type
				element = browser.find_by_name('companyType').first
				element.select(str(cType))

				browser.fill('obligorName', companyName)
				browser.find_by_name('ObligorSearch').first.click()
			
				browser.find_link_by_partial_href('javascript:refPortResult')[0].click()

				# select "B-III counterpaty type" to be "corporate"
				element = browser.find_by_name('counterPartyType').first
				element.select('1')
				# select "Classification re Asset Value Correlation" to be "Non-Financial Institution (N)"
				element = browser.find_by_name('avc').first
				element.select('4')

				# select proper IG according to the IG type
				if igType == 'orig':
					pass
				else:
					element = browser.find_by_name('obligorIgCode').first
					# eleVal = element.find_element_by_xpath("//option[@selected='selected']").first.value
					eleVal = element.find_by_xpath('option[@selected="selected"]').first.value
Example #27
class Youtube:
    def __init__(self):
        self.log = common.Logger()
        self.settings = common.Settings()
        self.comment_generator = common.CommentGenerator()  # comment generator
        self.browser = Browser('chrome')
        self.subscriptions = []  # our subscriptions
        self.comments = common.Comments2()  # comment database
        common.Subscription.url_list = []  # list of our subscriptions
        self.sleep_time_after_visit = 5
        self.our_channel_url = u'https://www.youtube.com/channel/{}'.format(
            self.settings.get_parameter('address'))  # our channel
        self.max_subscribers_amount = 1000  # subscribe only if the subscriber count is below this number
        with open(
                'channels.txt', 'r'
        ) as f:  # file of channels from which we take the list of channels to subscribe to
            buffer = f.read()
            self.channels_list = buffer.split()
            self.channels_list = filter(bool, self.channels_list)
            self.channels_list = filter(lambda x: not x[0] == '#',
                                        self.channels_list)
            #self.channels_list = [x for x in self.channels_list if not x[0] == '#']
        self.all_channel_mode = True
        self.re_is_cyrillic = regex.compile('[\p{IsCyrillic}]', regex.UNICODE)
        self.comment_not_russian = 'not russian title!'
        self.comment_errors_counter = 0

    def login(self):
        browser = self.browser
        browser.visit('https://accounts.google.com')
        browser.fill('Email', self.settings.get_parameter('login'))
        button = browser.find_by_id('next')
        button.click()
        browser.fill('Passwd', self.settings.get_parameter('password'))
        button = browser.find_by_id('signIn')
        button.click()
        self.log.info('login ok')
        time.sleep(self.sleep_time_after_visit)

    def get_subscriptions(self):
        self.browser.visit('https://www.youtube.com/subscription_manager')
        time.sleep(self.sleep_time_after_visit)
        del self.subscriptions[:]
        links = self.browser.find_link_by_partial_href('/channel/')
        for link in links:
            #if link.visible:
            link_url = link['href']
            if not link_url in common.Subscription.url_list and not self.our_channel_url in link_url:
                self.subscriptions.append(
                    common.Subscription(link_url, link.value))
                #print link.value
            #link.click()
        #self.subscriptions.reverse()
        return links

    def get_user_subscribers(self, user_url):
        if self.all_channel_mode:
            user_url = user_url.url
        self.browser.visit(user_url + '/channels')
        time.sleep(self.sleep_time_after_visit)
        self.log.info('open user {}'.format(user_url))
        links = self.browser.find_link_by_partial_href('/channel/')
        user_subs = []
        for link in links:
            #if link.visible:
            #print link.find_by_id('href').first
            if link['dir'] == 'ltr' and 'yt-ui-ellipsis' in link['class']:
                #print link.value#, link['href'], link['class']
                subs_url = link['href']
                if not subs_url == self.our_channel_url:
                    user_subs.append(subs_url)
        return user_subs

    def get_subscribers_amount(self):
        #elements = self.browser.find_by_xpath('//*[@id="c4-primary-header-contents"]/div/div/div[2]/div/span[2]/span[1]')
        #elements = self.browser.find_by_xpath('//*[@id="c4-primary-header-contents"]/div/div/div[2]/div/span/span[1]')
        #//*[@id="watch7-subscription-container"]/span/span[2]
        elements = self.browser.find_by_id('c4-primary-header-contents')
        spans = elements.find_by_tag('span')
        amount = 0
        for span in spans:
            if span['class'] == 'yt-subscription-button-subscriber-count-branded-horizontal subscribed yt-uix-tooltip':
                amount_str = span['title'].replace(unichr(160), '')
                #print map(ord, list(amount_str))
                amount = int(amount_str)
        return amount

    def open_user_page(self, user_url):
        self.browser.visit(user_url)
        time.sleep(self.sleep_time_after_visit)
        subs = self.get_subscribers_amount()
        return subs

    def open_user_videos_page(self, user_url):
        links = self.browser.find_link_by_partial_href('/videos')
        for link in links:
            if link.visible:
                self.log.info('open videos list {}'.format(link['href']))
                link.click()
                break
        time.sleep(self.sleep_time_after_visit)

    def open_last_user_video(self, user_url, not_commented=True):
        self.open_user_videos_page(user_url)
        links = self.browser.find_link_by_partial_href('watch?')
        url_found = False
        for link in links:
            #if link.visible:
            #print link.find_by_id('href').first
            #if link['dir'] == 'ltr' and 'yt-ui-ellipsis' in link['class']:
            url = link['href']
            #print link.value, url, link['class']
            if 'yt-uix-sessionlink' in link[
                    'class'] and not self.comments.is_video_commented(url):
                self.log.info('open video {}'.format(url))
                #link.click()
                self.browser.visit(url)
                url_found = True
                break
        if not url_found:
            return ''
        else:
            time.sleep(self.sleep_time_after_visit)
            return url

    def find_user_of_current_video(self):
        path = '//*[@id="watch7-user-header"]/a'
        elements = self.browser.find_by_xpath(path)
        return elements.first['href']

    def press_like(self):
        path_notlike = '//*[@id="watch8-sentiment-actions"]/span/span[2]/button'
        path_like = '//*[@id="watch8-sentiment-actions"]/span/span[1]/button'
        elements = self.browser.find_by_xpath(path_like)
        b = elements.first
        #print b['title']
        if b.visible:
            b.click()

    def press_subscribe(self):
        path = '//*[@id="watch7-subscription-container"]/span/button[1]'
        elements = self.browser.find_by_xpath(path)
        b = elements.first
        already_subscribed = False
        if b['data-is-subscribed']:
            #print 'already subscribed!'
            already_subscribed = True
        else:
            if b.visible:
                b.click()
        return already_subscribed

    #//*[@id="watch7-subscription-container"]/span/span[1]
    #//*[@id="c4-primary-header-contents"]/div/div/div[2]/div/span[2]/span[1]
    def have_it_cyrillic_letters(self, buffer):
        return not len(regex.findall(self.re_is_cyrillic, buffer)) == 0

    def comment_on_video(self):
        url = self.browser.driver.current_url
        if not 'watch?' in url:
            self.log.error('not video page!')
            return ''
            #raise Exception('not video page!')
        user = self.find_user_of_current_video()
        self.subscriptions.insert(0, common.Subscription(
            user, ''))  # add the user to our subscriptions
        if self.comments.is_user_commented(user):
            print 'user already commented {}'.format(user)
        if self.comments.is_video_commented(url):
            self.log.error('video already commented!')
            return ''
        #print self.get_subscribers_amount()
        # check whether the title contains Cyrillic (Russian) letters
        title = self.browser.title
        if not self.have_it_cyrillic_letters(title):
            msg = self.comment_not_russian
            self.log.error(msg)
            return msg
        time.sleep(5)
        #raise Exception('like')
        self.browser.driver.execute_script("window.scrollTo(0, 350)")
        time.sleep(10)
        #browser.find_by_tag('html').first.type(Keys.PAGE_DOWN)

        elements_mode = 0  # different types of comment boxes and buttons
        elements = self.browser.find_by_id('yt-comments-sb-standin')
        if len(elements) == 0:
            # comments are disabled
            elements = self.browser.find_by_xpath(
                '//*[@id="comment-section-renderer"]/div[1]/div[2]')
            if len(elements) == 0:
                msg = 'Cannot find field for comment!'
                self.log.error(msg)
                raise Exception(msg)
                #return ''
            elements_mode = 1
        if elements.first.visible:
            elements.first.click()
        else:
            raise Exception('Comment element not visible!')
        time.sleep(3)
        print 'elements mode', elements_mode

        # write the comment
        if elements_mode == 0:
            elements = self.browser.find_by_xpath(
                '//*[@id="yt-comments-sb-container"]/div[2]/div[1]/div[1]')
        else:
            elements = self.browser.find_by_xpath(
                '//*[@id="comment-simplebox"]/div[1]')
        if len(elements) == 0:
            raise Exception('Comment element not found!')
            #return ''
        elements.first.click()
        comment_text = self.comment_generator.get_comment()
        try:
            elements.first.fill(comment_text)
        except:
            msg = 'Error when fill comment!'
            self.log.error(msg)
            self.comment_errors_counter += 1
            if self.comment_errors_counter > 5:
                raise Exception(msg)
            return ''
        #keys = elements.first.type(comment_text, slowly=True)  # stopped working: raises an exception inside the loop
        #for key in keys:
        #    pass

        # click the submit button
        if elements_mode == 0:
            elements = self.browser.find_by_xpath(
                '//*[@id="yt-comments-sb-container"]/div[2]/div[1]/div[3]/button[2]'
            )
        else:
            elements = self.browser.find_by_xpath(
                '//*[@id="comment-simplebox"]/div[3]/button[2]')
        if len(elements) == 0:
            raise Exception('Cannot find send comment button!')
        elements.first.click()
        time.sleep(3)
        #print elements.first.text
        #self.comments.add(url)
        self.log.info(u'comment video {}'.format(comment_text))

        # subscribe
        subscribed_before = self.press_subscribe()
        if subscribed_before:
            # we were already subscribed
            self.log.error('already subscribed!')
            #return ''
        time.sleep(3)
        # press like
        self.press_like()

        return comment_text

    def get_channel_list(self):
        if not self.all_channel_mode:
            channel_list = self.channels_list
        else:
            channel_list = self.subscriptions
        return channel_list


#subs_cache = []
def scrape():

	# handle Mars News
	news_url = 'https://mars.nasa.gov/news'
	chromedriver = "/usr/local/bin/chromedriver"
	os.environ["webdriver.chrome.driver"] = chromedriver
	driver = webdriver.Chrome(chromedriver)
	driver.get(news_url)
	time.sleep(5)
	html = driver.page_source
	news_soup = BeautifulSoup(html, 'lxml')
	news_results = news_soup.find_all('li', class_="slide")
	text = news_results[0].find_all('a')
	news_title = text[1].text
	news_p = text[0].find('div', class_="rollover_description_inner").text
	driver.close()

	# scrape JPL featured image
	executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
	browser = Browser('chrome', **executable_path, headless=False)
	image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
	browser.visit(image_url)
	browser.click_link_by_partial_text('FULL IMAGE')
	time.sleep(5)
	browser.click_link_by_partial_text('more info')
	links_found = browser.find_link_by_partial_href('images/largesize')
	featured_image_url = links_found['href']
	browser.quit()

	# scrape Mars weather
	weather_url = 'https://twitter.com/marswxreport?lang=en'
	weather_response = requests.get(weather_url)
	weather_soup = BeautifulSoup(weather_response.text, 'html.parser')
	weather_results = weather_soup.find_all('div', class_="js-tweet-text-container")
	mars_weather = weather_results[0].find('p').text

	# scrape Mars facts
	facts_url = 'https://space-facts.com/mars/'
	tables = pd.read_html(facts_url)
	df = tables[0]
	html_table = df.to_html(header=None,index=False)
	html_table = html_table.replace('\n', '')

	# scrape Mars Hemispheres
	hemisphere_image_urls =[]
	# define a function to scrape full resolution image link using splinter
	def find_hemisperes(name):
	    browser = Browser('chrome', **executable_path, headless=False)
	    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
	    browser.visit(url)
	    browser.click_link_by_partial_text(name)
	    links_found = browser.find_link_by_partial_href(name.split()[0].lower())
	    url = links_found['href']
	    dic = {"title": f"{name} Hemisphere", "img_url": url}
	    hemisphere_image_urls.append(dic)
	    browser.quit()
	hemisperes_list = ['Cerberus', 'Schiaparelli', 'Syrtis Major', 'Valles Marineris']
	for hemispere in hemisperes_list:
		find_hemisperes(hemispere)
		time.sleep(2)

	scrape_dic = {
		"news_title" : news_title,
		"news_p" : news_p,
		"featured_image" : featured_image_url,
		"weather" : mars_weather,
		"facts" : html_table,
		"hemispheres" : hemisphere_image_urls
	}

	return scrape_dic
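
# A minimal, hypothetical driver for scrape() above (not part of the original
# snippet): it runs the scrape once and prints each key of the returned
# dictionary with a truncated preview of its value.
if __name__ == '__main__':
    mars_data = scrape()
    for key, value in mars_data.items():
        print(f"{key}: {str(value)[:80]}")
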
def query_iPfam( pdb_structures_query ):

	#
	# open browser
	#

	br = Browser()

	url = 'http://www.ipfam.org/search/keyword'

	br.visit(url)

	#  
	# Search pdb structures vs. interactions
	#

	# make a search query with all the pdb structures
	br.find_by_css("#keywords")[0].fill(pdb_structures_query)

	br.find_by_css("input.button").click()

	# all structure interactions
	br.find_by_css(".lozenge > ul:nth-child(2) > li:nth-child(3) > input:nth-child(1)").click()

	# all ligand interactions
	# ...

	# click "show all"
	br.find_by_css("input.button:nth-child(3)").click()

	# show 100 entries
	br.find_by_css("#pdb_matches_table_length > label:nth-child(1) > select:nth-child(1)").first.select("-1")

	# grab all structure's and their interactions links
	count = 0

	pdb_to_url = []

	while True:

		count += 1

		try: 
			pdb_id = br.find_by_css("#pdb_matches_table > tbody:nth-child(2) > tr:nth-child("+str(count)+") > td:nth-child(1) > a:nth-child(1)").first.text
			pdb_url = br.find_by_css("#pdb_matches_table > tbody:nth-child(2) > tr:nth-child("+str(count)+") > td:nth-child(1) > a:nth-child(1)").first['href']
			pdb_to_url.append((pdb_id,pdb_url))
		except exceptions.ElementDoesNotExist: 
			break

	#
	# obtain interactions per pdb
	#
	print "obtaining interactions for each pdb structure..."

	pdb_to_interactions = {}

	interaction_to_url = {}

	for pdb, url in pdb_to_url:

		print "pdb structure: "+pdb

		br.visit(url)

		interaction_status = br.find_by_css("div.lozenge:nth-child(1) > dl:nth-child(3) > dd:nth-child(2) > p:nth-child(1) > label:nth-child(2)").first.text

		n_family_interactions = int(interaction_status.replace("Family (","").replace(")",""))

		if n_family_interactions > 0:
			print "\t\t"+str(n_family_interactions)+" interactions found"

			br.find_by_value("fam_int").first.click() # click family interactions

			family_interactions = br.find_link_by_partial_href("/fam_int/") # @todo: test if this is a correct matcher

			for interaction in family_interactions:

				interaction_url = interaction['href']
				a, b = interaction_url.split("/fam_int/")
				a_pfam_id = a.split("/family/")[1]
				b_pfam_id = b.split("/sequence")[0]

				interaction_neat = (a_pfam_id,b_pfam_id)

				print "\t\t\tinteraction: "+interaction_neat[0]+"-to-"+interaction_neat[1]+" url: "+interaction['href'] # e.g. RVP-to-RVP

				interaction_to_url[interaction_neat] = interaction['href']

				if pdb_to_interactions.has_key(pdb):
					pdb_to_interactions[pdb].append(interaction_neat)
				else:
					pdb_to_interactions[pdb] = [interaction_neat]
		else:
			print "\t\t"+str(n_family_interactions)+" interactions found"

			pdb_to_interactions[pdb] = []

	# # save interactions data
	# pickle.dump( pdb_to_interactions, open( "./data/pdb_to_interactions.p", "wb" ) )
	# pickle.dump( interaction_to_url,  open( "./data/interaction_to_url.p", "wb" ) )
	#
	# determine which pdb protein structures interact
	# 	Note: problem, we do not know which of the interacting pfams belong to the native protein
	#

	return pdb_to_interactions, interaction_to_url
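
# A minimal usage sketch for query_iPfam (not in the original snippet). The
# space-separated keyword query and the PDB IDs below are assumptions made
# purely to illustrate the two returned mappings.
if __name__ == '__main__':
	example_query = '1hiv 1a4y 2src'  # hypothetical PDB identifiers
	interactions_by_pdb, urls_by_interaction = query_iPfam(example_query)
	for pdb_id, interactions in interactions_by_pdb.items():
		print('{0}: {1} family interaction(s)'.format(pdb_id, len(interactions)))
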
Beispiel #30
0
@author: JIANGHOU
"""

import time
from splinter import Browser
from selenium import webdriver
browser = Browser()

url = 'http://geo.ckcest.cn/'
browser.visit(url)
#log in manually

#remote-sensing imagery
browser.visit('http://geo.ckcest.cn/scientific/InternationalData/list.html')
#then jump to a page number
linkspage = browser.find_link_by_partial_href('#')
paperurls = []
time.sleep(3)
papers = browser.find_link_by_partial_href('remotedetail.html')
browserurl = browser.url
for j in range(0, 10):  #len(papers)
    papers[j].click()
    time.sleep(3)
    window = browser.windows[1]
    paperurls.append(window.url)
    window.close()
for i in range(1, 10):
    browser.visit(paperurls[i])
    time.sleep(15)
    remotelist = browser.find_by_id('remotelist')
    within_elements = remotelist.first.find_by_tag('a')
Beispiel #31
0
def line_login(browser, user_name, password, code):

    """
    Automatically log in to LINE, enter the gift card code given as a parameter, and charge it.
    Returns the result of the charge.

    :param browser: browser instance
    :param user_name: login user name
    :param password: login password
    :param code: gift card code
    :return: charge result
    """
    # open the login page
    browser = Browser('firefox')
    url = 'https://store.line.me/home/'
    browser.visit(url)

    # log in
    login_submit = browser.find_link_by_partial_href('login')

    if login_submit:
        login_submit.click()
    else:
        html_code = browser.html
        return {
            'code': 4,
            'message': "サイト上に問題が発生しました。(サイトがアクセスできない、またはネットが遅すぎる可能性があります。)",
            'htmlcode': html_code
        }

    username_input_field = browser.find_by_id('id')
    password_input_field = browser.find_by_id('passwd')
    login_submit = browser.find_by_value('Login')

    if username_input_field and password_input_field and login_submit:
        username_input_field.fill(user_name)
        password_input_field.fill(password)
        login_submit.click()
    else:
        html_code = browser.html
        return {
            'code': 4,
            'message': "サイト上に問題が発生しました。(サイトがアクセスできない、またはネットが遅すぎる可能性があります。)",
            'htmlcode': html_code
        }

    # check whether a login CAPTCHA image is shown
    #captcha_image_field = browser.find_by_css('img.FnCaptchaImg')

    # check the email address / password error message
    login_alert_field = browser.find_by_css('p.mdMN02Txt')

    if browser.is_element_present_by_css('p.mdMN02Txt'):

        result = login_alert_field.value

        if result.find(unicode('The password you have entered is invalid, or you have not registered your email address with LINE.')) != -1:

            html_code = browser.html

            return {
                'code': 2,
                'message': 'メールアドレスまたはパスワードが正しくありません。',
                'htmlcode': html_code
            }

    # move to the charge screen
    browser.find_by_text('Charge').click()
    browser.windows.current = browser.windows[1]
    browser.find_by_id('70002').click()
    browser.execute_script("charge(this); return false;")

    # perform the charge
    code_input_field = browser.find_by_id('FnSerialNumber')

    code_input_field.fill(code)

    time.sleep(9000)

    browser.execute_script("javascript:doCharge(this);return false;")

    result = browser.find_by_css('p.mdLYR11Txt01').value

    browser.quit()

    return result
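
# A hypothetical call sketch for line_login (not shown in the original snippet).
# The credentials and card code are placeholders; note that the function opens
# its own Firefox instance, so the browser argument passed here is not reused.
if __name__ == '__main__':
    charge_result = line_login(None, 'user@example.com', 'secret-password', 'XXXXXXXXXXXXXXXX')
    print(charge_result)
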
hemises = hemis_soup.find_all('h3')

# Append the dictionary with the image url string and the hemisphere title to a list.
hemisphere_image_urls = []

for hemis in hemises:
    # create a fresh dict each iteration so earlier entries are not overwritten
    hemis_dict = {}
    hemis_dict["title"] = hemis.text.replace('Enhanced', '').strip()
    
    # Click on the link with the corresponding hemis
    try:
        browser.click_link_by_partial_text(hemis.text)
    except ElementDoesNotExist:
        print(f"{hemis.text} Image doesn't exist")
    
    # Scrape the image url string 
    hemis_dict["img_url"] = browser.find_link_by_partial_href('download')['href']
      
    hemisphere_image_urls.append(hemis_dict)       
    
    browser.visit(url_hemis) 
    
print(hemisphere_image_urls)


# In[ ]:




Beispiel #33
0
def scrape_info():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Scraping Nasa Mars News
    # Scrape the [NASA Mars News Site](https://mars.nasa.gov/news/)
    # and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.

    source = requests.get('https://mars.nasa.gov/news/').text
    soup = bs(source, 'html.parser')
    article = soup.find_all('div', class_='content_title')

    news_title0 = article[0].a.text
    news_title1 = article[1].a.text
    news_title2 = article[2].a.text

    description = soup.find_all('div', class_="rollover_description_inner")
    news_p0 = description[0].text
    news_p1 = description[1].text
    news_p2 = description[2].text

    # Scraping JPL Mars Space Images - Featured Image
    # Return featured_img_url

    #Visit the url for JPL Featured Space Image [here](https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars).
    #Use splinter to navigate the site
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    try:
        browser.click_link_by_id('full_image')
    except:
        browser.click_link_by_partial_text('FULL IMAGE')
    else:
        print("Scraping Full Image Complete")

    check = 0
    try:
        links_found = browser.find_link_by_partial_href('spaceimages/details')
        url2 = links_found[0]["href"]
        browser.click_link_by_partial_text('more info')
        links_found2 = browser.find_link_by_partial_href(
            'spaceimages/images/largesize')
        f1 = links_found2[0]["href"]
        check = 1
    except:
        browser.visit(url2)
        links_found3 = browser.find_link_by_partial_href(
            'spaceimages/images/largesize')
        f2 = links_found3[0]["href"]
    else:
        print("Scraping More Info Complete")

    if check == 1:
        featured_image_url = f1
    else:
        featured_image_url = f2

    # Mars Weather
    # Returns (mars_weather)

    #Visit the Mars Weather twitter account [here](https://twitter.com/marswxreport?lang=en)
    #and scrape the latest Mars weather tweet from the page.
    #Save the tweet text for the weather report as a variable called `mars_weather`
    source3 = requests.get('https://twitter.com/marswxreport?lang=en').text
    soup = bs(source3, 'html.parser')
    tweets = soup.find_all(
        'p',
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    #print(tweets[0].text)

    mars_weather = tweets[0].text

    #Mars Facts
    # Returns (mars_facts_table)

    facts = pd.read_html("https://space-facts.com/mars/")

    mars_facts_df = facts[1]
    mars_facts_df.columns = ['Description', 'Value']
    mars_facts_df.set_index("Description", inplace=True)
    mars_facts_df.head()
    mars_facts_table = mars_facts_df.to_html()
    mars_facts_table = mars_facts_table.replace('\n', '')

    # Mars Hemispheres
    # Returns hemisphere_image_urls

    hemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemi)
    html = browser.html

    # Get titles for all four mars pictures
    soup = bs(html, 'html.parser')
    hemi_class = soup.find_all('h3')
    cerberus_title = hemi_class[0].text
    schiaparelli_title = hemi_class[1].text
    syrtis_title = hemi_class[2].text
    valles_title = hemi_class[3].text

    # Get Cerberus Information
    browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
    link1 = (browser.find_link_by_partial_text('Original'))
    cerberus_link = (link1[0]["href"]) + "/full.jpg"
    browser.back()

    # Get Schiaparelli Information
    browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced')
    link2 = (browser.find_link_by_partial_text('Original'))
    schiaparelli_link = (link2[0]["href"]) + "/full.jpg"

    browser.back()

    # Get Syrtis Major Information
    browser.click_link_by_partial_text('Syrtis Major Hemisphere Enhanced')
    link3 = (browser.find_link_by_partial_text('Original'))
    syrtis_link = (link3[0]["href"]) + "/full.jpg"
    browser.back()

    # Get Valles Marineris Information
    browser.click_link_by_partial_text('Valles Marineris Hemisphere Enhanced')
    link4 = (browser.find_link_by_partial_text('Original'))
    valles_link = (link4[0]["href"]) + "/full.jpg"
    browser.back()

    marsdata = {
        "news_title0": news_title0,
        "description0": news_p0,
        "news_title1": news_title1,
        "description1": news_p1,
        "news_title2": news_title2,
        "description2": news_p2,
        "JPL_link": featured_image_url,
        "weather_tweet": mars_weather,
        "facts_table": mars_facts_table,
        "title1": cerberus_title,
        "img_url1": cerberus_link,
        "title2": schiaparelli_title,
        "img_url2": schiaparelli_link,
        "title3": syrtis_title,
        "img_url3": syrtis_link,
        "title4": valles_title,
        "img_url4": valles_link
    }

    # Close the browser after scraping
    browser.quit()

    # Return results
    return marsdata
Beispiel #34
0
"""

import time
from splinter import Browser
from selenium import webdriver  
browser = Browser()

url='http://geo.ckcest.cn/'
browser.visit(url) 
#log in manually

#literature download
browser.visit('http://geo.ckcest.cn/scientific/literature/techdoc_v.html')
#then jump to a page number
linkspage = browser.find_link_by_partial_href('#')
paperurls=[]
for i in range(16,19):
    page=linkspage[i]
    page.click()
    time.sleep(3)
    papers = browser.find_link_by_partial_href('techdoc_papers.html')
    browserurl=browser.url    
    for j in range(0,len(papers)):
        papers[j].click()
        time.sleep(3)
        window = browser.windows[1]
        paperurls.append(window.url) 
        window.close()
for i in range(1,len(paperurls)):
    browser.visit(paperurls[i]) 
Beispiel #35
0
def get_data():
    # 1 Nasa news *** USING BROWSER = SPLINTER ***
    browser = Browser('chrome')
    url = "https://mars.nasa.gov/news/"
    #go to url
    browser.visit(url)
    # HTML object
    html = browser.html
    # Parse HTML with BeautifulSoup
    soup = bs(html, "html.parser")
    # Collect News Title and Paragraph
    # Print("Start getting the titles...")
    result["news_title"] = soup.find("div",
                                     class_="content_title").text.strip()
    # print(news_title)
    result["news_p"] = soup.find('div', class_="article_teaser_body").text
    # print(news_p)
    # Print("Got titles and paragraphs...")

    # Close the browser after scraping
    browser.quit()

    #2- JPL Mars Space Images - Featured Image *** USING BROWSER = SPLINTER ***
    browser = Browser('chrome')
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    #go to url
    browser.visit(jpl_url)
    #navigate to link
    time.sleep(5)

    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(5)

    browser.click_link_by_partial_text('more info')
    image_html = browser.html
    jpl_soup = bs(image_html, "html.parser")
    image_path = jpl_soup.find('figure', class_='lede').a['href']
    result["featured_image_url"] = "https://www.jpl.nasa.gov/" + image_path
    # print(featured_image_url)
    # print("Got feature image url")

    # Close the browser after scraping
    browser.quit()

    #3- Mars Weather *** USING BROWSER = SPLINTER ***
    browser = Browser('chrome')
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    time.sleep(5)
    html = browser.html
    weather_soup = bs(html, 'html.parser')
    result["mars_weather"] = weather_soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text.strip()
    # print(mars_weather)
    # print("Got weather info from twitter...")
    # Close the browser after scraping
    browser.quit()

    #Mars Facts
    url = "https://space-facts.com/mars/"
    marsFacts = pd.read_html(url)
    facts = marsFacts[0]
    facts.columns = ['fact', 'Number']
    facts = facts.set_index('fact')['Number'].to_dict()
    result['facts'] = facts
    # print('Got facts...')

    #5-Mars Hemispheres
    #create list/dics
    hemisphere_img_urls = []
    hemisphere_dicts = {"title": [], "img_url": []}
    # url
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser = Browser("chrome")
    browser.visit(url)
    time.sleep(5)
    home_page = browser.html
    #HTML & Parsing
    hemispheres_soup = bs(home_page, "html.parser")
    results = hemispheres_soup.find_all("h3")
    # Use loop
    for r in results:
        title = r.text  # use the loop variable, not the result dict
        # print(title)
        #title without word "Enhanced"
        title = title[:-9]
        # print(title)
        browser.click_link_by_partial_text(title)
        img_url = browser.find_link_by_partial_href("download")["href"]
        # print(img_url)
        hemisphere_dicts = {"title": title, "img_url": img_url}
        hemisphere_img_urls.append(hemisphere_dicts)

        browser.visit(url)
    # Close the browser after scraping
    browser.quit()
    result["hemisphere_img_urls"] = hemisphere_img_urls
    # print("Got hemisphere images...")

    mars_data = {
        "title": result["news_title"],
        "content": result["news_p"],
        "featured_image_url": result["featured_image_url"],
        "latest_weather": result["mars_weather"],
        "image_data": hemisphere_img_urls,
    }
    existing = mars_collection.find_one()
    if existing:
        mars_data['_id'] = existing['_id']
        mars_collection.save(mars_data)
    else:
        mars_collection.save(mars_data)
    return mars_data
			element = browser.find_by_name('avc').first
			element.select('4')

			browser.fill('sic', '7759')
			browser.fill('totalSale', '10')

			igArray = ['99','98','95','90','87','85','83','80','77','75','73','70','65','60','40','30']

			for x in range(16):
				element = browser.find_by_name('obligorIgCode').first
				element.select(igArray[x])
				browser.find_by_name('UpdateButton').first.click()
				browser.driver.save_screenshot(typeaName + countryList[conIndex] + '_' + igArray[x] +'.png')

			browser.find_link_by_partial_href('/CNETCORP/cpmScenarios.do').first.click()

		else:	
			# create the new scenario
			if cType == 0:
				typeaName = 'Existing_Scotia_Public_'
			elif cType == 1:
				typeaName = 'Existing_Scotia_Private_'
			else:
				typeaName = 'Non_Scotia_Public_'

			browser.find_by_name('addScenario').first.click()
			browser.fill('scName', typeaName + countryList[conIndex])
			browser.type('scEffDate', '\b\b\b\b\b\b\b\b\b\b')
			browser.type('scEffDate', '2014-10-31')
			browser.find_by_name('update').first.click()
Beispiel #37
-1
def getRoutes(start, end):
    browser = Browser(driver_name="firefox")
    browser.visit('https://www.hopstop.com/search?xfr=cityscape')
    print(browser.url)
    browser.fill('address1',str(start))
    browser.fill('address2',str(end))
    browser.find_by_name('get_dirs').click()
    print(browser.url)
    if browser.is_text_present('Did you mean?'):
        print "better at least get here"
        #browser.click_link_by_href("#") 
        for link in browser.find_link_by_href("#"):
            print "Okay"
            if link.visible == True:
                print link.text
                browser.click_link_by_text(link.text)
                break
    browser.click_link_by_href("#")
    links = browser.find_link_by_partial_href("/station?stid")
    results = []
    for link in links:
        results.append(link.value)
    browser.quit()
    return results
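
# Hypothetical usage sketch for getRoutes (not in the original snippet); the
# start and end addresses below are placeholders only.
if __name__ == '__main__':
    stations = getRoutes('350 5th Ave, New York, NY', '1 Centre St, New York, NY')
    for station in stations:
        print(station)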