def scrape_links(steamName, gameName):
    """Scrape Steam's pages for data."""
    global allNames

    steam_page = _format_steam_url(steamName, gameName)

    page = urllib2.urlopen(steam_page)   
    print page.geturl()
    
    soup = bs(page.read(), 'html.parser')

    name = soup.find('a', attrs={'class':'whiteLink'})
    name = str(name.contents[0])
    
    achievements = soup.find('div', attrs={'id':'personalAchieve'})
    achievements = str(achievements).split('<br/><br/><br/>')
    achievements = achievements[0]
    achievements = bs(achievements, 'html.parser')
    achievements = achievements.find_all('h3')

    allNames[name] = {}
    allNames[name][gameName] = []

    for ach in achievements:
        allNames[name][gameName].append(str(ach.contents[0]).strip())
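A minimal usage sketch for scrape_links, assuming the module also defines the allNames dict, the _format_steam_url helper, and the usual urllib2/BeautifulSoup imports (none of which are shown above); the profile and game names are placeholders.

# Hypothetical driver: allNames and _format_steam_url are assumed to exist
# at module level, and both arguments are placeholders.
allNames = {}

scrape_links('example_profile', 'ExampleGame')
for player, games in allNames.items():
    for game, achievements in games.items():
        print('%s / %s: %d achievements' % (player, game, len(achievements)))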
Example #2
def get_chapters_list(url,opener=opener):
    #Find chapters urls and publish time for the chapter by going to the navigate page.
    #Returns tuple (chapter url, time)
    url_full = show_full_contents(url)
    chapters_list = []
    navigate = ''
    
    try:
        req = urllib2.Request(url_full)
        page = bs(opener.open(req))
        for link in page.find_all('a'):
            if 'Chapter Index' in link.text and len(link.get('href')) > 1:
                navigate = 'http://archiveofourown.org' + link.get('href')

        if navigate != '':
            req2 = urllib2.Request(navigate)
            page2 = bs(opener.open(req2))
            links = re.findall('<li><a href="(.*?)</span></li>', str(page2))
            for i in links:
                chapter_url = 'http://archiveofourown.org' + i.split('"')[0]
                chapter_index = re.findall('[0-9]+\.', i) [0].replace('.', '')
                chapter_time = re.findall('<span class="datetime">\((.*?)\)', i)[0]
                chapters_list.append((chapter_url, chapter_index, chapter_time))
    except:
        pass
    return chapters_list
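A hedged usage sketch for get_chapters_list; it assumes the module-level opener and show_full_contents helpers used above are available, and the work URL is a placeholder.

# Hypothetical usage: list the chapters of one AO3 work (placeholder URL).
work_url = 'http://archiveofourown.org/works/123456'
for chapter_url, chapter_index, chapter_time in get_chapters_list(work_url):
    print('%s  %s  %s' % (chapter_index, chapter_time, chapter_url))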
 def definition(self,SearchWord):
     page=urllib2.urlopen("http://dictionary.reference.com/browse/%s"%(SearchWord.strip()))
     html=page.read()
     page=bs(html)       
     definition=page.find_all("div",attrs={"class":"dndata"})        
     defined=str(bs(str(definition)).get_text()).replace("[","").replace("]","").replace(":","")      
     return defined
Example #4
    def test_nonstandard_youtube_stripped(self):
        """
        Test whether an embedded YouTube video that does not follow
        the standard options gets stripped as well.
        """
        from bs4 import BeautifulSoup as bs
        from website.utils.filters import filter_iframes
        self.maxDiff = None
        field_value_pre = """<div id="test">
<p>Wit amet interdum dolor felis ut ante. Morbi a facilisis ante, in lobortis urna. Etiam ut nunc quis libero interdum aliquam eu at magna. Nunc vehicula risus eleifend molestie vulputate. Mauris diam odio, congue eget lorem id, finibus imperdiet sem.</p>"""
        field_value_post = """<p>Vestibulum eget posuere metus, vel finibus leo. Suspendisse congue orci magna, in vestibulum lacus pulvinar a. Donec egestas, felis id feugiat tempus, orci velit ullamcorper risus, et ultricies augue arcu ullamcorper dolor. Mauris eget sollicitudin purus. Aenean a cursus risus, sit amet mattis erat. Curabitur vel venenatis sem. Cras non gravida tellus, eu egestas tellus. Morbi at lorem a turpis blandit vulputate vitae a est.</p></div>"""

        # First case: embed from a different URL
        field_value_different_src = field_value_pre + \
            """<iframe width="560" height="315" src="//www.youtub.com/embed/-Y6ImGzTF70"></iframe>""" + \
            field_value_post
        self.assertEqual(str(bs(field_value_pre + field_value_post, 'html.parser')),
                         filter_iframes(field_value_different_src))

        # Second case: embed using an attribute other than
        # the ones YouTube sets by default (width, height, src,
        # frameborders, allowfullscreen)
        field_value_different_attributes = field_value_pre + \
            """<iframe id="nonstandard" width="560" height="315" src="//www.youtube.com/embed/-Y6ImGzTF70"></iframe>""" + \
            field_value_post
        self.assertEqual(str(bs(field_value_pre + field_value_post, 'html.parser')),
                         filter_iframes(field_value_different_attributes))

        # Third case: iframe contains information.
        field_value_iframe_has_content = field_value_pre + \
            """<iframe width="560" height="315" src="//www.youtube.com/embed/-Y6ImGzTF70">Test Information</iframe>""" + \
            field_value_post
        self.assertEqual(str(bs(field_value_pre + field_value_post, 'html.parser')),
                         filter_iframes(field_value_iframe_has_content))
def linkExtractor(urltoopen, tag1, attrib1, attrib1value, tag2 ,attrib2, attrib2value, finalAttrib):
	url = urllib2.urlopen(urltoopen).read()
	soup = bs(url)
	lastPageTag = soup.find("span",{"class":"pagnDisabled"})
	lastPage = int(lastPageTag.getText())

	apple = []

	#inside the loop
	for j in range(0,lastPage):

		result = soup.findAll(tag1,{attrib1:attrib1value})

		for i in range(0,len(result)):
			resultDetails = result[i].find(tag2,{attrib2:attrib2value})
			link = resultDetails[finalAttrib]
			apple.append(link)

		nextLinkATag = soup.find("span",{"class":"pagnRA"})
		nextLink =  "http://www.amazon.com"+nextLinkATag.a['href']
		url = urllib2.urlopen(nextLink).read()
		soup = bs(url)

	#the loop ends

	return apple
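Because linkExtractor takes its selectors as loose positional parameters, a call site makes the intent clearer. The sketch below is hypothetical: the URL and the tag/attribute values are guesses at typical result-page markup, not values taken from the snippet.

# Hypothetical call: collect the finalAttrib value ('href') of every tag2
# element nested inside each tag1 result container, across all result pages.
productLinks = linkExtractor(
    'http://www.amazon.com/s?k=headphones',  # placeholder search URL
    'div', 'class', 's-result-item',         # tag1 / attrib1 / attrib1value (assumed)
    'a', 'class', 'a-link-normal',           # tag2 / attrib2 / attrib2value (assumed)
    'href')                                  # finalAttrib
print(productLinks[:5])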
Example #6
def GetMyviUrl(url):
    headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
            'referer': url,
    }

    with requests.session() as s:
        # logging.basicConfig(level=logging.DEBUG) 
       # import time
        #_startTime = time.time()
        r = s.get(url)
        s.headers.update(headers)
        soup = bs(r.text)
        #print "Elapsed time: {:.3f} sec".format(time.time() - _startTime)
        url = soup.find('div', {'class':'player-area'}).find('iframe')['src']
        r = s.get(url, allow_redirects=True)
        UniversalUserID = r.cookies['UniversalUserID']
        js = bs(r.text).find('body').find('script', {'type': 'text/javascript'}).encode('utf-8')
        js = '{%s}' % (js.decode('utf-8').split('{', 1)[1].rsplit('}', 1)[0])
        js = re.sub(ur'([\s*{\s*,])([a-z]\w*):', ur'\1"\2":', js)
        js = js.replace("'", '"')
        json_data = json.loads(js)
        api = 'http://myvi.ru' + json_data['dataUrl']
        r = s.get(api)
        data = json.loads(r.text)
        url = data['sprutoData']['playlist'][0]['video'][0]['url']
        r = s.get(url, allow_redirects=False)
        return r.headers['Location'] + '|Cookie=' + urllib.quote_plus(urllib.urlencode({'UniversalUserID' : UniversalUserID }))
    return None
Example #7
def Main(main_url):
    addDir('Поиск', site_url, mode="SEARCH")
    if main_url == None :
        main_url = site_url
        html = Get(main_url)
        soup = bs(html)
        content = soup.find('ul', {'class': 'main_menu'}).find_all('a', attrs={'class': 'main_menu_item_lnk'})
        for num in content:
            if 'news' not in num['href'] and 'deti' not in num['href'] :
                if 'sport' in num['href'] :
                    addDir(num.text, addUrlParams(site_url + num['href']), mode="CONTENT")
                else :
                    addDir(num.text, site_url + num['href'])
    else :
        print main_url
        cat = main_url.partition(site_url + '/')[-1].rpartition('?')[0]
        soup = bs(Get(main_url))
        if 'films' in main_url:
            content = soup.find('ul', attrs={'class': 'main_menu'}).find_all('li', attrs={'class': 'mseries_cont'})[1].find('div', {'class': 'submenu01_cont'}).find_all('a')
        elif 'series' in main_url:
            content = soup.find('ul', attrs={'class': 'main_menu'}).find_all('li', attrs={'class': 'mseries_cont'})[0].find('div', {'class': 'submenu01_cont'}).find_all('a')
        elif (cat in main_url) and (cat in categories):
            content = soup.find('ul', attrs={'class': 'main_menu'}).find('li', attrs={'class': 'm' + cat + '_cont'}).find('div', {'class': 'submenu01_cont'}).find_all('a')
        for num in content:
            label = num.text
            if label == '':
                label = 'ТНТ'
            addDir(label, addUrlParams(site_url + num['href']), mode="CONTENT")
def populateList():
    '''first, we get the whole list of pokemon, sorted by national dex number.
    there is also a regional dex number, which i will preserve later.
    returns a list of tuples in the form (name, url_suffix).
    '''
    path = URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
    page = wget(path)
    soup = bs(page.read(), 'html.parser')
    tables = soup.findAll('table')

    # - tables[1] is the list of kanto (kdex) pokemon.
    # - tables[2] is the list of johto (jdex) pokemon.
    # - tables[3] is the list of hoenn (hdex) pokemon.
    # - tables[4] is the list of sinnoh (sdex) pokemon.
    # - tables[5] is the list of unova (udex) pokemon.
    # - tables[6] is the list of kalos pokemon. kalos is special because the region is 
    #     split into 3 sub regions, central (cekdex), coastal (cokdex), and mountain (mokdex).
    # - tables[7] is the list of alola (adex) pokemon. it is not populated, as the region 
    #     is part of the gen VII game release (not released yet).

    # get a list of pokemon
    pokemon = []
    for table in tables[:7]:    # ignoring alola region for now
        entries = bs(table.__str__(), 'html.parser').findAll('tr')
        for entry in entries[1:]:   # entries[0] defines column headers.
            entry = bs(entry.__str__(), 'html.parser')
            info = entry.findAll('td')[3]
            poke = (info.a.contents[0], info.a['href'])
            if poke not in pokemon:     # there are duplicate entries. some pokemon have different "states".
                pokemon.append(poke)    # using a dictionary reorders, let's stay in order for debugging's sake.

    return pokemon
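A small hypothetical driver for populateList; it assumes the module-level URL constant and the wget helper (an urlopen-style fetcher) used inside the function are defined elsewhere.

# Hypothetical usage: print the first few (name, url_suffix) tuples.
pokemon = populateList()
print('scraped %d pokemon' % len(pokemon))
for name, suffix in pokemon[:5]:
    print('%s -> %s%s' % (name, URL, suffix))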
Example #9
def GetVKUrl(url):
    http = GetHTML(url)
    soup = bs(http)
    soup = bs(GetHTML(soup.find('div', {'class':'b-video_player'}).find('iframe')['src']))
    sdata1 = soup.find('div', style="position:absolute; top:50%; text-align:center; right:0pt; left:0pt; font-family:Tahoma; font-size:12px; color:#777;")
    video = ''
    if sdata1:
        return False
    for rec in soup.find_all('param', {'name':'flashvars'}):
        for s in rec['value'].split('&'):
            if s.split('=', 1)[0] == 'url240':
                url240 = s.split('=', 1)[1]
            if s.split('=', 1)[0] == 'url360':
                url360 = s.split('=', 1)[1]
            if s.split('=', 1)[0] == 'url480':
                url480 = s.split('=', 1)[1]
            if s.split('=', 1)[0] == 'url720':
                url720 = s.split('=', 1)[1]
            if s.split('=', 1)[0] == 'hd':
                hd = s.split('=', 1)[1]
        video = url240
        qual = __settings__.getSetting('qual')
        if int(hd) >= 3 and int(qual) == 3:
            video = url720
        elif int(hd) >= 2 and (int(qual) == 2 or int(qual) == 3):
            video = url480
        elif int(hd) >= 1 and (int(qual) == 1 or int(qual) == 2):
            video = url360
    return video
Example #10
def process_md(md_file, template_text):
    templ = bs(template_text)
    plain_doc = bs(markdown2.markdown_path(md_file))  # markdown2.main() is the CLI entry point; markdown_path() renders a file to HTML

    container = templ.select('#impress')[0]

    def new_step(i):
        new = bs('<div></div>')
        new.div['class'] = 'step'
        new.div['id'] = i
        return new.div

    i = 0
    current = new_step(i)
    for node in plain_doc.body.children:
        if not hasattr(node, 'name'):
            continue
        elif node.name == 'hr':
            i += 1
            container.append(current)
            current = new_step(i)
        else:
            current.append(node)
    container.append(current)
    return templ
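A hypothetical end-to-end use of process_md; the file names are placeholders, and it assumes the template contains an element with id="impress" as the function expects.

# Hypothetical usage: render a placeholder markdown file into an impress.js page.
with open('template.html') as f:          # placeholder template path
    template_text = f.read()

result = process_md('slides.md', template_text)   # placeholder markdown path

with open('slides_out.html', 'w') as f:
    f.write(str(result))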
def getContent(u, p):
    result = []
    driver = webdriver.Firefox()
    url = u
    driver.get(url)
    for i in xrange(1,p+1):
        element = WebDriverWait(driver, 1).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".js-layout-option.full"))
        )
        res =  driver.page_source.encode('utf-8')
        soup = bs(res, 'html.parser')
        n = len(soup.select('.middle-header'))
        time.sleep(0.5)
        driver.find_element_by_css_selector(".js-layout-option.full").click()
        res = driver.page_source
        soup = bs(res, 'html.parser')
        for t in soup.select('.review-container'):
            temp = re.sub('<img.*>','',t.text)
            s = "".join(temp.split()).strip()
            result.append(s)
        if i < p:
            nextpage = driver.find_element_by_css_selector(".pagination-button.next.js-next")
            nextpage.click()
        else:
            driver.quit()
    return result
Example #12
        def spoj():
            """
            Scrapes problems from spoj.com
            (Uses default template)

            """
            url = "http://spoj.com" # Start with the domain name
            self.problem = str.upper(self.problem)
            url = url+"/problems/"+self.problem+'/'
            
            print "Pinging up spoj...."
            
            self.page_as_string = Utilities.get_html(url)
            soup = bs(self.page_as_string)
            
            p_header = soup.find('h2',{'id':'problem-name'})
            p_container = soup.find('div',{'id':'problem-body'})
            self.problem_container = p_container
            self.problem_container_as_string = str(p_container)
            
            self.page_as_string = StaticScraper.setTemplate\
            (str(p_header),self.problem_container_as_string)

            
            self.page_as_soup = bs(self.page_as_string)
Example #13
def GetSRUrl(html):
    soup = bs(html, "html.parser")
    sr_url = 'http:' + soup.find('div', {'class':'b-video_player'}).find('iframe')['src']
    soup = bs(GetHTML(sr_url), "html.parser")
    source = soup.find('video').find('source')['src']
    
    return source
Example #14
def scrape_waymarks(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'}

    response = requests.get(url, headers= headers)
    soup = bs(response.text, "lxml")

    links = soup.select('.wmd_namebold > a')
    links = links[1::2]
    links = [link['href'] for link in links]

    for link in links:
        response = requests.get(link, headers= headers)
        soup = bs(response.text, "lxml")

        # Do some messy parsing and decoding to extract coordinates and other landmark details
        coords = soup.select('#wm_coordinates')[0].get_text().encode('ascii','ignore').replace('.','').split()

        latitude = float('.'.join([coords[1], coords[2]]))
        longitude = -(float('.'.join([coords[4], coords[5]])))

        title = soup.select('#wm_name')[0].get_text().split(' - ')[0].encode('ascii', 'replace').strip()
        artist = soup.select('#Table1')[0].get_text('|', strip=True).split('|')[5]
        # details = soup.select('#Table1')[0].get_text('|', strip=True).split('|')[7]
        details = soup.select('#wm_quickdesc')[0].get_text().split(': ')[1]
        image_url = soup.select('.wm_photo > a > img')[0]['src']

        print "{}|{}|{}|{}|{}|{}".format(latitude, longitude, title, artist, details, image_url)
def vid_info(url,cookie):
	opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
	data = opener.open(urllib2.Request(url)).read()
	soup = bs(data)
	vid_title = soup.findAll("h1",class_="single-pagetitle")[0].string
	# print vid_title
	vid_link = soup.findAll("div",class_="video-embed-container")[0].iframe
	rq = urllib2.Request(vid_link['src'])
	rq.add_header('Referer',url)
	if_src = bs(opener.open(rq).read()).prettify()
	m3u8_link = re.findall("\"hd\":\"(.*?)\"},",if_src)[0]
	rq2 = urllib2.Request(m3u8_link)
	rq2.add_header('Referer',vid_link["src"])
	new_data = opener.open(rq2).read()
	# print new_data
	ind_link = 'http://'+re.findall(r"http://(.*?).m3u8",new_data)[0]+".m3u8"
	# print ind_link
	rq3 = urllib2.Request(ind_link)
	rq3.add_header('Referer',m3u8_link)
	seg_data = opener.open(rq3).read()
	# print seg_data
	segs = re.findall("http://(.*?).ts",seg_data)
	for i in xrange(len(segs)):
		segs[i] = 'http://'+segs[i]+'.ts'

	# sys.exit(1)
	return [vid_title,segs]
def scrape_zips(zips):
    all_pages = []
    pages_completed = 0
    start_time = time.time()
    path = 'data'
    try:
        os.chdir(path)
    except (WindowsError, IOError):  # "except A or B" would only ever catch A
        pass
    #connect to sql db
    conn = sqlite3.connect('housing_data.db')
    c = conn.cursor()
    #start parsin some zips
    #### figure out how many pages for each zip code and make a list of all the pages ###
    for z in zips:
        base_page = 'http://www.trulia.com/for_rent/'+str(z)+'_zip/'
        soup = bs(requests.get(base_page).text,'html5lib')
        #create list of pages to scrape
        pages = [base_page]
        #create soup of area to look for number of pages
        if len(soup.find_all(class_='srpPagination_list')) == 0:
            pass
        else:
            pages_area = soup.find_all(class_='srpPagination_list')
            try:
                number_of_pages= int(bs(str(pages_area)).find_all('a')[-1].text)
                for i in range(2,number_of_pages+1):
                    pages.append(base_page + str(i)+'_p')
            except IndexError:
                number_of_pages = 1
            all_pages= all_pages + pages
            print('zip: ' + str(z) + ' added to job.  ~Listings: '+ str(number_of_pages*30))
    
    ##### go through each page and make it into some soup ####
    print('total pages to scrape: ' + str(len(all_pages)))
    time.sleep(2)
    for page in all_pages:
        soup = bs(requests.get(page).text,'html5lib')
        mylist = soup.find_all(class_='property-data-elem')
        ##### add listings for each page to the database ###
        for listing in mylist:
            home = Property(listing)
            if home.type == 'single':
                c.execute("INSERT INTO rental_data\
                (Longitude, Latitude, Address, Zip, Price, RoomType, Bathrooms, Sqft, Date_Scraped)\
                VALUES(?,?,?,?,?,?,?,?,?)",home.output())
            else:
                for apt in home.units:
                    c.executemany("INSERT INTO rental_data\
                    (Longitude, Latitude, Address, Zip, Price,RoomType, Bathrooms, Sqft, Date_Scraped)\
                    VALUES(?,?,?,?,?,?,?,?,?)",home.output())
          
        print("--- %s seconds ---" % (time.time() - start_time))
        pages_completed +=1
        pages_remaining = len(all_pages)-pages_completed
        print('number of pages remaining: ' + str(pages_remaining)\
        + ' . ~Minutes to completion: ' + str(pages_remaining*2/60))
    conn.commit()   
    os.chdir(os.path.normpath(os.getcwd() + os.sep + os.pardir))
    return ("--- %s seconds ---" % (time.time() - start_time))
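A hypothetical setup for scrape_zips: it creates the data/ working directory and the rental_data table that the INSERT statements above expect (column names come from those statements, the types are guesses), and it assumes the Property class used inside the function is defined elsewhere.

# Hypothetical setup and call; the zip codes are placeholders.
import os
import sqlite3

if not os.path.isdir('data'):
    os.makedirs('data')

conn = sqlite3.connect(os.path.join('data', 'housing_data.db'))
conn.execute("""CREATE TABLE IF NOT EXISTS rental_data (
    Longitude REAL, Latitude REAL, Address TEXT, Zip TEXT, Price TEXT,
    RoomType TEXT, Bathrooms REAL, Sqft REAL, Date_Scraped TEXT)""")
conn.commit()
conn.close()

print(scrape_zips(['94103', '94110']))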
Example #17
def tam_data():
	s = requests.Session()
	d = s.get('http://www.tam.com.br')
	p = bs(d.text, 'html.parser')

	cookies = {
	}
	
	form = {
		'WDS_CORPORATE_SALES': 'FALSE',
		'SITE': 'JJBKJJBK',
		'LANGUAGE': 'BR',
		'WDS_MARKET': 'BR',
		'FROM_PAGE': 'HOME_SEARCH',
		'B_DATE_1': '201603110000',
		'B_DATE_2': '201603140000',
		'B_LOCATION_1': 'POA',
		'E_LOCATION_1': 'CGH',
		'WDS_FORCE_SITE_UPDATE': 'TRUE',
		'FORCE_OVERRIDE': 'TRUE',
		'TRIP_TYPE': 'R',
		'search_from': 'Porto+Alegre+-+Salgado+Filho+Internacional+(POA)',
		'search_to': 'Sao+Paulo+-+Congonhas+(CGH)',
		'adults': '2',
		'children': '0',
		'infants': '0',
		'CORPORATE_CODE_INPUT': '',
		'SEACH_COOKIE': '"{"bounds":[null,null,null,null,null,null,null,null,null,null,{"bLocation":"POA","eLocation":"CGH","bDate":"201603110000"},{"bDate":"201603140000"}],"roundtripCommon":{"tripType":"R","adults":"2","children":"0","infants":"0","mcabin":null}}"'
	}

	d = s.post('http://book.tam.com.br/TAM/dyn/air/booking/upslDispatcher;jsessionid=dh9csky6V5pDct8lcQcV_TZaedKzD6Z2LOj4Gg8GH5qvYoRIRXp_!1618028954!549751287', data=form)
	p = bs(d.text, 'html.parser')
	return p
Example #18
def statigr(self):
	'''Simple extraction of a Statigram page into JSON-like data.
	The URL should begin with http://statigr.am/ for this to work.
	json data:
	* user:
		- url
		- stats: media, follower, following
	* data (list of every image, limited to the first page load):
		- type
		- url
		- stats: like, comment, favorite pict
	=> Should really pull this straight from the Instagram API :p
	'''
	user_stats_list = zip(bs(self._content).findAll("span",{"class":"chiffre"}), bs(self._content).findAll("span",{"class":"legende"}))
	img_details = bs(self._content).findAll("div", {"id":re.compile('^detailPhoto.*?$')})
	# split across several lines because I'm told the one-liner is unreadable
	self._values['name'] = "instagram"
	self._values['stats'] = dict((y.get_text(), int(x.get_text())) for x, y in user_stats_list)
	self._values["details"] =[{"img":{
								'type': 'img',
								'url':n.find('img')['src'],
								'stats':dict((img.get('class')[0], txt2int(img.get_text())) for img in n.findAll('span'))
								}
								}
								for n in img_details
								]
		
	return self 		
def set_reserve_param(tr):
    param = dict(reserve_param)

    train_info_list = bs(str(tr), 'html.parser').select("td.trnNo > input")
    train_info_dict = { bs(str(info), 'html.parser').find()['name'].split('[')[0]: bs(str(info),'html.parser').find()['value'] for info in train_info_list }

    param['dptDt1'] = train_info_dict['dptDt']
    param['runDt1'] = train_info_dict['runDt']
    param['arvStnConsOrdr1'] = train_info_dict['arvStnConsOrdr']
    param['arvStnRunOrdr1'] = train_info_dict['arvStnRunOrdr']
    param['arvRsStnCd1'] = train_info_dict['arvRsStnCd']
    param['dirSeatAttCd1'] = '000'
    param['dptRsStnCd1'] = train_info_dict['dptRsStnCd']
    param['dptStnConsOrdr1'] = train_info_dict['dptStnConsOrdr']
    param['dptTm1'] = train_info_dict['dptTm']
    param['jrnySqno1'] = train_info_dict['jrnySqno']
    param['locSeatAttCd1'] = "000"
    param['reqTime'] = int(time.time()*1000) # current time
    param['rqSeatAttCd1'] = train_info_dict['seatAttCd']
    param['stlbTrnClsfCd1'] = train_info_dict['stlbTrnClsfCd']
    param['trnGpCd1'] = train_info_dict['trnGpCd']
    param['trnNo1'] = train_info_dict['trnNo']
    param['trnOrdrNo1'] = train_info_dict['trnOrdrNo'] # which row on the screen this train appeared in

    return param
Example #20
    def search_author_publication(self, author_id, show=True, verbose=False):
        #{{{ search author's publications using authid
        #TODO: Verbose mode

        '''
            Search author's publication by author id
            returns a list of dictionaries
        '''
        url = self._search_url_base + 'apikey={}&query=au-id({})&start=0&httpAccept=application/xml'.format(self.apikey, author_id)
        soup = bs(urlopen(url).read(), 'lxml')
        total = float(soup.find('opensearch:totalresults').text)
        print 'A total of', int(total), 'records for author', author_id
        starts = np.array([i*25 for i in range(int(np.ceil(total/25.)))])

        publication_list = []
        for start in starts:
            search_url = self._search_url_base + 'apikey={}&start={}&query=au-id({})&httpAccept=application/xml'.format(self.apikey, start, author_id)
            results = bs(urlopen(search_url).read(), 'lxml')
            entries = results.find_all('entry')
            for entry in entries:
                publication_list.append(_parse_xml(entry))

        if show:
            #pd.set_printoptions('display.expand_frame_repr', False)
            #print df['title'].to_string(max_rows=10, justify='left')
            df = pd.DataFrame(publication_list)
            titles = np.array(df['title'])
            for i in range(titles.size):
                t = trunc(titles[i])
                print '%d)' %i, t
        # }}}
        return publication_list
Example #21
def GetVKUrl(html):
    soup = bs(html, "html.parser")
    vk_url = 'http:' + soup.find('div', {'class':'b-video_player'}).find('iframe')['src']
    soup = bs(GetHTML(vk_url), "html.parser")
    video = ''
    js = soup.find_all('script', {'type': 'text/javascript'})[-1].encode('utf-8')
    p = re.compile('var vars = (.*?);')
    js = p.findall(js)
    json_data = json.loads(js[0])
    if 'url240' in json_data:
        url240 = json_data['url240']
    if 'url360' in json_data:
        url360 = json_data['url360']
    if 'url480' in json_data:
        url480 = json_data['url480']
    if 'url720' in json_data:
        url720 = json_data['url720']
    if 'hd' in json_data:
        hd = json_data['hd']
    video = url240
    qual = __settings__.getSetting('qual')
    if int(hd) >= 3 and int(qual) == 3:
        video = url720
    elif int(hd) >= 2 and (int(qual) == 2 or int(qual) == 3):
        video = url480
    elif int(hd) >= 1 and (int(qual) == 1 or int(qual) == 2):
        video = url360
    return video
Example #22
def GetMyviUrl(html, url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0", "referer": url}

    with requests.session() as s:
        # logging.basicConfig(level=logging.DEBUG)
        # import time
        # _startTime = time.time()
        # r = s.get(url)
        s.headers.update(headers)
        soup = bs(html, "html.parser")
        # print "Elapsed time: {:.3f} sec".format(time.time() - _startTime)
        url = soup.find("div", {"class": "player-area"}).find("iframe")["src"]
        url = "http:" + url
        r = s.get(url, allow_redirects=True)
        UniversalUserID = r.cookies["UniversalUserID"]
        js = bs(r.text, "html.parser").find("body").find("script", {"type": "text/javascript"}).encode("utf-8")
        js = "{%s}" % (js.decode("utf-8").split("{", 1)[1].rsplit("}", 1)[0])
        js = re.sub(r"([\s*{\s*,])([a-z]\w*):", r'\1"\2":', js)
        js = js.replace("'", '"')
        json_data = json.loads(js)
        api = "http://myvi.ru" + json_data["dataUrl"]
        r = s.get(api)
        data = json.loads(r.text)
        url = data["sprutoData"]["playlist"][0]["video"][0]["url"]
        r = s.get(url, allow_redirects=False)
        return (
            r.headers["Location"]
            + "|Cookie="
            + urllib.quote_plus(urllib.urlencode({"UniversalUserID": UniversalUserID}))
        )
    return None
Example #23
def clean_html(html_url):
	html_content = get_html_content(html_url)
	if html_content is None:
		return None
	soup = bs(html_content, 'lxml').find('body')
	if soup is None:
		p1 = html_content.find('<body')
		p2 = html_content.find('</body>')
		if p1 < 0 or p2 < 2:
			return None
		soup = bs(html_content[p1: p2+7], 'lxml')
	if soup is None:
		return None
	to_extract = soup.findAll('script')
	for it in to_extract:
		it.extract()
	res = soup.get_text()\
		.replace('\n', '')\
		.replace('\t', '')\
		.replace('\r', '')\
		.replace('百度', '')\
		.strip()
	res = res[160:]
	res = res[:-200]
	return res
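A hypothetical call to clean_html; it assumes the get_html_content helper used above is available, and the article URL is a placeholder.

# Hypothetical usage: fetch a page, strip scripts, and preview the text.
text = clean_html('http://example.com/some-article.html')
if text:
    print(text[:200])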
 def YeniSurumTespit(self):
     bit = self.Bilgi_Al()
     if bit == "i686":
         Kodlar = urllib.urlopen('https://www.mozilla.org/tr/firefox/new/').read()
         KARISTIR = bs(Kodlar)
         Bul1 = KARISTIR.find('li', {"class" : "os_linux"})
         KARISTIR = bs(str(Bul1))
         Bul = KARISTIR.find('a', {"class" : "download-link"})
         SATIR = str(Bul).split('\n')[0]
     
         for i in SATIR.split(' '):
             if 'href=' in i:
                 Adres = i        
         Adres = Adres.replace('href=', '', 1)
         Adres = Adres.replace('>', '', -1)
         Adres = Adres[1: -1].replace('amp;', '')
         return Adres
     elif bit == "x86_64":
         Kodlar = urllib.urlopen('https://www.mozilla.org/tr/firefox/new/').read()
         KARISTIR = bs(Kodlar)
         Bul1 = KARISTIR.find('li', {"class" : "os_linux64"})
         KARISTIR = bs(str(Bul1))
         Bul = KARISTIR.find('a', {"class" : "download-link"})
         SATIR = str(Bul).split('\n')[0]
         
         for i in SATIR.split(' '):
             if 'href=' in i:
                 Adres = i
         
         Adres = Adres.replace('href=', '', 1)
         Adres = Adres.replace('>', '', -1)
         Adres = Adres[1: -1].replace('amp;', '')
         return Adres
    def scrape_phone_numbers(self):
	"""
	Scrape all phone numbers from the currently open page and save them to self.numbers.
	"""
	all_numbers = {}
	try:
	    soup = bs(self.main_driver.page_source)
	except selenium.common.exceptions.UnexpectedAlertPresentException:
	    try:
		alert = self.main_driver.switch_to_alert()
		alert.accept()
		soup = bs(self.main_driver.page_source)
	    except Exception as e:
		logger.error("Exception (%s) triggered when extracting source from (%s)" % (e, self.main_driver.current_url) )
		return False
	except Exception as e:
	    logger.error("Exception (%s) triggered when extracting source from (%s)" % (e, self.main_driver.current_url) )
	    return False
	extracted_strings = soup.find_all(lambda x: x.name != 'script' and x.name != 'style' and x.name != 'noscript' and x.name != 'iframe', text=lambda x: True)
	for extracted_string in extracted_strings:
	    for extracted_number in phone_re.findall(extracted_string.text):
		extracted_number = '-'.join(extracted_number).encode('ascii', 'ignore')
		extracted_number = re.sub('-{2,}|\A-|-\Z', '', extracted_number )
		if len(extracted_number) >= 12:
		    all_numbers[extracted_number] = extracted_number
	if len(all_numbers):
	    logger.info("Found %s phone numbers at (%s):\n%s" % (len(all_numbers), self.main_driver.current_url, all_numbers.values()) )
	    return all_numbers.values()
	else:
	    logger.debug("Found %s phone numbers at (%s)" % (len(all_numbers), self.main_driver.current_url) )
	    return False
Example #26
def Percussion (testkey):
    stafflist = map(lambda x :x.keys()[0],testkey)
    stafflist = list(set(stafflist))
    timesigN = 4
    Division = 480
    newM = bs('')
    for stfno in range(0,len(stafflist)):
        a = filter(lambda a : a if a.keys()[0] in stafflist[stfno] else None ,testkey)
        no = stafflist[stfno].split('ff')[-1]
        newM.append(newM.new_tag('Staff',id = no))
        mes=bs('');i=1
        for x in a:
            bag =bs('')
            mes.append(mes.new_tag('Measure',number = str(i)))
            s = x[stafflist[stfno]]
            tkno =0                               # track number = tkno
            tkvalue = s.pop('track0')
            bag.append(PercussionUnit(tkno,tkvalue))
            if len(s) > 0:
                tick = bag.new_tag('tick')       # construct the tick (Division) tag
                tick.string= str(Division*timesigN*(i-1))               # Division 480 * timesigN * measure number
                bag.append(tick)
            for track in s.keys():
                tkno = int(track.split('k')[-1])
                tkvalue = x[stafflist[stfno]][track]
                bag.append(PercussionUnit(tkno,tkvalue))
            mes.select('Measure')[i-1].append(bag)
            i += 1
        newM.select('Staff')[stfno].append(mes)
    return newM
Example #27
def Tchord(tkno,x):
    ccc = bs("")
    if (x[0][0] == str(0) and x[1]=='whole'):
        TagR = bs("")
        TagR.append(TagR.new_tag("Rest"))
        if tkno > 0 :
            track = TagR.new_tag("track")
            track.string= str(tkno)
            TagR.Rest.append(track)
        durT = TagR.new_tag("durationType")
        durT.string = "measure"
        TagR.Rest.append(durT)
        ccc.append(TagR)
    elif x[0][0]== str(0):
        Rtag = bs("")
        Trest = Rtag.append(Rtag.new_tag("Rest"))
        if tkno > 0 :
            track = Rtag.new_tag("track")
            track.string= str(tkno)
            Rtag.Rest.append(track)
        durT = Rtag.new_tag("durationType")
        durT.string = x[1]
        Rtag.Rest.append(durT)
        ccc.append(Rtag)
    else:
        Ctag = bs("")
        Tcho = Ctag.append(Ctag.new_tag("Chord"))
        if tkno > 0 :
            track = Ctag.new_tag("track")
            track.string= str(tkno)
            Ctag.Chord.append(track)
            ccc.append(Ctag)
        if x[2] >0:
            Tdot = Ctag.new_tag("dots")
            Tdot.string = str(x[2])
            Ctag.Chord.append(Tdot)
        durT = Ctag.new_tag("durationType")
        durT.string = x[1]
        Ctag.Chord.append(durT)
        ccc.append(Ctag)
        for i in range(0,len(x[0])):
            Tnote = Ctag.Chord.append(Ctag.new_tag("Note"))
            if tkno > 0 :
                track = Ctag.new_tag("track")
                track.string= str(tkno)
                Ctag.Chord.select('Note')[i].append(track)
            Tpitch = Ctag.new_tag("pitch")
            Tpitch.string= str(x[0][i])
            Ctag.select('Note')[i].append(Tpitch)
            Ttpc = Ctag.new_tag("tpc")
            Ttpc.string="22"
            Ctag.select('Note')[i].append(Ttpc)
            Tvelo = Ctag.new_tag("velocity")
            Tvelo.string="100"
            Ctag.select('Note')[i].append(Tvelo)
            TvT = Ctag.new_tag("veloType")
            TvT.string="user"
            Ctag.select('Note')[i].append(TvT)
            ccc.append(Ctag)
    return ccc
Example #28
def GetVKUrl(html):
    soup = bs(html, "html.parser")
    vk_url = "http:" + soup.find("div", {"class": "b-video_player"}).find("iframe")["src"]
    soup = bs(GetHTML(vk_url), "html.parser")
    video = ""
    js = soup.find_all("script", {"type": "text/javascript"})[-1].encode("utf-8")
    p = re.compile("var vars = (.*?);")
    js = p.findall(js)
    json_data = json.loads(js[0])
    if "url240" in json_data:
        url240 = json_data["url240"]
    if "url360" in json_data:
        url360 = json_data["url360"]
    if "url480" in json_data:
        url480 = json_data["url480"]
    if "url720" in json_data:
        url720 = json_data["url720"]
    if "hd" in json_data:
        hd = json_data["hd"]
    video = url240
    qual = __settings__.getSetting("qual")
    if int(hd) >= 3 and int(qual) == 3:
        video = url720
    elif int(hd) >= 2 and (int(qual) == 2 or int(qual) == 3):
        video = url480
    elif int(hd) >= 1 and (int(qual) == 1 or int(qual) == 2):
        video = url360
    return video
def scrapeTweets(matchPattern,fromTime,untilTime): #generator of tweets
  #definitions for json structure returned from twitter search api:
  #initialisation of return values:
  num_tweets_scraped = 0
  finished = False

  while not finished:
    #print "Query times: " + str(fromTime) + " to " + str(untilTime)
    print >> sys.stderr, "Query times: " + str(fromTime) + " to " + str(untilTime)
    query = ts.buildQuery(matchPattern,fromTime,untilTime)
    response = urllib2.urlopen(query)
    data = json.load(response)
    soup = bs(str(data))

    tweet_soups = ts.get_tweet_soups(soup)
    num_tweets_scraped = num_tweets_scraped + len(tweet_soups)

    for tweet_soup in tweet_soups:
      if not re.search('Related Searches:',str(bs(str(tweet_soup)))):
        yield ts.buildTweet(tweet_soup)

    if len(tweet_soups):
      untilTime = ts.getTime(tweet_soups[-1]) #get time of last tweet
    else:
      print >> sys.stderr, 'Finished getting all tweets, total: ' + str(num_tweets_scraped)
      finished = True
Example #30
def mitocheck(gene, screens=('Mitocheck primary screen', 
                             'Mitocheck validation screen'),
             limit=10, substitute='(),'):
    """Search Mitocheck database for given gene name (or Ensembl id)
    and return DataFrame containing download links.
    """
    bsurl = lambda x: bs(urllib2.urlopen(x).read())
    
    request = 'http://mitocheck.org/cgi-bin/mtc?query=%s' % gene
    x = bsurl(request)
    y = x.find(title='List all movies/images associated with this gene')
    if y is None:
        print 'zero or multiple entries for', gene
        return None
    z = bsurl('http://mitocheck.org' + y['href'])
    df = pd.read_html(str(z.find('table')), header=0)[2].dropna(how='all')
    df = df[df['Source'].isin(screens)]
    df = df.groupby('Source').head(limit)

    for ix, movie_id in df['Movie/Image ID'].iteritems():
        request = 'http://mitocheck.org/cgi-bin/mtc?action=show_movie;query=%s' % movie_id 
        x = bs(urllib2.urlopen(request).read())
        df.loc[ix, 'link'] = x.find_all('a', text=u'Download this movie')[0]['href']
        movie_id = int(movie_id)
        tmp = (df.loc[ix, 'link'].split('/')[-1]
                             .replace('.avi', '.%d.avi' % movie_id))
        df.loc[ix, 'avi'] = ''.join([c if c not in substitute else '_' for c in tmp])

        
    return df.drop(df.columns[0], axis=1)
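A hypothetical call to mitocheck; the gene symbol is only a placeholder, and pandas, urllib2 and BeautifulSoup are assumed to be imported as in the function above.

# Hypothetical usage: show the movie download links found for one gene.
df = mitocheck('SOME_GENE')   # placeholder gene name
if df is not None:
    print(df.head())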
Example #31
def scrape():
    """scrapes everything above"""

    all_data = []

    #MARS NEWS

    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    response = req.get(url)
    soup = bs(response.text, 'html.parser')
    results = soup.find_all('div', class_="image_and_description_container")
    counter = 0
    for result in results:
        if counter == 0:
            news_p = result.find('div',
                                 class_="rollover_description_inner").text
            counter += 1
    all_data.append({"news_p": news_p})

    #JPL IMAGES

    executable_path = {
        "executable_path":
        'C:\\Users\\dan.brueckman\\Desktop\\chromedriver.exe'
    }
    jpl_link_main = 'https://www.jpl.nasa.gov'
    browser = Browser('chrome', **executable_path, headless=True)
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_jpl)
    browser.click_link_by_id('full_image')
    html = browser.html
    soup = bs(html, 'html.parser')
    jpl_results = soup.find_all('a', class_="button fancybox")
    counter = 0
    for result in jpl_results:
        if counter == 0:
            featured_image_url = jpl_link_main + result['data-fancybox-href']
            counter += 1
    all_data.append({"featured_image_url": featured_image_url})

    #MARS WEATHER

    weather_url = 'https://twitter.com/marswxreport?lang=en'
    response = req.get(weather_url)
    soup = bs(response.text, 'html.parser')
    weather_results = soup.find_all('p', class_="TweetTextSize")
    counter = 0
    for result in weather_results:
        if counter == 0:
            mars_weather = result.text
            counter += 1
    all_data.append({"weather": mars_weather})

    #MARS FACTS

    facts_url = 'https://space-facts.com/mars/'
    tables = pd.read_html(facts_url)
    facts_df = tables[0]
    facts_df = facts_df.rename(columns={0: "Profile", 1: "Attributes"})  # rename() returns a new frame
    html_table = facts_df.to_html()
    all_data.append({"html_table": html_table})

    #MARS HEMISPHERES

    astro_link = 'https://astropedia.astrogeology.usgs.gov'
    hem_links = [
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced',
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced',
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced',
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced'
    ]
    browser = Browser('chrome', **executable_path, headless=True)
    hemisphere_image_urls = []
    for link in hem_links:
        browser.visit(link)
        img = browser.find_link_by_partial_href('.tif/full.jpg')
        img_url = img['href']
        response = req.get(link)
        soup = bs(response.text, 'html.parser')
        result = soup.find('h2', class_='title')
        img_title = result.text
        hemisphere_image_urls.append({"title": img_title, "img_url": img_url})
    all_data.append({"hemisphere_images": hemisphere_image_urls})
    return all_data
Example #32
import json

# 2. Extract the HTML content from the URL -----------------------------------------------------------
url_busca_uf = 'http://www.buscacep.correios.com.br/sistemas/buscacep/resultadoBuscaFaixaCEP.cfm'
ufs = [
    'AC', 'AL', 'AM', 'AP', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA', 'MG', 'MS',
    'MT', 'PA', 'PB', 'PE', 'PI', 'PR', 'RJ', 'RN', 'RO', 'RR', 'RS', 'SC',
    'SE', 'SP', 'TO'
]

for uf in ufs:
    payload = {'UF': uf}
    pagina = requests.post(url_busca_uf, payload)

    # 3. Parse the HTML content using the BeautifulSoup library ---------------------------------
    soup = bs(pagina.text, 'html.parser')

    # 4. Structure the content into a DataFrame using the pandas library -------------------------
    table_estados = soup.find_all(name='table')[0]
    df_estados = pd.read_html(str(table_estados))[0]
    df_estados_salvar = df_estados[['UF', 'Faixa de CEP']]

    table_localidades = soup.find_all(name='table')[1]
    df_localidades = pd.read_html(str(table_localidades))[0]
    df_localidades_salvar = df_localidades[[
        'Localidade', 'Faixa de CEP', 'Situação', 'Tipo de Faixa'
    ]]

    # 5. Transform the data into a custom data dictionary ----------------------------------------
    dict_estados_salvar = {}
    dict_estados_salvar['Estados'] = df_estados_salvar.to_dict('records')
Example #33
def scrape(html):
    soup = bs(html, 'html.parser')
    items = soup.select('.topicsListItem')

    return items
Example #34
import requests
from bs4 import BeautifulSoup as bs

response = requests.get('https://www.hltv.org/matches')

soup = bs(response.text, 'lxml')


class MatchParser():
    days_matches = []

    def get_matches_in_dicts(self):
        for match_day in soup.findAll('div', {'class': 'match-day'}):
            for match in match_day.findAll('div', {'class': 'match'}):
                selected_match = []
                for child in match.recursiveChildGenerator():
                    if child.name == 'td':
                        selected_match.append(child.text.strip())
                self.days_matches.append({
                    match_day.find('span', {
                        'class': 'standard-headline'
                    }).text:
                    selected_match
                })

    def return_matches_in_dicts(self):
        return (self.days_matches)

    def print_matches(self):
        for i in self.days_matches:
            for j, k in i.items():
                print(j, k)
Example #35
def scrape():
    browser = init_browser()

    # Create a dictionary for all of the scraped data
    mars_dat = {}

    # Visit the Mars news page.
    news_url = "https://mars.nasa.gov/news/"
    browser.visit(news_url)

    # Search for news
    # Scrape page into soup
    html = browser.html
    soup = bs(html, 'html.parser')

    # Find the latest Mars news.
    article = soup.find("div", class_="list_text")
    news_content = article.find("div", class_="article_teaser_body").text
    news_title = article.find("div", class_="content_title").text
    news_date = article.find("div", class_="list_date").text

    # Add the news date, title and summary to the dictionary
    mars_dat["news_date"] = news_date
    mars_dat["news_title"] = news_title
    mars_dat["summary"] = news_content

    # While chromedriver is open go to JPL's Featured Space Image page.
    JPL_url = "https://jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(JPL_url)

    # Scrape the browser into soup and use soup to find the full resolution image of mars
    # Save the image url to a variable called `featured_image_url`
    html = browser.html
    soup = bs(html, 'html.parser')
    image = soup.find("img", class_="thumb")["src"]
    img_url = "https://jpl.nasa.gov" + image
    featured_image_url = img_url
    # Add the featured image url to the dictionary
    mars_dat["featured_image_url"] = featured_image_url

    # ## Mars Weather
    twitter_url = "https://twitter.com/marswxreport?lang=en"

    browser.visit(twitter_url)
    tweets = browser.html
    tweets_soup = bs(tweets, 'html.parser')
    Marsweather = tweets_soup.find("div", class_="js-tweet-text-container")
    Mars_weat = Marsweather.text
    marswed2 = Mars_weat.replace('\n', ' ')
    # Add the weather to the dictionary
    mars_dat["marswed2"] = marswed2

    # ## Mars Facts

    mars_facts = "http://space-facts.com/mars/"
    browser.visit(mars_facts)

    import pandas as pd
    mars_facts_todf = pd.read_html(mars_facts)
    mars_data = pd.DataFrame(mars_facts_todf[0])
    mars_data.columns = ['Mars', 'Data']
    mars_table = mars_data.set_index("Mars")
    marsdata = mars_table.to_html(classes='marsdata')
    marsdata = marsdata.replace('\n', ' ')

    # Add the Mars facts table to the dictionary
    mars_dat["marsdata"] = marsdata

    # Visit the USGS Astogeology site and scrape pictures of the hemispheres
    USGS_link = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(USGS_link)
    import time
    html = browser.html
    soup = bs(html, 'html.parser')
    mars_pictures = []

    for i in range(4):
        time.sleep(5)
        images = browser.find_by_tag('h3')
        images[i].click()
        html = browser.html
        soup = bs(html, 'html.parser')
        partial = soup.find("img", class_="wide-image")["src"]
        img_title = soup.find("h2", class_="title").text
        img_url = 'https://astrogeology.usgs.gov' + partial
        dictionary = {"title": img_title, "img_url": img_url}
        mars_pictures.append(dictionary)
        browser.back()

    mars_dat["mars_pictures"] = mars_pictures

    print(mars_dat)
    print("this is the type: ", type(mars_dat))
    # Return the dictionary
    return mars_dat
myCursor = myDB.cursor()


def selectAllDigimon():
    sql = 'select * from digimon'
    myCursor.execute(sql)
    x = myCursor.fetchall()
    print(type(x))
    for data in x:
        print(data)


url = "http://digidb.io/digimon-list/"
dataDigimon = requests.get(url).content
# print(dataultra)
dataDigimon = bs(dataDigimon, 'html.parser')

listDigi = []
temList = []
counter = 0  # column counter; each digimon row in the table spans 13 <td> cells
for i in dataDigimon.find_all('td'):
    cek = str(i.text)
    print(cek, end=' ')
    if cek[0] == ' ':
        print('y')

    temList.append(cek.replace('\xa0', ''))  # replace some noise character
    counter += 1
    # get each row by counting the column. end the nCols is 13
    if (counter % 13 == 0):
        listDigi.append(temList)
        temList = []
import requests 
from bs4 import BeautifulSoup as bs
import re


import matplotlib.pyplot as plt
from wordcloud import WordCloud

#creating an empty review list
redmi_reviews = []

for i in range (1,21):
    ip=[]
    url ="https://www.amazon.in/Redmi-Pro-Blue-64GB-Storage/product-reviews/B07DJHR5DY/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber="+str(i)
    response = requests.get(url)
    soup = bs(response.content,"html.parser")
    reviews = soup.findAll("span",attrs = {"class","a-size-base review-text review-text-content"})
    for i in range(len(reviews)):
        ip.append(reviews[i].text)
    redmi_reviews = redmi_reviews+ip
    
##Writing reviews in a text file
with open("redmi.txt","w",encoding = 'utf-8') as output:
    output.write(str(redmi_reviews))
    
import os
os.getcwd()

## Joining all the reviews into single paragraph
red_rev_string = " ".join(redmi_reviews)
Example #38
from sqlalchemy import create_engine

from flask import Flask, jsonify, render_template
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)

## Data scrape:  pull latest fire
## commenting out due to trouble with heroku loading beautiful soup
## note: in order to restore, also modify the "/" route & the index page

url_incident = "https://inciweb.nwcg.gov/feeds/rss/incidents/"
   
r = requests.get(url_incident)

soup = bs(r.text, "lxml-xml")
title = soup.find_all("title")
date = soup.find_all("pubDate")

print("Latest Fire Information")
title = title[1].text
date = date[0].text

  

## Setting up flask routes



@app.route("/")
def index():
Example #39
def get_soup(url):
    raw = remove_non_ascii(get(url).content)
    soup = bs(raw)
    return soup.select("#MainTxt")[0].select('.ds-single')[0].text.strip()
Example #40
from os import getcwd
from os.path import join
from bs4 import BeautifulSoup as bs
import requests as req
from splinter import Browser
import pandas as pd

# In[ ]:

#MARS NEWS:

# In[186]:

url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
response = req.get(url)
soup = bs(response.text, 'html.parser')

# In[187]:

results = soup.find_all('div', class_="image_and_description_container")

# In[188]:

counter = 0
for result in results:
    if counter == 0:
        try:
            news_p = result.find('div',
                                 class_="rollover_description_inner").text
            if (news_p):
                print(news_p)
Example #41
def post_to_slack(url, r):
    payload = u'Noe nytt har skjedd på Blank: <' + url + '>'
    slack_data = {'text': payload}
    response = r.post(
        WEBHOOK_URL, data=json.dumps(slack_data),
        headers={'Content-Type': 'application/json'}
    )
    if response.status_code != 200:
        raise ValueError(
            'Request to slack returned an error %s, the response is:\n%s'
            % (response.status_code, response.text)
        )

page = r.get('https://blank.p3.no')
content = page.content
soup = bs(content, "html.parser")
articles = soup.find_all('article')
bylines = [byline.text.replace('\n','') for byline in soup.select('.byline')]
link = articles[0].find_all('a', href=True)[0]['href']

cursor.execute("SELECT * FROM updates ORDER BY ID DESC")
records = cursor.fetchall()

if not records:
    cursor.execute("INSERT INTO updates (title) VALUES (%s) ON CONFLICT DO NOTHING", ["Blankbot ass."])
    conn.commit()
    print("Måtte lissom legge noe i tabellen a")
if records:
    if records[0][1] != link:
        post_to_slack(link, r)
        cursor.execute("INSERT INTO updates (title) VALUES (%s)", [link])
def treatment():
    with open('source_code.txt', 'r') as f:
        data = f.read()

    cont = bs(data, 'lxml')

    dias = []

    first_content = cont.find(
        'p',
        class_='TweetTextSize TweetTextSize--jumbo js-tweet-text tweet-text')

    dias.append(first_content)

    contents = cont.findAll('p',
                            class_='TweetTextSize js-tweet-text tweet-text')

    dias.extend(contents)

    index = dias[0].text.find("F")

    lista = []

    lista.append(dias[0].text[21:])

    for i in dias[1:]:
        lista.append(i.text[31:])

    start = lista[1].find(':')
    end = lista[1].find('\n')
    '''
    - Flexões (push-ups)
    - Abdominais (sit-ups)
    - Mergulhos (dips)
    - Agachamentos (squats)
    - Prancha (plank)
    '''
    flex = []
    abdo = []
    merg = []
    agac = []
    prancha = []

    for i in lista:
        flex.append(int(i[start + 1:end].strip()))

    start = lista[1].find(':', start + 1)
    end = lista[1].find('\n', end + 1)

    for i in lista:
        abdo.append(int(i[start + 1:end].strip()))

    start = lista[1].find(':', start + 1)
    end = lista[1].find('\n', end + 1)

    for i in lista:
        merg.append(int(i[start + 1:end].strip()))

    start = lista[1].find(':', start + 1)
    end = lista[1].find('\n', end + 1)

    for i in lista:
        agac.append(int(i[start + 1:end + 1].strip()))

    start = lista[1].find(':', start + 1)
    end = lista[1].find('\n', end + 1)

    for i in lista:
        res = str(i[start + 2:].strip())
        prancha.append(int(res[:-1]))

    return flex, abdo, merg, agac, prancha
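A hypothetical driver for treatment(); it assumes source_code.txt already holds the saved tweet HTML the function expects.

# Hypothetical usage: unpack the five exercise series and print a quick summary.
flex, abdo, merg, agac, prancha = treatment()
print('days parsed: %d' % len(flex))
print('total push-ups (Flexoes): %d' % sum(flex))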
          5. Start Date/Time(Enter date and time in different columns)
          6. End Date/Time(Enter date and time in different columns)

Store the information into a database mySQL Database on cloud 

"""
bid_no=[]
item_name=[]
department_name=[]
date=[]
import mysql.connector 
from bs4 import BeautifulSoup as bs
import requests
url = "https://bidplus.gem.gov.in/bidlists"
source = requests.get(url).text
soup=bs(source,"lxml")
soup.prettify()
html_data=soup.findAll('div', class_='border block')

for row in html_data:
    cells = row.find('div', class_='block_header') 
    # first row has 7 TH 
    detail = row.findAll('div', class_='col-block')
    bid_no.append(cells.p.text.strip(""))   
    item_name.append(detail[0].p.text.strip(""))
    department_name.append(detail[1].text.strip("\n"))
    date.append(detail[2].text.strip("\n"))
    
    
    
conn = mysql.connector.connect(user='******', password='******',
Example #44
def scrape():
    final_dict = {}

    # Finding Most recent title and summary

    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

    response = requests.get(url)
    soup = bs(response.text, 'lxml')

    # list_text = soup.find('div', class_='list_text')
    # news_title = list_text.find('div', class_='content_title')
    # final_dict['title'] = news_title
    # final_dict['text'] = list_text

    news_title = soup.find('div', class_='content_title').text
    news_title = news_title.replace("\n", "")  # replace() returns a new string
    final_dict['news_title'] = news_title

    list_text = soup.find('div', class_='rollover_description').text
    list_text = list_text.replace("\n", "")
    final_dict['list_text'] = list_text
    # Find the spaceimage using splinter

    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    html = browser.html
    soup = bs(html, 'html.parser')
    soup

    feat_img = soup.find("div", class_='carousel_items').find(
        "article", class_='carousel_item').find("a")['data-fancybox-href']
    full_url = f'https://www.jpl.nasa.gov{feat_img}'
    final_dict['feat_img'] = full_url
    # close browser
    browser.quit()

    # Scraping twitter

    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'lxml')
    tweet = soup.find(
        'p',
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    final_dict['tweet'] = tweet

    # Scraping Mars Facts using pandas
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    mars_facts = tables[0]
    mars_facts.to_html('mars_facts.html')
    final_dict['mars_fact'] = mars_facts.to_html()

    # Scraping Mars Hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    response = requests.get(url)
    soup = bs(response.text, 'lxml')

    title = []
    hemi = soup.find_all('div', class_='description')
    hemi = soup.find_all('h3')
    hemi[0].text.replace('Enhanced', '')
    for i in hemi:
        title.append(i.text.replace('Enhanced', ''))

    new_url = []
    for a in soup.find_all('a', class_='itemLink product-item', href=True):
        new_url.append(a['href'])

    hemi_url = 'https://astrogeology.usgs.gov/'
    imgs = []
    for i in new_url:
        response = requests.get(f'{hemi_url}{i}')
        soup = bs(response.text, 'lxml')
        finder = soup.find('img', class_='wide-image')
        full_url_img = f'{hemi_url}{finder["src"]}'
        imgs.append(full_url_img)

    hemi_list = []
    hemi_dict = {}
    y = 0
    for i in title:
        hemi_dict = {'title': i, 'images_url': imgs[y]}
        hemi_list.append(hemi_dict)
        y += 1

    final_dict['hemi_list'] = hemi_list

    return final_dict
from urllib.request import *
from bs4 import BeautifulSoup as bs

response = urlopen("https://imdb.com")
html = bs(response, 'lxml')
images = html.find_all('img')
# print(len(images))
for image in images:
    # print(image['src'])
    pass

for i in range(len(images)):
    url = images[i]['src']
    fileName = f"{i + 1}.{url[-3:]}"
    urlretrieve(url, fileName)
Example #46
def write_coord_csv(html):

    seen = set()

    file = open("out.csv", "w")
    file.write("text_content, top_left, top_right, bottom_left, bottom_right")
    file.write("\n")

    driver = webdriver.Chrome()
    driver.get("data:text/html;charset=utf-8," + html)

    # annotate xpos and ypos for images
    for img_sel_elem in driver.find_elements_by_tag_name("img"):
        center = (img_sel_elem.location["x"] + img_sel_elem.size["width"] / 2,
                  img_sel_elem.location["y"] + img_sel_elem.size["height"] / 2)
        driver.execute_script(
            "arguments[0].setAttribute('xpos','%s')" % str(int(center[0])),
            img_sel_elem)
        driver.execute_script(
            "arguments[0].setAttribute('ypos','%s')" % str(int(center[1])),
            img_sel_elem)

    # remove all scripts
    soup = bs(driver.page_source, "html5lib")
    [x.extract() for x in soup.findAll('script')]

    # remove all comments
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]

    e = etree.HTML(str(soup))
    tree = etree.ElementTree(e)
    text_elements = [
        element for element in e.iter()
        if element.text and len(element.text) > 1
    ]

    # extract all text and annotate xpos and ypos for text
    for elem in text_elements:
        xpath = tree.getpath(elem)
        text_content = elem.text
        if xpath not in seen:
            seen.add(xpath)
            element = driver.find_element_by_xpath(xpath)
            area = element.size["width"] * element.size["height"]
            if area > 0:
                text_content = element.text
                text_content = text_content.replace("\n", " ").replace(
                    "\r", " ").replace("\"", "&quot;")

                if len(text_content) > 1:
                    top_left = (element.location["x"], element.location["y"])
                    top_right = (element.location["x"] + element.size["width"],
                                 element.location["y"])
                    bottom_left = (element.location["x"],
                                   element.location["y"] +
                                   element.size["height"])
                    bottom_right = (element.location["x"] +
                                    element.size["width"],
                                    element.location["y"] +
                                    element.size["height"])

                    center = (element.location["x"] +
                              element.size["width"] / 2,
                              element.location["y"] +
                              element.size["height"] / 2)

                    elem.set("xpos", str(int(center[0])))
                    elem.set("ypos", str(int(center[1])))

                    line = "\"{0}\",\"{1}\",\"{2}\",\"{3}\",\"{4}\"".format(
                        text_content, top_left, top_right, bottom_left,
                        bottom_right)

                    seen.add(text_content)

                    file.write(line)
                    file.write("\n")

    file.close()

    with open("out.html", "wb") as new_html:
        new_html.write(tostring(e))
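
# A minimal usage sketch (not from the original): write_coord_csv takes raw HTML
# as a string and drives a local Chrome through Selenium, so chromedriver must be
# on PATH. The sample page below is purely illustrative.
if __name__ == "__main__":
    sample_html = "<html><body><h1>Hello</h1><p>Some paragraph text.</p></body></html>"
    write_coord_csv(sample_html)  # writes out.csv and out.html in the working directory
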
Beispiel #47
0
        time.sleep(delay)
        while True:
            try:
                driver.find_element_by_xpath(
                    "//button[@ng-click='vm.pagginator.showmorepage()']"
                ).click()
                time.sleep(2)
                # if DEBUG:
                #     print("Clicked Successfully")
            except Exception as e:
                # if DEBUG:
                #     print(e)
                break
        html = driver.execute_script(
            "return document.documentElement.outerHTML")
        soup = bs(html, 'html.parser')
        products = soup.findAll("div", {"qa": "product"})

        # Drop any query string, then strip the site prefix to get the relative path
        rel_url = re.sub(r"\?.*", "", url)
        rel_url = rel_url.replace('https://www.bigbasket.com/pc/', '')

        ds_img = os.path.join(OUTPUT_DIR, 'images', 'large')
        dl_img = os.path.join(OUTPUT_DIR, 'images', 'small')

        if not os.path.exists(ds_img):
            os.makedirs(ds_img)
        if not os.path.exists(dl_img):
            os.makedirs(dl_img)

        for product in products:
            get_product_data(product, raw_data_file, url)
Beispiel #48
0
    c_idpass = list(csv.reader(f_idpass))

    minute_limit = 0

    for idpass in c_idpass:

        try:
            un = idpass[0].strip()
            pw = idpass[1].strip()

            # un = l[(2*i)-2]
            # ps = l[(2*i)-1]

            with Session() as s:
                site = s.get("http://10.220.20.12/index.php/home/loginProcess")
                bs_content = bs(site.content, "html.parser")
                login_data = {"username": un, "password": pw}
                s.post("http://10.220.20.12/index.php/home/loginProcess",
                       login_data)
                home_page = s.get(
                    "http://10.220.20.12/index.php/home/dashboard")
                soup = bs(home_page.content, "lxml")

                table = soup.table

                c = 1
                li = []

                try:
                    table_rows = table.find_all('tr')
                    for tr in table_rows:
from bs4 import BeautifulSoup as bs
from urllib import request as req
import pandas as pd

# Set number_of_pages to the last results page you want to scrape
number_of_pages = 1

l_books = {
    "name": [],
    "author": []
}

for i in range(1, number_of_pages+1):
    url = req.urlopen('https://archive.org/details/internetarchivebooks?&sort=-downloads&page=%d' % i)
    soup = bs(url, 'html5lib')

    titles = soup.find_all("div", {"class": "ttl"})

    for t in titles:
        try:
            fill_text = ""
            # print(t.parent.parent.find_next_siblings("div")[1].contents[3].text)
            fill_text = t.parent.parent.find_next_siblings("div")[1].contents[3].text
        except Exception:
            fill_text = " "
        finally:
            l_books["name"].append(' '.join(t.text.split()))
            l_books["author"].append(' '.join(fill_text.split()))
            # l_books.append({str(' '.join(t.text.split())) : str(' '.join(fill_text.split()))})
            # books[' '.join(t.text.split())] = ' '.join(fill_text.split())
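
# A hedged follow-up sketch (not from the original): pandas is imported above but
# never used, so presumably the collected names and authors were meant to land in
# a DataFrame; the output filename is a placeholder.
books_df = pd.DataFrame(l_books)
books_df.to_csv("internet_archive_books.csv", index=False)
# print(books_df.head())
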
def scrape_info():
    # Dictionary that will hold all of the scraped data
    final_dictionary = {}
    # URL of the latest Mars news listing
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"

    # Set the chromedriver executable path and launch the browser
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    browser.visit(url)

    time.sleep(5)
    

    # getting the html file of the browser 

    html = browser.html
    soup = bs(html,'html.parser')
    # creating a variable to store the article selected
    one_article = soup.find('li', class_='slide')



    # getting the text and the information required for the data

    title_news = one_article.find(class_="content_title").text
    paragraph = one_article.find(class_="article_teaser_body").text
    # print(paragraph)
    # appending to the final dictionary as with the key latest_news
    final_dictionary['latest_news'] = [title_news,paragraph]



    browser.quit()

    # --------------------------------------------
    # --------------------------------------------

    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(url)

    button = browser.find_by_css('.button').first.click()

    featured_img = browser.find_by_css('.fancybox-image')
    # html = featured_img.html
    # soup = bs(html,'html.parser')
    time.sleep(3)
    # print(featured_img['src'])


    # featured_img = browser.find_by_tag('img') img[src]')['src']
    # featured_img_2 = featured_img.find('img')['src']
    # featured_img_3 = featured_img_2['src']
    featured_img_url = featured_img['src']

    final_dictionary['Featured_image']=featured_img_url

    browser.quit()

    # this part of the code deals with Twitter serving different page versions:
    # it keeps retrying in a loop until the right version is open,
    # scrapes what it needs, and then exits the loop

    flag = False
    while flag == False:
        try:
            url = "https://twitter.com/marswxreport?lang=en"
            executable_path = {'executable_path': 'chromedriver.exe'}
            browser = Browser('chrome', **executable_path, headless=False)
            browser.visit(url)
            time.sleep(5)
            html = browser.html
            soup = bs(html,'html.parser')
            mars_weather = soup.find('p',class_='tweet-text').text

            final_dictionary['Mars_weather']=mars_weather
            browser.quit()
        
            # print(final_dictionary)
            flag = True
        except:
            print('Wrong twitter version trying again')
            flag = False
            browser.quit()



    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    table_1 = tables[0]

    table_1.set_index(0, inplace=True)
    table_1_html = table_1.to_html().replace('\n', '')

    final_dictionary['Facts_table'] = table_1_html


    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    browser.visit(url)
# -----------------------------------------------------
# -----------------------------------------------------

    hemisphere_image_urls = []
    for x in range(4):
    # print(x)
        button = browser.find_by_css('h3')[x].click()
        html = browser.html
        soup = bs(html,'html.parser')
        image = soup.find('div', class_='downloads')
        image = image.find('a')['href']
        title = soup.find('h2', class_='title').text
        hemisphere_image_urls.append({'title': title, 'img_url': image})
        browser.back()
        time.sleep(5)

    final_dictionary['hemisfere_images'] = hemisphere_image_urls
    browser.quit()
    return final_dictionary


# print(scrape_info())
Beispiel #51
0
def indices(category):
    cat = {
        "market_cap/broad": "1,2",
        "sector_and_industry": "2,2",
        "thematics": "3,2",
        "strategy": "4,2",
        "sustainability": "5,2",
        "volatility": "6,1",
        "composite": "7,1",
        "government": "8,1",
        "corporate": "9,1",
        "money_market": "10,1"
    }
    try:
        ddl_category = cat[category]
    except KeyError:
        print('''
### Invalid category ###
Use one of the categories mentioned below:

market_cap/broad
sector_and_industry
thematics
strategy
sustainability
volatility
composite
government
corporate
money_market
        ''')
        return
    baseurl = '''https://m.bseindia.com/IndicesView_New.aspx'''
    res = requests.get(baseurl, headers=headers)
    c = res.content
    soup = bs(c, "lxml")
    options = {
        '__EVENTTARGET': 'ddl_Category',
        '__VIEWSTATEENCRYPTED': '',
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        '__VIEWSTATEGENERATOR': '162C96CD',
        'UcHeaderMenu1$txtGetQuote': ''
    }
    # Copy the hidden ASP.NET form fields needed for the POST request
    for inp in soup("input"):
        try:
            if inp['type'] == "hidden":
                if inp['id'] == '__VIEWSTATE':
                    options['__VIEWSTATE'] = inp['value']
                elif inp['id'] == '__EVENTVALIDATION':
                    options['__EVENTVALIDATION'] = inp['value']
        except KeyError:
            continue
    options['ddl_Category'] = ddl_category
    res = requests.post(url=baseurl, data=options)
    c = res.content
    soup = bs(c, "lxml")
    indices = []
    for td in soup('td'):
        try:
            if(td['class'][0] == 'TTRow_left'):
                index = {}
                index['currentValue'] = td.next_sibling.string.strip()
                index['change'] = td.next_sibling.next_sibling.string.strip()
                index['pChange'] = td.next_sibling.next_sibling.next_sibling.string.strip()
                index['scripFlag'] = td.a['href'].strip().split('=')[1]
                index['name'] = td.a.string.strip().replace(';', '')
                indices.append(index)
        except KeyError:
            continue
    results = {}
    for span in soup("span", id="inddate"):
        results['updatedOn'] = span.string[6:].strip()
    results['indices'] = indices
    return results
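
# A minimal usage sketch (not from the original): `headers` with a browser-like
# User-Agent is expected to be defined elsewhere in this module, and the argument
# must be one of the category keys listed above.
if __name__ == "__main__":
    data = indices("market_cap/broad")
    if data:
        print("Updated on:", data["updatedOn"])
        for idx in data["indices"]:
            print(idx["name"], idx["currentValue"], idx["pChange"])
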
def scrape():
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    mars_data = {}

    # NASA Mars News
    url = 'https://redplanetscience.com/'
    browser.visit(url)

    html = browser.html

    soup = bs(html, 'html.parser')

    # news_title = soup.find('div', class_='content_title').text
    # news_p = soup.find('div', class_='article_teaser_body').text

    mars_data['news_title'] = soup.find('div', class_='content_title').text
    mars_data['news_p'] = soup.find('div', class_='article_teaser_body').text

    # JPL Mars Space Images
    image_url = 'https://spaceimages-mars.com/'
    browser.visit(image_url)
    # HTML Object
    html = browser.html

    # Parse HTML with Beautiful Soup
    soup = bs(html, 'html.parser')

    featured_image_url = soup.find('img', class_='headerimage fade-in')

    #featured_image_url = image_url + featured_image_url['src']

    mars_data['featured_image_url'] = image_url + featured_image_url['src']

    ## Mars Facts
    url = 'https://galaxyfacts-mars.com/'
    tables = pd.read_html(url)
    df = tables[0]
    new_header = df.iloc[0]
    df = df[1:]
    df.columns = new_header
    df.set_index('Mars - Earth Comparison', inplace=True)

    html_table = df.to_html()

    mars_data['table'] = html_table

    ## Mars Hemispheres
    url = 'https://marshemispheres.com/'
    browser.visit(url)
    hemisphere_image_urls = []
    # Collect the hemisphere links with splinter
    hem_links = browser.find_by_css("a.product-item h3")

    for item in range(len(hem_links)):
        hemisphere = {}
        browser.find_by_css("a.product-item h3")[item].click()
        hemisphere["title"] = browser.find_by_css("h2.title").text

        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]

        # Append Hemisphere Object to List
        hemisphere_image_urls.append(hemisphere)

        # Navigate Backwards
        browser.back()

    # Quit the browser
    browser.quit()

    mars_data["hemisphere_image_urls"] = hemisphere_image_urls

    return mars_data
import requests
import urllib.request
from bs4 import BeautifulSoup as bs
import csv
import pandas

# Set the URL you want to webscrape from
url = 'http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases'

# Connect to the URL
response = requests.get(url)
# Parse HTML and save to BeautifulSoup object
soup = bs(response.text, "html.parser")
li = soup.findAll('li', 'flex-item')
a = [i.find('a') for i in li]
name = [name.get_text(strip=True) for name in a]
img = ['http://www.agriculture.gov.au' + img.find('img')['src'] for img in a]
href = [href['href'] for href in a]
origin = []
pest_type = []
au_legal = []
for i in href:
    x = i.split('/')
    if '' == x[-1]: del x[-1]
    if '/pests/' in i:
        pest_type.append(x[-1])
    else:
        pest_type.append('-')
    origin.append(x[-2])
    if '.au' in i:
        au_legal.append('Yes')
Beispiel #54
0
import os

import requests
from bs4 import BeautifulSoup as bs
from db import Database

site_url = 'https://www.edimdoma.ru'
page_no = 1

db = Database()
if not os.path.exists('img'):
    os.mkdir('img')

while True:

    url = '{site_url}/retsepty?page={page_no}'.format(site_url=site_url,
                                                      page_no=page_no)
    page = requests.get(url)
    html = bs(page.content, 'html.parser')

    for card in html.select('.card'):

        try:
            card_url = card.find('a').attrs['href']
            recipe_url = site_url + card_url
        except:
            continue

        card_page = requests.get(recipe_url)
        card_html = bs(card_page.content, 'html.parser')

        imgs = card_html.findAll("div",
                                 {"class": "thumb-slider__image-container"})
        if imgs:
    cookie_jar = get_cookies(keys['id'], keys['pw'])
    for cookie in cookie_jar:
        s.cookies.set(cookie['name'], cookie['value'], domain=cookie['domain'])
    
    # Collect post URLs from pages 1 through 5
    url_list = []
    for pageno in range(1, 6):
        print('Page: ', pageno)
        res = s.get(
            'http://cafe.naver.com/ArticleList.nhn?search.clubid={clubid}&search.menuid={menuid}&search.boardtype=L&search.page={pageno}'.format(
                clubid='28385054',
                menuid='53',
                pageno=pageno
            )
        )
        soup = bs(res.text, 'lxml')
        article_link_list = soup.select('td.board-list span > a')
        for article in article_link_list:
            article_url = article['href']
            url_list.append(article_url)
        print('URL counter: ', len(url_list))

    # Remove duplicate URLs
    url_list = set(url_list)
    print('Total URL count: ', len(url_list))

    # Fetch the contents (title, body) of the URLs collected above
    contents_list = []
    for url in url_list:
        url = 'http://cafe.naver.com' + url
        res2 = s.get(url)
Beispiel #56
0
def scrape_info():
    # Set executable path and browser for splinter
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Save url as a variable
    url = "https://mars.nasa.gov/news/"

    # Visit url using splinter
    browser.visit(url)

    # Use sleep function in time module to wait for page to fully load
    time.sleep(10)

    # Save browser contents in html as variable
    page = browser.html

    # Create and parse BeautifulSoup object
    soup = bs(page, "html.parser")

    # Collect latest news title
    news_title = soup.find("div", class_="content_title").text

    # Strip whitespace
    news_title = news_title.strip()

    # Collect corresponding paragraph text
    news_p = soup.find("div", class_="rollover_description_inner").text

    #Strip whitespace
    news_p = news_p.strip()

    # Visit url using splinter
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    # Find featured image
    featured_img = browser.find_by_css("article.carousel_item").first

    # Set base url to concatenate with image url
    base_url = "https://www.jpl.nasa.gov"

    # Select image reference from featured_img
    featured_image_url = featured_img["style"]

    # Split with " delimiter to remove extra text
    featured_image_url = featured_image_url.split('"')[1]

    # Concatenate with base url to get full url
    featured_image_url = f"{base_url}{featured_image_url}"

    # Save url as a variable
    url = "https://twitter.com/marswxreport?lang=en"

    # Visit url using splinter
    browser.visit(url)

    # Save html contents as a variable
    page = browser.html

    # Create and parse BeautifulSoup object
    soup = bs(page, "html.parser")

    # Find latest mars weather and save as a variable
    # Link text needs to be removed, so do not include .text in find
    mars_weather = soup.find("p", class_="TweetTextSize")

    # Find unwanted link text
    remove_link = soup.find("a", class_="twitter-timeline-link u-hidden")

    # Extract link text from mars_weather
    remove_link.extract()

    # Convert to text and replace line breaks with spaces
    mars_weather = mars_weather.text
    mars_weather = mars_weather.replace("\n", " ")

    # Quit browser session
    browser.quit()

    # Use pandas to scrape Mars facts website
    url = "https://space-facts.com/mars/"
    tables = pd.read_html(url)[1]

    # Format dataframe
    tables = tables.rename(columns={0: "Description", 1: "Value"})
    tables = tables.set_index("Description")

    # Convert to html
    html_table = tables.to_html()

    # Save url as a variable
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    # Retrieve page with requests
    response = requests.get(url)

    # Create and parse BeautifulSoup object
    soup = bs(response.text, "html.parser")

    # Find all divs containing hemisphere image links
    link_list = soup.find_all("a", class_="itemLink")

    # Set base url used to create links
    base_url = "https://astropedia.astrogeology.usgs.gov/download"

    # Extract links from divs
    cerberus = f"{base_url}{link_list[0]['href'].replace('/search/map', '')}.tif/full.jpg"
    schiaparelli = f"{base_url}{link_list[1]['href'].replace('/search/map', '')}.tif/full.jpg"
    syrtis = f"{base_url}{link_list[2]['href'].replace('/search/map', '')}.tif/full.jpg"
    valles = f"{base_url}{link_list[3]['href'].replace('/search/map', '')}.tif/full.jpg"

    links = [cerberus, schiaparelli, syrtis, valles]
    titles = []

    # Loop through soup results to get titles for each hemisphere
    for item in link_list:
        title = item.find("img")
        title = title["alt"]
        title = title.replace(" Enhanced thumbnail", "")
        titles.append(title)

    # Create empty list to store dictionaries for each title and url
    hemisphere_image_urls = []

    # Loop through links and titles and add dictionaries to list
    for item in range(len(links)):
        hemisphere_image_urls.append({
            "title": titles[item],
            "url": links[item]
        })

    # Create dictionary for all Mars data
    mars_data = {
        "headline": news_title,
        "subhead": news_p,
        "featured": featured_image_url,
        "weather": mars_weather,
        "table": html_table,
        "hemispheres": hemisphere_image_urls
    }

    # Return results in a single dictionary
    return mars_data
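
# A minimal usage sketch (not from the original): running the module directly just
# pretty-prints the assembled dictionary so the scrape can be checked by eye.
if __name__ == "__main__":
    from pprint import pprint
    pprint(scrape_info())
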
Beispiel #57
0
    def find_folder_table(self, html):
        three = bs(html, "html.parser")
        folders = three.find('table', {"id": "ctl00_ContentPlaceHolder_ProcessFolderGrid_T"})
        return folders
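
# A hedged helper sketch (an addition, not part of the original class): the
# <table> element returned by find_folder_table can be walked with ordinary
# BeautifulSoup calls; the helper name is hypothetical.
def list_folder_rows(folders):
    """Return the stripped text of each non-empty row in the folder table."""
    if folders is None:
        return []
    return [tr.get_text(strip=True) for tr in folders.find_all('tr')
            if tr.get_text(strip=True)]
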
        # Access website data via soup objects and feed it as the message of a
        # desktop Notification. A string here is a bit of text within an HTML tag;
        # split() breaks a string into a list (the default separator is any
        # whitespace). Beautiful Soup parses the HTML document into a tree structure.

        # Send the HTTP requests

        htmldata = getdata("https://covid-19tracker.milkeninstitute.org/")
        html = getdata("https://www.worldometers.info/coronavirus/country/india")
        html_gov = getdata("https://www.mohfw.gov.in/")

        # Create soup objects for web scraping
        soup1 = bs(htmldata, "html.parser")
        soup2 = bs(html, "html.parser")
        soup3 = bs(html_gov, "html.parser")

        # Web scraping: find the vaccine names

        # Some prints to check where the vaccine names sit in the result string
        # print (result[46:86])
        # printing vaccine names to the console
        # Web scraping: find the number of deaths and new cases
        cases = list(soup2.find("li", {"class": "news_li"}))[0].text.split()[0]
        deaths = list(soup2.find("li", {"class": "news_li"}))[2].text.split()[0]
        # print (deaths )

        # Web scraping: fetch total and recovered cases and the recovery %
        tot_cases = list(soup2.find("div", {"class": "maincounter-number"}))[1].text.split()[0]
def scrape():
    browser = init_browser()
    nasa_url = "https://mars.nasa.gov/news/"
    browser.visit(nasa_url)
    time.sleep(1)

    html = browser.html
    news_soup = bs(html, "html.parser")

    title = news_soup.find("div", class_="content_title").text
    article_summary = news_soup.find("div", class_="article_teaser_body").text

    print(f"Article Title: {title}")
    print(f"Summary: {article_summary}")

    image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(image_url)
    time.sleep(1)

    browser.find_by_css('a.button').click()

    image_soup = bs(browser.html, 'html.parser')
    end = image_soup.find('img', class_='fancybox-image')['src']
    JPL_image = "https://www.jpl.nasa.gov" + end

    print(JPL_image)

    weather_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(weather_url)
    time.sleep(1)

    html = browser.html
    weather_soup = bs(html, "html.parser")
    tweet = weather_soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text

    print(tweet)

    fact_url = "http://space-facts.com/mars/"
    browser.visit(fact_url)
    time.sleep(1)

    html = browser.html
    fact_soup = bs(html, "html.parser")

    table = pd.read_html(fact_url)
    df_mars_facts = table[0]

    fact_html = df_mars_facts.to_html()
    fact_html = fact_html.replace("\n", "")

    hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemi_url)
    time.sleep(1)

    html = browser.html
    hemi_soup = bs(html, "html.parser")
    headers = []
    titles = hemi_soup.find_all('h3')

    for title in titles:
        headers.append(title.text)

    images = []
    count = 0

    for thumb in headers:
        browser.find_by_css('img.thumb')[count].click()
        images.append(browser.find_by_text('Sample')['href'])
        browser.back()
        count = count + 1

    hemisphere_image_urls = []
    counter = 0

    for item in images:
        hemisphere_image_urls.append({
            "title": headers[counter],
            "img_url": images[counter]
        })
        counter = counter + 1

    # Build the result dictionary after the loop so every hemisphere is included
    data = {
        "News_Header": title,
        "News_Article": article_summary,
        "JPL_Image": JPL_image,
        "Weather": tweet,
        "Facts": fact_html,
        "Hemispheres": hemisphere_image_urls
    }
    return data
def scrape():
    browser = init_browser()

    # Visit the Nasa site
    news_url = 'https://mars.nasa.gov/news/'
    # Retrieve page with the requests module
    response = requests.get(news_url)
    # Create BeautifulSoup object; parse with 'lxml'
    soup = bs(response.text, 'html.parser')

    # Extract the title of the news article
    title = soup.find('div', class_="content_title").text.strip()

    # Extract the teaser paragraph about the news article
    paragraph = soup.find(
        'div', class_="image_and_description_container").text.strip()

    # visit the Nasa Images site
    nasa_images_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(nasa_images_url)

    # Extract the url of the featured image
    image_html = browser.html
    soup = bs(image_html, 'html.parser')

    article = soup.find('a', class_='button fancybox')
    href = article['data-fancybox-href']
    featured_image_url = "https://www.jpl.nasa.gov" + href

    # Visit the Mars Weather Twitter page
    weather_url = 'https://twitter.com/marswxreport?lang=en'
    from selenium import webdriver
    driver = webdriver.Chrome()
    driver.get(weather_url)
    html = driver.page_source
    driver.close()

    # Extract the current weather on Mars
    soup = bs(html, 'html.parser')
    mars_weather = soup.find(
        'div',
        class_=
        "css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0"
    ).text.strip()

    # Visit the Space Facts page about Mars
    facts_url = 'https://space-facts.com/mars/'
    browser.visit(facts_url)

    # Extract the Mars Facts table as a Pandas dataframe
    table = pd.read_html(facts_url)
    profile = table[0]
    profile_df = profile.rename(columns={0: 'Description', 1: 'Value'})
    facts = []
    for index, row in profile_df.iterrows():
        desc = row['Description']
        value = row['Value']
        fact = {'description': desc, 'value': value}
        facts.append(fact)

    # Visit the USGS Astrogeology site
    hemisphere_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemisphere_url)

    # Extract the name of each of Mars's hemispheres and the url of the image of that hemisphere, then insert into MongoDB
    hemisphere_html = browser.html
    soup = bs(hemisphere_html, 'html.parser')

    results = soup.find_all('div', class_="item")

    hemisphere_image_urls = []

    for result in results:
        heading = result.find('h3').text.replace('Enhanced', '')
        link = result.find('a')['href']
        url = "https://astrogeology.usgs.gov" + link
        browser.visit(url)
        image_html = browser.html
        soup = bs(image_html, 'html.parser')
        img_url = soup.find('div', class_="downloads").find('a')['href']
        print(heading)
        print(img_url)
        hemisphere = {'title': heading, 'img_url': img_url}
        hemisphere_image_urls.append(hemisphere)

    mars_data = {
        "news_title": title,
        "news_paragraph": paragraph,
        "featured_image": featured_image_url,
        "mars_weather": mars_weather,
        "mars_facts": facts,
        "hemisphere_image_urls": hemisphere_image_urls
    }

    browser.quit()

    return mars_data
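
# A hedged follow-up sketch (not from the original): the comment above mentions
# inserting the hemisphere data into MongoDB. One way to store the whole result,
# assuming a local MongoDB instance; the database and collection names are placeholders.
if __name__ == "__main__":
    import pymongo

    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client.mars_app
    # Upsert a single cached document so repeated scrapes overwrite it
    db.mars.update_one({}, {"$set": scrape()}, upsert=True)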