Example #1
0
def go_desc():
    BASE_URL = 'http://www.svatibor.ru/internet_magazin/product/'
    import urllib2
    from bs4 import BeautifulSoup
    for i in Item.objects.all():
        print BASE_URL + str(i.product_id)
        c = urllib2.urlopen(BASE_URL + str(i.product_id))
        soup = BeautifulSoup(c.read())
        i.text = str(soup.findAll('div', attrs={'class' : 'full'})[0]).replace('\n', '<br />')
        i.stock=100
        i.art = soup.findAll('div', attrs={'id' : 'tovar_card'})[0].findAll('ul', attrs={'id' : 'p_list'})[0].findAll('span')[0].string
        i.is_novelty = len(soup.findAll('li', attrs={'class' : 'new'})) > 0
        image = 'http://www.svatibor.ru' + soup.findAll('div', attrs={'id' : 'tovar_card'})[0].findAll('a')[0]['href']
        if image.endswith('.jpg'):
            f = open('media/uploads/items/%s.jpg' % i.id,'wb')
            f.write(urllib2.urlopen(image).read())
            f.close()
            i.image = 'uploads/items/%s.jpg' % i.id
        elif image.endswith('.png'):
            f = open('media/uploads/items/%s.png' % i.id,'wb')
            f.write(urllib2.urlopen(image).read())
            f.close()
            i.image = 'uploads/items/%s.png' % i.id
            print image
        i.save()
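The image-download step above targets Python 2 (urllib2, print statements). A minimal Python 3 sketch of the same step, assuming the same Item model and media layout from the snippet:

import urllib.request

def download_item_image(item, image_url):
    # Keep the original extension and mirror the media path scheme used above.
    ext = image_url.rsplit('.', 1)[-1].lower()
    if ext not in ('jpg', 'png'):
        return None
    with urllib.request.urlopen(image_url) as resp:
        data = resp.read()
    with open('media/uploads/items/%s.%s' % (item.id, ext), 'wb') as f:
        f.write(data)
    return 'uploads/items/%s.%s' % (item.id, ext)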
Example #2
0
def sanitize_html(value, valid_tags=VALID_TAGS):
    soup = BeautifulSoup(value)
    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
    [comment.extract() for comment in comments]
    # Some markup can be crafted to slip through BeautifulSoup's parser, so
    # we run this repeatedly until it generates the same output twice.
    newoutput = soup.renderContents()
    while 1:
        oldoutput = newoutput
        soup = BeautifulSoup(newoutput)
        for tag in soup.findAll(True):
            if tag.name not in valid_tags:
                tag.hidden = True
            else:
                print tag.name,'***', tag.attrs
                print tag.name,'###', [x for x in tag.attrs ]
                m={}
                for k in tag.attrs.keys():
                    if k in valid_tags[tag.name]:
                        m[k] = tag.attrs[k]
                tag.attrs = m
                print tag.name,'===', m
                #tag.attrs = [(attr, value) for attr, value in tag.attrs if attr in valid_tags[tag.name]]
        newoutput = soup.renderContents()
        if oldoutput == newoutput:
            break
    return newoutput
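sanitize_html() relies on a VALID_TAGS whitelist and bs4's Comment class that are defined elsewhere in its project; a plausible setup (an assumption for illustration, not the project's actual whitelist) is:

from bs4 import BeautifulSoup, Comment

# Hypothetical whitelist: tag name -> attributes allowed to survive sanitizing.
VALID_TAGS = {
    'p': [],
    'br': [],
    'strong': [],
    'em': [],
    'a': ['href', 'title'],
    'img': ['src', 'alt'],
}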
Example #3
0
def scrape_song_metadata(soup, verbose = False):
    result = {}
    first_soup = soup.find("div", {"class":"song_header-primary_info"})
    first_soup = BeautifulSoup(soup.prettify(), "html.parser")

    artist = first_soup.find("a", {"class":"song_header-primary_info-primary_artist"})
    artist = clean_text(artist.string)
    if verbose:
        print "Artist : " + artist.encode('utf-8')
    result["artist"] = artist

    song = first_soup.find("h1", {"class":"song_header-primary_info-title"})
    song = clean_text(song.string)
    if verbose:
        print "Song   : " + song.encode('utf-8')
    result["song"] = song

    labels = first_soup.findAll("span", {"class":"song_info-label"})
    labels = [clean_text(l.string) for l in labels]
    contents = first_soup.findAll("span", {"class":"song_info-info"})
    contents = [BeautifulSoup(c.prettify(), "html.parser") for c in contents]
    contents = [c.a for c in contents]
    for i in range(len(labels)):
        if contents[i]:
            if verbose:
                print labels[i] + " :"
                print "    " + clean_text(contents[i].string).encode('utf-8')
                print "    " + contents[i]['href'].encode('utf-8')
            result[labels[i]] = {"name" : clean_text(contents[i].string), "link" : geniusify(contents[i]['href'])}

    return result
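scrape_song_metadata() calls clean_text() and geniusify(), which are not shown above. Minimal stand-ins consistent with how they are used (assumptions, not the project's real helpers):

def clean_text(s):
    # Collapse the extra whitespace that .prettify() introduces around strings.
    return ' '.join(s.split()) if s else ''

def geniusify(href):
    # Make a relative Genius link absolute.
    return href if href.startswith('http') else 'https://genius.com' + href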
Example #4
0
    def extract_content(self, task, url, title, hashcode):
        c = download.Crawler(url) 
        d = c.get_data()
        if d is None:
            return None
 
        res = {
            'title': title.replace(u'【', u'[').replace(u'】', u']').replace(u'喷嚏图卦', u'小兔子图说'),
            'src': url,
            'account_id': task['account_id'],
            'category': task['category'],
            'hashcode': hashcode,
            'username': task['username'],
            'head_image': None,
        }
 
        d = d.decode('gbk', 'ignore').encode('utf8', 'ignore')
        start_pos = d.find(task['start_text'].encode('utf8'))
        end_pos = d.find(task['end_text'].encode('utf8'))
        soup = BeautifulSoup(d[start_pos:end_pos],"lxml")
        
        del_tags = ['script']
        for tag in del_tags: 
            for match in soup.findAll(tag):
                match.decompose()
    
        invalid_tags = ['a', 'b', 'i', 'u']
        for tag in invalid_tags: 
            for match in soup.findAll(tag):
                match.replaceWithChildren()
        self.extract_image(task, soup, res)
        message_queue.put(res, False)
Example #5
0
def parseresultpage(page, search, order, sort, regex):
    logger.info("    [+] Pulling results from page " + str(page))
    githubbase = "https://github.com/search?"
    githubsearchurl = {"o": order, "p": page, "q": search, "s": sort, "type": "Code", "ref": "searchresults"}
    searchurl = githubbase + str(urlencode(githubsearchurl))
    pagehtml = urlopen(searchurl).read()
    soup = BeautifulSoup(pagehtml, "html.parser")

    # Find GitHub div with code results
    results = soup.findAll("div", attrs={"class": "code-list-item"})

    # Pull url's from results and hit each of them
    soup1 = BeautifulSoup(str(results), "html.parser")
    for item in soup1.findAll("p", attrs={"class": "full-path"}):
        soup2 = BeautifulSoup(str(item), "html.parser")
        for link in soup2.findAll("a"):
            individualresult = "https://github.com" + str(link["href"])
            individualresultpage = urlopen(individualresult).read()
            soup3 = BeautifulSoup(str(individualresultpage), "html.parser")
            for rawlink in soup3.findAll("a", attrs={"id": "raw-url"}):
                rawurl = "https://github.com" + str(rawlink["href"])
                if args.custom_regex:
                    searchcode(rawurl, regex)
                else:
                    wpsearchcode(rawurl, regex)
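parseresultpage() also depends on urlencode, urlopen and a module-level args namespace from the surrounding script; a sketch of that scaffolding (Python 3 imports shown, the flag name is an assumption):

from urllib.parse import urlencode
from urllib.request import urlopen
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--custom-regex", dest="custom_regex", default=None)
args = parser.parse_args()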
Example #6
0
def createSpreadsheet():
	wb = xlwt.Workbook()
	sheet = wb.add_sheet("Google Alerts")
	style = xlwt.easyxf('font: bold 1')
	sheet.write(0, 3, 'Headline', style)
	sheet.write(0, 1, 'Company', style)
	sheet.write(0, 4, 'URL', style)
	sheet.write(0, 0, 'Date', style)

	cur_row = 1

	for url in LA_HONDA_ALERTS_URLS:
		print 'Processing google alerts for ' + LA_HONDA_ALERTS_URLS[url] + '...'
		r = requests.get(url)
		xml = r.text
		soup = BeautifulSoup(xml)

		for title, link, date in zip(soup.findAll('title')[1:], soup.findAll('link')[1:], soup.findAll('published')):
			title = cleanTitle(title)
			link = cleanLink(link)
			date = cleanDate(date)

			writeToSheet(sheet, title, LA_HONDA_ALERTS_URLS[url], link, date, cur_row)
			cur_row = cur_row + 1

	processSheet(sheet)
	savewb(wb)
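createSpreadsheet() iterates LA_HONDA_ALERTS_URLS as a {feed_url: company_name} mapping and calls several helpers that are not shown; rough stand-ins for the feed-parsing helpers, based only on how they are used above (assumptions):

LA_HONDA_ALERTS_URLS = {
    'https://www.google.com/alerts/feeds/<feed-id>': 'Example Co',
}

def cleanTitle(title_tag):
    # Alert titles arrive as escaped HTML inside the <title> element.
    return BeautifulSoup(title_tag.get_text(), 'html.parser').get_text().strip()

def cleanLink(link_tag):
    return link_tag.get('href') or link_tag.get_text().strip()

def cleanDate(date_tag):
    return date_tag.get_text().strip()[:10]  # keep the YYYY-MM-DD portion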
Example #7
0
def search_handler():
    """Return DBLP Author Search results in JSON Format"""

    results = "5"
    author = None
    try:
        author = request.values['author']
    except:
        print "author parameter not found"


    url = "http://www.dblp.org/autocomplete-php/autocomplete/ajax.php?"\
    "query=%s&"\
    "name=dblpmirror&"\
    "path=/search/&"\
    "page=index.php&"\
    "log=/var/log/dblp/error.login&"\
    "qid=34&navigation_mode=user&"\
    "language=en&mcsr=40&mcc=0&mcl=80&"\
    "hppwt=20&hppoc=1000&eph=1&er=20&dm=3&"\
    "bnm=R&ll=2&mo=100&accc=:&syn=0&deb=0&"\
    "hrd=1a&hrw=1d&qi=1&fh=1&fhs=1&mcs=%s&"\
    "rid=44&qt=F" % (author, results)

    page = urlopen(url, data="Void").read()
    soup = BeautifulSoup(page)
    authors_span = soup.findAll("span", {"class": "\\\"completion\\\""})[:-1]
    pubs_count_span = soup.findAll(
            "span", {"class": "\\\"hits_number\\\""})[:-1]
    authors = [{'id':-1,'name' :a.string} for a in authors_span]
    pubs_count = [int(c.string[1:-1]) for c in pubs_count_span]
    response = {'authors': authors, 'pubs_count': pubs_count}
    json_response = jsonify(response)
    return json_response
Example #8
0
    def get_plugins(self, url):
        plugins = []
        headers = {'User-Agent': self.get_user_agent()}
        page_req = self.req.get(url, headers=headers)
        soup = BeautifulSoup(page_req.text, "html.parser")

        # Search plugins in css
        plugin_paths = soup.findAll("link", {"rel": "stylesheet"})
        for plugin_path in plugin_paths:
            if 'wp-content/plugins/' in plugin_path['href']:
                regex = re.compile("wp-content/plugins/([a-zA-Z0-9-_]+)/",
                                   re.IGNORECASE)
                r = regex.findall(plugin_path['href'])
                for plugin_name in r:
                    plugins.append(plugin_name)

        # Search plugins in javascript
        plugin_paths = soup.findAll("script",
                                    {"type": "text/javascript"})
        for plugin_path in plugin_paths:
            try:
                if 'wp-content/plugins/' in plugin_path['src']:
                    regex = re.compile("wp-content/plugins/([a-zA-Z0-9-_]+)/",
                                       re.IGNORECASE)
                    r = regex.findall(plugin_path['src'])
                    for plugin_name in r:
                        plugins.append(plugin_name)
            except:
                # Silently skip <script> tags that have no src attribute
                pass

        return list(set(plugins))
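The same plugin-path regex is compiled separately for stylesheets and scripts above; a more compact variant covering both in one pass (a sketch, not the original project's code):

import re

PLUGIN_RE = re.compile(r"wp-content/plugins/([a-zA-Z0-9-_]+)/", re.IGNORECASE)

def plugins_from_soup(soup):
    # Gather candidate URLs from stylesheet hrefs and script srcs, then extract slugs.
    urls = [tag.get('href') or '' for tag in soup.findAll('link', {'rel': 'stylesheet'})]
    urls += [tag.get('src') or '' for tag in soup.findAll('script')]
    found = set()
    for url in urls:
        found.update(PLUGIN_RE.findall(url))
    return sorted(found)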
Example #9
0
def scrape_coins_ccc():
	print 'Scraping CCC'
	url = 'http://www.cryptocoincharts.info/coins/info/1001'
	html = urllib2.urlopen(url).read()
	soup = BeautifulSoup(html, 'html.parser')
	trlist = soup.findAll('tr')
	coins = {}
	for i, tr in enumerate(trlist):
		print str(int(i+1))
		try:
			a = tr.find('td').a
			name = tr.findAll('td')[1].text
			coin_url = str('http://www.cryptocoincharts.info'+a['href'])
			coin_html = urllib2.urlopen(coin_url).read()
			coin_soup = BeautifulSoup(coin_html, 'html.parser')
			coin_table = coin_soup.findAll('table',{'class': 'table table-striped'})
			coin_tr = coin_table[1].find('tbody').findAll('tr')
			coin_markets = []
			for ctr in coin_tr:
				market_td = ctr.findAll('td')
				coin_markets.append({'market':market_td[0].text, 'pair':market_td[1].text, 'link': str('http://www.cryptocoincharts.info' + market_td[1].a['href'])})
			coins[a.text] = {'markets':coin_markets, 'name':name}
		except AttributeError:
			pass
	return coins
Example #10
0
def getkaomoji():
    headers = {"User-Agent": "007"}
    requests.get = functools.partial(requests.get, headers=headers)
    url = "http://dongerlist.com"
    r = requests.get(url)
    soup = BS(r.text, "html.parser")
    anchors = soup.findAll("a", attrs={"class": "list-2-anchor"})[1:]
    category_urls = {a.text.lower(): a["href"] for a in anchors}
    kaomoji = {category: [] for category in category_urls}
    kaomoji["all"] = {}
    for category, url in category_urls.items():
        print("Scraping %s ..." % url)
        page_number = 1
        while True:
            print("Is there a page %d? owo" % page_number)
            r = requests.get("%s/page/%d" % (url, page_number))
            if r.status_code == 200:
                print("Yeah! ;D")
                soup = BS(r.text, "html.parser")
                textareas = soup.findAll("textarea", attrs={"class": "donger"})
                kaomoji_list = [textarea.text for textarea in textareas]
                kaomoji[category].extend(kaomoji_list)
                kaomoji["all"].update({k: True for k in kaomoji_list})
            else:
                print("No. D;")
                break
            page_number += 1
    print("Success.")
    return kaomoji
Example #11
0
    def getStatusList(self, page):
        try:
            if(re.search('\<a href=status(\?top\=.+?)\>', page.text)):
                self.nextStatusUrl = self.statusUrl + str(re.findall('\<a href=status(\?top\=.+?)\>', page.text)[0])
            else: 
                return []
            soup = BeautifulSoup(page.text, "html.parser")
            table = soup.findAll('table', {'class':'a'})
            soup = BeautifulSoup(str(table), "html.parser")
            allStatus = soup.findAll('tr', {'align':'center'})
            statusList = []
            for status in allStatus:
                model = "(?<=\>)(.*?)(?=\<)"
                tempList = re.findall(model, str(status))
                resuList = []
                if(len(tempList) == 27):
                    for i in self.unusalList:
                        resuList.append(tempList[int(i)])
                else:
                    for i in self.usallyList:
                        resuList.append(tempList[int(i)])
                statusList.append(resuList)

            print("get statusList: " + str(len(statusList)) +" records")
            return statusList
        except:
            print("get statusList failure")
            return []
Example #12
0
    def search(self, domain):
        dnsdumpster_url = 'https://dnsdumpster.com/'
        s = requests.session()

        req = s.get(dnsdumpster_url)
        soup = BeautifulSoup(req.content, 'html.parser')
        csrf_middleware = soup.findAll('input', attrs={'name': 'csrfmiddlewaretoken'})[0]['value']
        self.display_message('Retrieved token: %s' % csrf_middleware)

        cookies = {'csrftoken': csrf_middleware}
        headers = {'Referer': dnsdumpster_url}
        data = {'csrfmiddlewaretoken': csrf_middleware, 'targetip': domain}
        req = s.post(dnsdumpster_url, cookies=cookies, data=data, headers=headers)

        if req.status_code != 200:
            print(
                u"Unexpected status code from {url}: {code}".format(
                    url=dnsdumpster_url, code=req.status_code),
                file=sys.stderr,
            )
            return []

        if 'error getting results' in req.content.decode('utf-8'):
            print("There was an error getting results", file=sys.stderr)
            return []

        soup = BeautifulSoup(req.content, 'html.parser')
        tables = soup.findAll('table')

        res = {'domain': domain, 'dns_records': {}}
        res['dns_records']['dns'] = self.retrieve_results(tables[0])
        res['dns_records']['mx'] = self.retrieve_results(tables[1])
        res['dns_records']['txt'] = self.retrieve_txt_record(tables[2])
        res['dns_records']['host'] = self.retrieve_results(tables[3])
        return res
Example #13
0
def google_image_results_parser(code):
    soup = BeautifulSoup(code)

    # initialize the result containers (one list per field)
    whole_array = {'links':[],
                   'description':[],
                   'title':[],
                   'result_qty':[]}

    # Links for all the search results
    for div in soup.findAll('div', attrs={'class':'g'}):
        sLink = div.find('a')
        whole_array['links'].append(sLink['href'])

    # Search Result Description
    for desc in soup.findAll('span', attrs={'class':'st'}):
        whole_array['description'].append(desc.get_text())

    # Search Result Title
    for title in soup.findAll('h3', attrs={'class':'r'}):
        whole_array['title'].append(title.get_text())

    # Number of results
    for result_qty in soup.findAll('div', attrs={'id':'resultStats'}):
        whole_array['result_qty'].append(result_qty.get_text())

    return build_json_return(whole_array)
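build_json_return() is not shown; a minimal stand-in consistent with the call above (an assumption) would simply serialize the collected arrays:

import json

def build_json_return(whole_array):
    return json.dumps(whole_array)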
Example #14
0
    def soupIt(self):
        http = urllib3.PoolManager()
        r = http.request("GET", self.url)
        soup = BeautifulSoup(r.data.decode('ISO-8859-1'), "lxml")
        self.title = soup.title.string

        # remove unused header parts
        # in comments because of firefox
        # for p in soup(["meta"]):
        #    p.extract()

        # remove comments
        for element in soup(text=lambda text: isinstance(text, Comment)):
            element.extract()

        # remove some images
        unused_images = soup.find_all('img', {'alt': 'bullet'}) \
                        + soup.find_all('img', {'src': '../../images/ilmulislam.gif'}) \
                        + soup.find_all('img', {'src': '../../images/enzykopf.gif'})
        for i in soup.find_all('img'):
            if i in unused_images:
                i.extract()

        # remove all links, but keep text
        # don't keep text for navigation links that don't lead to "begriffe" or "manuskripte"
        for l in soup.findAll('a'):
            if "begriffe" in urljoin(self.url, l['href']) or "manuskripte" in urljoin(self.url, l['href']):
                l.replaceWith(l.text)
            else:
                l.extract()

        # remove top blocks
        topBlocks = soup.findAll('td', {'width': '50%'})
        for block in topBlocks:
            if len(block.findChildren('img')):
                self.images += block.findChildren('img')
            block.extract()

        # remove trash tags and empty tags
        for tag in soup.findAll():

            if tag.name == "meta":
                continue
            if tag.name in ("td", "tr", "table", "center", "div", "font", "strong", "b"):
                tag.unwrap()
            if len(tag.text) == 0 or tag.text == '\n' or re.match(r'^\s*$',
                                                                  tag.text) or tag.is_empty_element or tag.isSelfClosing:
                tag.extract()

        for l in soup.find_all(text=re.compile('^\n')):
            l.extract()

        for l in soup.find_all(text=re.compile('\r\n')):
            l.replaceWith(" ")

        # append images
        for i in self.images:
            soup.body.insert(0, i)

        return soup.prettify()
Example #15
0
def getGoogleLinks(url):
	url = url.replace(" ","+")
	url = "https://www.google.com/search?q="+url+"&num=100&filter=0"
	results_arr = []
	br = mechanize.Browser()
	br.set_handle_robots(False)
	br.addheaders = [('User-agent', 'Firefox')]

	htmltext = br.open(url).read()
	soup = BeautifulSoup(htmltext)
	searchres =  soup.findAll('div',attrs={"id":"search"})
	searchtext =  str(searchres[0])
	soup1 = BeautifulSoup(searchtext)
	lis = soup1.findAll("li")

	regex = "q(?!.*q).*?&amp"
	pattern = re.compile(regex)
	for li in lis:
		soup3 = BeautifulSoup(str(li))
		links = soup3.findAll('a')
		rep =  links[0]
		results = re.findall(pattern,str(rep))
		if len(results)>0:
			if "http" in str(results[0]):
				results_arr.append(str(results[0].replace("q=htt","htt").replace("&amp",""))) 
		#print links[0]
	return results_arr
Example #16
0
def getFaculty_det(reg_no = "", pwd = "", emp_id = ""):
	br = login(reg_no,pwd)

	print br.geturl()

	if br.geturl() == ("https://academics.vit.ac.in/student/stud_home.asp") or br.geturl() == ("https://academics.vit.ac.in/student/home.asp"):
		print "SUCCESS"

		br.open("https://academics.vit.ac.in/student/official_detail_view.asp?empid=%(id)s" % {"id" : emp_id })
		response = br.open("https://academics.vit.ac.in/student/official_detail_view.asp?empid=%(id)s" % {"id" : emp_id })

		soup = BeautifulSoup(response.get_data())

		img = soup.findAll('img')

		#fac_img = "https://academics.vit.ac.in/student/"+img[0]['src']+"?"

		tables = soup.findAll('table')
		myTable = tables[1]
		rows = myTable.findChildren(['th','tr'])
		rows = rows[1:10]
		data = []

		facDet = {}

		for row in rows:

			cells = row.findChildren('td')
			cells = cells[1]
			value = cells.string
			data.append(value)

		try:
			myTable = tables[2]

		except IndexError:
			facDet = {"name" : data[0], "school" : data[1], "destination" : data[1], "venue" : data[1], "intercom" : data[1], "email" : data[1], "division" : data[1], "additional_role" : data[1]}
			
		else:
			rows = myTable.findChildren(['th','tr'])
			rows = rows[1:4]
			openhr = []

			for row in rows:

				rowdata = []
				cells = row.findChildren('td')
				
				for cell in cells:
					value = cell.string
					rowdata.append(value)

				openhr.append(rowdata)
			facDet = {"name" : data[0], "school" : data[1], "destination" : data[1], "venue" : data[1], "intercom" : data[1], "email" : data[1], "division" : data[1], "additional_role" : data[1], "openhr_details" : openhr}

		return {"status" : "Success" ,"details" : facDet}

	else :
		print "FAIL"
		return {"status" : "Failure"}
Example #17
0
def exclaim_album_scrape(num_pages = 2, section_url = 'Album_EP/Page/'):
    
    BASE_URL = "http://exclaim.ca/music/Reviews/"
    linklist = []
    artistalbumlist = []
        
    for page in range(1,num_pages+1):
    	
        url = BASE_URL + section_url+str(page)
        req = urllib2.Request(url, headers=hdr)
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup(html, "lxml")

        heads = soup.findAll('h4')
        artists = [s.contents[0] for s in heads]
        albums = [s.contents[0].strip() for s in soup.findAll('span', {'class':'streamSingle-item-details'})]
        info = zip(artists, albums)
        artistalbumlist.extend(info)
        
        links = [s.findAll('a') for s in soup.findAll('ul', {'class':'streamSingle'})]
        links = [s['href'] for s in links[0] if s]
        if set(links)<= set(linklist):
            return artistalbumlist, linklist
        linklist.extend(links)

    return artistalbumlist, linklist
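exclaim_album_scrape() reads a module-level hdr dict of request headers that is not shown; something along these lines is assumed:

hdr = {'User-Agent': 'Mozilla/5.0 (compatible; exclaim-review-scraper)'}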
Example #18
0
    def get_movie_list(self, winner = True):

        if winner == True:
            url = "http://www.imdb.com/search/title?count=10000&groups=oscar_winners&title_type=feature&sort=year,desc&view=simple"
        else:
            url = "http://www.imdb.com/search/title?count=10000&groups=oscar_nominees&title_type=feature&sort=year,desc&view=simple"
        # url = "https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films"

        html = self.get_url(url)
        soup = BeautifulSoup(html, "lxml")

        span_list = soup.findAll("span", { "class" : "lister-item-header" })
        title_list = soup.find_all(href=re.compile("/title/tt.*\?ref_=adv_li_tt"))
        year_list = soup.findAll("span", { "class" : "lister-item-year text-muted unbold" })

        oscar_list = list()
        for i in range(0, len(title_list)):

            title = re.sub('<.*?>', "", str(title_list[i]))
            year = re.sub('<.*?>', "", str(year_list[i]))
            year = re.sub('[a-zA-Z]|\s|\(|\)', '', year)

            d = {'title': title, 'year': year}
            oscar_list.append(d)

        if winner == True:
            self.winner_list = oscar_list
        else:
            self.nominated_list = oscar_list
Example #19
0
File: lyrics.py Project: 241n/beets
def scrape_lyrics_from_url(url):
    """Scrape lyrics from a URL. If no lyrics can be found, return None
    instead.
    """
    from bs4 import BeautifulSoup, Comment
    html = fetch_url(url)
    if not html:
        return None

    soup = BeautifulSoup(html)

    for tag in soup.findAll('br'):
        tag.replaceWith('\n')

    # Remove non relevant html parts
    [s.extract() for s in soup(['head', 'script'])]
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [s.extract() for s in comments]

    try:
        for tag in soup.findAll(True):
            tag.name = 'p'          # keep tag contents

    except Exception, e:
        log.debug('Error %s when replacing containing marker by p marker' % e,
                  exc_info=True)
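fetch_url() comes from the surrounding beets module; a minimal stand-in for running this snippet on its own (an assumption, not beets' implementation) could be:

import requests

def fetch_url(url):
    # Return the page body, or None on any network error (mirroring the None check above).
    try:
        return requests.get(url, timeout=10).text
    except requests.RequestException:
        return None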
Example #20
0
def scrape(url):
	home = 'http://www.moray.gov.uk/'
	datePattern = r'[0-9][0-9]-[0-9][0-9]-20[0-9][0-9]'	
	departments = r'(Chief Executive\'s Office|Corporate Services|Education and Social Care|Environmental Services|Multiple Services)'
	html = urllib2.urlopen(url).read()
	soup = BeautifulSoup(html)
	links = soup.findAll('a', href=True)
	for l in links:
		if l.string is not None:
			#print l.string
			if re.search(departments, l.string) is not None:
				page = urllib2.urlopen(home+l['href']).read()
				pSoup = BeautifulSoup(page)
				pLinks = pSoup.findAll('a', href=True)
				for pl in pLinks:
					if pl.string is not None:
						try:
							if re.search(datePattern, pl.string):
								#print pl.string + ' : ' + pl['href']
								foi = urllib2.urlopen(home+pl['href']).read()
								foiSoup = BeautifulSoup(foi)
								bill = foiSoup.find('div', {'class': 'boxj_txt_ara'})
								if bill is not None:
									print bill.p
						except UnicodeEncodeError:
							pass
Example #21
0
 def analyze_iframe(self,iframe_ana):
     iframe_child_ans=[]
     iframe_size=[]
     object_cnt=0
     embed_cnt=0
     h = iframe_ana.get('height')
     w = iframe_ana.get('width') 
     try:
         if((h.isdigit()) and (w.isdigit())):
             iframe_size.append(h)
             iframe_size.append(w)
         elif((len(h)>0) and (len(w)>0)):
             iframe_escaped = True		
     except Exception:
         pass
     new_url = iframe_ana.get('src')
     file_type = self.getContentType(new_url)
     #checking if the url points to an html page
     if('html' in file_type):
         iframe_child_ans.append(iframe_size)
         child_get = urllib2.urlopen(new_url).read()
         child_dom = BeautifulSoup(child_get)
         object_data = child_dom.findAll('object')
         embed_data = child_dom.findAll('embed')
         for i in object_data:
             object_cnt = int(object_cnt)+1
         for i in embed_data:
             embed_cnt = int(embed_cnt)+1
         iframe_child_ans.append(object_cnt)
         iframe_child_ans.append(embed_cnt)
         iframe_src = iframe_ana.get('src')
         iframe_child_ans.append(iframe_src)
         return iframe_child_ans
     else:
         return "not_dynamic"
Example #22
0
	def _getAllHrefsFromPage(self, url, pageSource):
		'''Parse the HTML source and collect all links on the page; return the list of hrefs.'''
		#print 'ok3'
		hrefs = []
		soup = BeautifulSoup(pageSource)
		#print 'soup=',soup
		
		#print results
		# 1. links such as <a href="http://www.example.com"></a>
		results = soup.findAll('a',href=True)
		for a in results:
			# The link must be encoded as UTF-8: Chinese file links such as
			# http://aa.com/文件.pdf are not URL-encoded automatically by bs4,
			# which would otherwise cause an encoding exception.
			href = a.get('href').encode('utf8')
			if not href.startswith('http'):
				href = urljoin(url, href)  # resolve relative links
			if href not in hrefs:
				hrefs.append(href)
		
		# 2. forms such as <form action="http://www.example.com"></form>
		results = soup.findAll('form',action=True)
		for form in results:
			href = form.get('action').encode('utf8')
			if not href.startswith('http'):
				href = urljoin(url, href)  # resolve relative links
			if href not in hrefs:
				hrefs.append(href)

		return hrefs
Example #23
0
def mouthsnap_spider(max_pages):
    page = 1
    # Every review link shares this ASP.NET id prefix and differs only in the
    # two-digit ctlNN index.
    link_id = ('ctl00_ctl00_ctl00_ContentPlaceHolderHeader_ContentPlaceHolderFooter_'
               'ContentPlaceHolderBody_rptreviews_ctl%02d_lnkTitle')
    while page <= max_pages:
        url = 'http://www.mouthshut.com/product-reviews/Snapdeal-com-reviews-925602969-sort-MsDate-order-d-page-' + str(page)
        print (url)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for n in (3, 1, 2, 4, 5, 6, 7, 8, 9, 10):
            for link in soup.findAll('a', {'id': link_id % n}):
                href = link.get('href')
                title = link.string
                print (title)
        page += 1
Example #24
0
def parseHTML(html):
    # cooking some soup
    bsObj = BeautifulSoup(html,"html.parser")

    # the main results table is the second on the page
    resultsTable = bsObj.findAll("table")[1]

    # the results stats table is the third on the page
    statsTable = bsObj.findAll("table")[2]

    # get rows from the various tables in format that's
    # easy to iterate on
    resultsRows = scrapeTable(resultsTable)
    analysisRows = scrapeTable(statsTable)

    # outputting the main results table
    print "\n\n"
    for row in resultsRows:
        if len(row) > 0:
            print "%50s%5s%5s%5s"%(row[1],row[2],row[3],row[4])
            print "----------------------------------------------------------------------\n"

    print "\n\n"

    # outputting the results analysis table
    print "%40s%15s%15s"%(" ","SEMESTER","CUMULATIVE")
    for row in analysisRows:
        if len(row) > 0:
            print "%40s%15s%15s"%(row[1],row[2],row[3])
Example #25
0
def make_quiz(source, destination):
    """
    Extracting from bjc file
    """

    filename = source.rsplit('/', 1)[1]
    test_path = source
    soup = BeautifulSoup(open(test_path))

    """
    make sure this is a multiple choice quiz
    """

    if soup.find("div", { "class" : "prompt" }) == None:
        return

    prompt = ((soup.find("div", { "class" : "prompt" }).get_text()).encode('utf-8', "ignore")).strip()
    correct_answer_tag = soup.find("div", { "class" : "correctResponse" })
    correct_answer = ((soup.find(identifier=correct_answer_tag['identifier']).find("div", { "class" : "text" }).get_text()).encode('utf-8', "ignore")).strip()
    answer_list_unf = soup.findAll("div", { "class" : "text" })
    answer_list = []
    for a in answer_list_unf:
        answer_list.append(((a.get_text()).encode('utf-8', "ignore")).strip())

    feedback_list_unf = soup.findAll("div", { "class" : "feedback" })
    feedback_list = []
    for f in feedback_list_unf:
        feedback_list.append(((f.get_text()).encode('utf-8', "ignore")).strip())

    """
    Formatting for xml
    """

    xml_mul = ""
    for answer in answer_list:
        if answer == correct_answer:
            xml_mul += "<    choice correct=\"true\">" + str(answer) + "</choice>\n"
        else:
            xml_mul += "<    choice correct=\"false\">" + str(answer) + "</choice>\n"

    xml_out =     "<problem>\n" + \
                "<p>" + str(prompt) + "</p>\n" + \
                "<multiplechoiceresponse>\n" + \
                "  <choicegroup type=\"MultipleChoice\">\n" + \
                str(xml_mul) + \
                "  </choicegroup>\n" + \
                "</multiplechoiceresponse>\n\n" + \
                "<solution>\n" + \
                "<div class=\"detailed-solution\">\n" + \
                "<p>Explanation</p>\n" + \
                "<p>" + str(feedback_list[answer_list.index(correct_answer)]) + "</p>\n" + \
                "</div>\n" + \
                "</solution>\n" + \
                "</problem>\n"


    output = destination + '/problem/' + filename[:-5] + ".xml"
    # print(output)
    with open(output, 'w+') as xml_file:
        xml_file.write(xml_out)
Example #26
0
def TSTVShows(params):
	try:
		html = BeautifulSoup(re.sub('\s+', ' ', HTML(params['url'])))
		try:
			mode = params['search']
			tvshows = html.findAll('li')
			if len(tvshows) > 0:
				for tvshow in tvshows:
					XBMCItemAdd({'title':tvshow.a.string.encode('utf-8')},
						{
							'func' : 'TSSeasons',
							'title': tvshow.a.string.encode('utf-8'),
							'url'  : tvshow.a['href']
						})
				XBMCEnd()
			else:
				Noty('TS.KG', 'Видео не найдено', ImagePath('noty-tskg.png'))
		except:
			tvshows = html.findAll('div', attrs={'class':'categoryblocks'})
			if len(tvshows) > 0:
				for tvshow in tvshows:
					XBMCItemAdd({'title':tvshow.a.img['title'].encode('utf-8'), 'thumb':tvshow.a.img['src']},
						{
							'func' : 'TSSeasons',
							'title': tvshow.a.img['title'].encode('utf-8'),
							'url'  : tvshow.a['href'],
							'thumb': tvshow.a.img['src']
						})
				XBMCEnd()
			else:
				Noty('TS.KG', 'Видео не найдено', ImagePath('noty-tskg.png'))
	except:
		Noty('TS.KG', 'Сервер недоступен', ImagePath('noty-tskg.png'))
Example #27
0
def replaceURL(URL,OUTPUT):
	# Provide user feedback
	print("[+] Replacing URLs...")
	print("[+] URLs that will be replaced:")
	# Open source, read lines, and begin parsing to replace all URLs inside <a> tags with href
	try:
		# Print href URLs that will be replaced
		print("\n".join(re.findall('<a href="?\'?([^"\'>]*)', open(OUTPUT).read())))
		with open(OUTPUT, "r") as html:
			# Read in the source html and parse with BeautifulSoup
			soup = BeautifulSoup(html)
			# Find all links and replace URLs with our new text/URL
			for link in soup.findAll('a', href=True):
				link['href'] = '{{links.phishgate}}'
			for link in soup.findAll('link', href=True):
				link['href'] = urllib.parse.urljoin(URL, link['href'])
			for link in soup.findAll('script', src=True):
				link['src'] = urllib.parse.urljoin(URL, link['src'])
			source = soup.prettify()
			source = xml.sax.saxutils.unescape(source)
			# Write the updated URLs to the output file while removing the [' and ']
			output = open(OUTPUT, "w")
			output.write(source.replace('[','').replace(']',''))
			output.close()
			print("[+] URL parsing successful. URLs replaced.")
	except:
		print("[-] URL parsing failed. Make sure the html file exists and is readable.")
Example #28
0
def validate_jobposting(url):
    content = requests.get(url, verify=False).content
    soup = BeautifulSoup(content)

    # Look for any of the 3 types of JobPosting markups
    job_posting_found = []
    # Case 1: Microdata
    job_posting_found.append(
        soup.findAll('div', {'itemtype' : 'http://schema.org/JobPosting'})
    )

    # Case 2: RDFa
    job_posting_found.append(
        soup.findAll('div', {
            'vocab' : 'http://schema.org/',
            'typeof': 'JobPosting',
        })
    )

    # Case 3: JSON-LD
    ld_jsons = soup.findAll('script', {
        'type' : 'application/ld+json',
    })
    for ld in ld_jsons:
        ld_json = json.loads(ld.string)
        job_posting_found.append(ld_json.get("@type", '') == "JobPosting")

    return any(job_posting_found)
Example #29
0
def btdigg_page(query, sort, page):
    from bs4 import BeautifulSoup
    from xbmctorrent.utils import url_get

    html_data = url_get("%s/search" % BASE_URL, headers=HEADERS, params={
        "order": sort,
        "q": query,
        "p": page,
    })
    soup = BeautifulSoup(html_data, "html5lib")
    name_nodes = soup.findAll("td", "torrent_name")
    attr_nodes = soup.findAll("table", "torrent_name_tbl")[1::2]

    for name_node, attr_node in zip(name_nodes, attr_nodes):
        attrs = attr_node.findAll("span", "attr_val")
        title = "%s (%s, DLs:%s)" % (name_node.find("a").text, attrs[0].text, attrs[2].text)
        yield {
            "label": title,
            "path": plugin.url_for("play", uri=attr_node.find("a")["href"]),
            "is_playable": True,
        }
    yield {
        "label": ">> Next page",
        "path": plugin.url_for("btdigg_page", query=query, sort=sort, page=int(page) + 1),
        "is_playable": False,
    }
Example #30
0
def hores(direccio):
	llista_hores = []
	# The site splits the day's flights into four 6-hour blocks (tp=0, 6, 12, 18);
	# fetch each block and collect every flight time.
	for tp in (0, 6, 12, 18):
		url = "http://www.barcelona-airport.com/cat/" + direccio + ".php?tp=" + str(tp)
		sock = urllib.urlopen(url)
		pagina = BeautifulSoup(sock.read(), "lxml")
		linies_vol = pagina.findAll("div", {"id": "flight_detail"})
		for linea_vol in linies_vol:
			hores = linea_vol.findAll("div", {"id": "fhour"})
			for hora in hores:
				llista_hores.append(hora.text.strip())
	return llista_hores
Example #31
0
def AddSchool(team, abbr):
    # Reconstructed def line: the fragment showed only this body. The row index
    # is assumed here to be the running length of the list.
    index = len(IDX)
    IDX.append(index)
    A.append(team)
    B.append(abbr)


url = "https://www.reddit.com/r/CFB/wiki/abbreviations"

print("Scrape Abbreviations Tool")
print("**************************")
print("data is from {0}".format(url))
print("Directory Location: {0}".format(settings.data_path))
print("**************************")

with contextlib.closing(urlopen(url)) as page:
    soup = BeautifulSoup(page, "html5lib")
tables = soup.findAll("table")

IDX = []
A = []
B = []

# Add any Missing Teams Here
AddSchool("ALABAMA-BIRMINGHAM", "UAB")
AddSchool("ALABAMA A&M", "AAMU")
AddSchool("ALBANY-NY", "ALBY")
AddSchool("WESTERN KENTUCKY", "WKU")
# Add any Missing Teams Here
for row in tables[0].findAll("tr"):
    col = row.findAll('td')
    if len(col) > 0:
        tag = str(col[0].find(text=True)).strip()
Example #32
0
def get_rent_perregion(city, district):
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"zufang/%s/" % district
    # logging.info("checking url: %s", url)
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages == None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            url_page = baseUrl + u"zufang/%s/pg%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetRentByRegionlist", district, page + 1, total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict.update(
                        {u'title': housetitle.h2.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get("href")})
                    houseID = name.get("data-housecode")
                    info_dict.update({u'houseID': houseID})

                    region = name.find("span", {"class": "region"})
                    info_dict.update({u'region': region.get_text().strip()})

                    zone = name.find("span", {"class": "zone"})
                    info_dict.update({u'zone': zone.get_text().strip()})

                    meters = name.find("span", {"class": "meters"})
                    info_dict.update({u'meters': meters.get_text().strip()})

                    other = name.find("div", {"class": "con"})
                    info_dict.update({u'other': other.get_text().strip()})

                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway == None:
                        info_dict.update({u'subway': ""})
                    else:
                        info_dict.update(
                            {u'subway': subway.span.get_text().strip()})

                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration == None:
                        info_dict.update({u'decoration': ""})
                    else:
                        info_dict.update({
                            u'decoration':
                            decoration.span.get_text().strip()
                        })

                    heating = name.find("span", {"class": "heating-ex"})
                    if heating == None:
                        info_dict.update({u'heating': ""})
                    else:
                        info_dict.update(
                            {u'heating': heating.span.get_text().strip()})

                    price = name.find("div", {"class": "price"})
                    info_dict.update(
                        {u'price': int(price.span.get_text().strip())})

                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict.update(
                        {u'pricepre': pricepre.get_text().strip()})

                except Exception as e:
                    traceback.print_exc()
                    continue
                # Rentinfo insert into mysql
                data_source.append(info_dict)
                # model.Rentinfo.insert(**info_dict).upsert().execute()

        with model.database.atomic():
            if data_source:
                logging.info("checking rent info: %s", ''.join(data_source))
                logging.info("inserting rent info into db")
                model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
Example #33
0
def get_house_perregion(city, district):
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages == None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            url_page = baseUrl + u"ershoufang/%s/pg%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update(
                        {u'title': housetitle.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('data-housecode')
                    info_dict.update({u'houseID': houseID})

                    houseinfo = name.find("div", {"class": "houseInfo"})
                    if city == 'bj':
                        info = houseinfo.get_text().split('/')
                    else:
                        info = houseinfo.get_text().split('|')
                    info_dict.update({u'community': info[0]})
                    info_dict.update({u'housetype': info[1]})
                    info_dict.update({u'square': info[2]})
                    info_dict.update({u'direction': info[3]})
                    info_dict.update({u'decoration': info[4]})

                    housefloor = name.find("div", {"class": "positionInfo"})
                    info_dict.update({u'years': housefloor.get_text().strip()})
                    info_dict.update({u'floor': housefloor.get_text().strip()})

                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {u'followInfo': followInfo.get_text().strip()})

                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree == None:
                        info_dict.update({u"taxtype": ""})
                    else:
                        info_dict.update(
                            {u"taxtype": taxfree.get_text().strip()})

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'totalPrice': totalPrice.span.get_text()})

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update(
                        {u'unitPrice': unitPrice.get("data-price")})
                except Exception as e:
                    traceback.print_exc()
                    continue

                # Houseinfo insert into mysql
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID":
                    info_dict["houseID"],
                    "totalPrice":
                    info_dict["totalPrice"]
                })
                # model.Houseinfo.insert(**info_dict).upsert().execute()
                #model.Hisprice.insert(houseID=info_dict['houseID'], totalPrice=info_dict['totalPrice']).upsert().execute()

        with model.database.atomic():
            if data_source:
                model.Houseinfo.insert_many(data_source).upsert().execute()
            if hisprice_data_source:
                model.Hisprice.insert_many(
                    hisprice_data_source).upsert().execute()
        time.sleep(1)
Example #34
0
def get_community_perregion(city, regionname=u'xicheng'):
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    # logging.info('checking raw response')
    # print(source_code)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages == None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            url_page = baseUrl + u"xiaoqu/" + regionname + "/pg%d/" % page
            logging.info("fetching from %s", url_page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        # logging.info("querying for page %d content", page)
        nameList = soup.findAll("li", {"class": "xiaoquListItem"})
        # logging.info("checking community list length: %d", len(nameList))
        i = 0
        log_progress("GetCommunityByRegionlist", regionname, page + 1,
                     total_pages)
        data_source = []
        for name in nameList:  # Per house loop
            i = i + 1
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')
                link = communitytitle.a.get('href')
                info_dict.update({u'title': title})
                info_dict.update({u'link': link})

                district = name.find("a", {"class": "district"})
                info_dict.update({u'district': district.get_text()})

                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({u'bizcircle': bizcircle.get_text()})

                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({u'tagList': tagList.get_text().strip('\n')})

                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {u'onsale': onsale.span.get_text().strip('\n')})

                onrent = name.find("a", {"title": title + u"租房"})
                info_dict.update(
                    {u'onrent': onrent.get_text().strip('\n').split(u'套')[0]})

                info_dict.update({u'id': name.get('data-housecode')})

                price = name.find("div", {"class": "totalPrice"})
                info_dict.update({u'price': price.span.get_text().strip('\n')})

                communityinfo = get_communityinfo_by_url(link)
                for key, value in communityinfo.items():
                    info_dict.update({key: value})

                info_dict.update({u'city': city})
                # logging.info('community info: %s', json.dumps(info_dict))
            except Exception as e:
                traceback.print_exc()
                continue
            # communityinfo insert into mysql
            data_source.append(info_dict)
            # model.Community.insert(**info_dict).upsert().execute()
        with model.database.atomic():
            if data_source:
                # logging.info("checking data: %s", ''.join(data_source))
                # logging.info("inserting community info into db")
                model.Community.insert_many(data_source).upsert().execute()
                # logging.info("insertion succeeds")
        time.sleep(1)
Example #35
0
def get_sell_percommunity(city, communityname):
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"chengjiao/rs" + \
        urllib2.quote(communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages == None:
        row = model.Sellinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            url_page = baseUrl + \
                u"chengjiao/pg%drs%s/" % (page,
                                          urllib2.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        log_progress("GetSellByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "listContent"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update({u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('href').split("/")[-1].split(
                        ".")[0]
                    info_dict.update({u'houseID': houseID.strip()})

                    house = housetitle.get_text().strip().split(' ')
                    info_dict.update({u'community': communityname})
                    info_dict.update({
                        u'housetype':
                        house[1].strip() if 1 < len(house) else ''
                    })
                    info_dict.update({
                        u'square':
                        house[2].strip() if 2 < len(house) else ''
                    })

                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    info_dict.update({u'direction': info[0].strip()})
                    info_dict.update(
                        {u'status': info[1].strip() if 1 < len(info) else ''})

                    housefloor = name.find("div", {"class": "positionInfo"})
                    floor_all = housefloor.get_text().strip().split(' ')
                    info_dict.update({u'floor': floor_all[0].strip()})
                    info_dict.update({u'years': floor_all[-1].strip()})

                    followInfo = name.find("div", {"class": "source"})
                    info_dict.update(
                        {u'source': followInfo.get_text().strip()})

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    if totalPrice.span is None:
                        info_dict.update(
                            {u'totalPrice': totalPrice.get_text().strip()})
                    else:
                        info_dict.update({
                            u'totalPrice':
                            totalPrice.span.get_text().strip()
                        })

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    if unitPrice.span is None:
                        info_dict.update(
                            {u'unitPrice': unitPrice.get_text().strip()})
                    else:
                        info_dict.update(
                            {u'unitPrice': unitPrice.span.get_text().strip()})

                    dealDate = name.find("div", {"class": "dealDate"})
                    info_dict.update({
                        u'dealdate':
                        dealDate.get_text().strip().replace('.', '-')
                    })

                except Exception as e:
                    traceback.print_exc()

                    continue
                # Sellinfo insert into mysql
                data_source.append(info_dict)
                # model.Sellinfo.insert(**info_dict).upsert().execute()

        with model.database.atomic():
            if data_source:
                model.Sellinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
Example #36
0
for _ in range(10000):
    afterstring = htmlstring
    actions.send_keys(Keys.PAGE_DOWN).perform()
    htmlstring = browser.page_source
    if afterstring == htmlstring:
        print ("ended scraping crack test one")
        actions.send_keys(Keys.PAGE_DOWN).perform()
        htmlstring = browser.page_source
        if afterstring == htmlstring:
           print ("--Scrapping End--")
           break
    time.sleep(3)
    

#print(htmlstring)
textdoc = io.open("gmapreview.txt", "w", encoding="utf-8")
soup = BeautifulSoup(htmlstring,"lxml")
mydivs = soup.findAll("div", {"class": "section-review-content"})
counter = 0
for a in mydivs: 
    textdoc.write(str("\nReviewer name: "+a.find("div", class_="section-review-title").text)+" \n||Reviewer Detail: " + str(a.find("div", class_="section-review-subtitle").text) +" \n||Reviewerer Profile URL:"+ str(a.find("a").get('href')))
    
    textdoc.write(" \n||" + a.find("span", class_="section-review-text").text+" \n|| " + a.find("span", class_="section-review-publish-date").text)
    textdoc.write("=========================================\n")
    counter = counter + 1
print ("Total reviews scraped:"+str(counter))
textdoc.close()
    
#actions.send_keys(Keys.PAGE_DOWN).perform()

#browser.execute_script('')
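The fragment above assumes a Selenium session created earlier in the script; roughly (the variable names match the fragment, everything else is an assumption):

import io, time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

browser = webdriver.Chrome()
browser.get('https://www.google.com/maps/place/...')   # reviews panel to be scrolled
actions = ActionChains(browser)
htmlstring = browser.page_source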
Example #37
0
def change_label_number():
    strLabel = tk.Label(window, text='處理中...')
    strLabel.pack(anchor='center')
    window.update()
    global url
    global zipfileName
    global comboExample
    comboExampleget = fileTypeListbox.get(fileTypeListbox.curselection())
    url = 'https://www.fda.gov/MedicalDevices/ProductsandMedicalProcedures/DeviceApprovalsandClearances/510kClearances/ucm089428.htm'
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html5lib')
    OBDataUrl = {
        i.a.text: i.a['href']
        for i in soup.findAll('p') if i.find(text=re.compile('-'))
    }
    strLabel2 = tk.Label(window, text='Downloads 510K Data.')
    strLabel2.pack(anchor='center')
    window.update()
    for K, v in tqdm(OBDataUrl.items(),
                     total=len(OBDataUrl),
                     ascii=True,
                     desc='Downloads 510K Data.'):
        urllib.request.urlretrieve(v, K)
    strLabel3 = tk.Label(window, text='Downloads 510K Data Done.')
    strLabel3.pack(anchor='center')
    window.update()
    all510kdatalist = []
    for j in tqdm(OBDataUrl, ascii=True, desc='Loading 510K Data'):
        with zipfile.ZipFile(j, 'r') as zipFile:
            txtfile = j.lower().replace('.zip', '.txt')
            fileio = io.StringIO(zipFile.read(txtfile).decode('cp1252'))
            test01 = pd.read_csv(fileio, sep='|', encoding='utf8')
            all510kdatalist.extend(test01.to_dict('records'))
    strLabel4 = tk.Label(
        window, text='Loading 510K Data to {}'.format(comboExampleget))
    strLabel4.pack(anchor='center')
    window.update()
    all510kDF = pd.DataFrame(all510kdatalist)
    all510kDf = all510kDF.rename(dict(
        zip(all510kDF.columns, [i.title() for i in all510kDF.columns])),
                                 axis=1)
    with open('{}.txt'.format(str(len(all510kDF))), 'w') as txt:
        pass

    # print('510K extra-info merge')
    # # 510K extra-info merge
    # urllib.request.urlretrieve('http://www.accessdata.fda.gov/premarket/ftparea/foiclass.zip', 'foiclass.zip')
    # with zipfile.ZipFile('foiclass.zip', 'r') as zipFile:
    #     fileio = io.StringIO(zipFile.read('foiclass.txt').decode('cp1252'))
    #     test01 = pd.read_csv(fileio, sep='|', encoding='utf8')
    # test01.rename(
    #     {'REVIEW_PANEL': 'Reviewadvisecomm', 'PRODUCTCODE': 'Productcode', 'DEVICENAME': 'DEVICENAME_ADJ'},
    #     axis=1, inplace=True)
    # full510k = pd.merge(all510kDf, test01, how='left', on=['Reviewadvisecomm', 'Productcode'])
    try:
        filetypesSelect(all510kDf, '510k', comboExampleget, DateTimeSTR)
        window.quit()
    except Exception:
        window2 = tk.Tk()
        window2.title('Error message')
        window2.geometry('400x300')
        error_Text = ''
        e_type, e_value, e_traceback = sys.exc_info()
        error_Text += f'''The error message is as follows:
                        Errortype ==> {e_type.__name__}
                        ErrorInfo ==> {e_value}
                        ErrorFileName ==> {e_traceback.tb_frame.f_code.co_filename}
                        ErrorLineOn ==> {e_traceback.tb_lineno}
                        ErrorFunctionName ==> {e_traceback.tb_frame.f_code.co_name}'''
        with open('errorFileLog.log', 'w+') as errorFileLog:
            errorFileLog.write(error_Text)
        strLabel2 = tk.Label(window2,
                             text='{}\n{}\n{}'.format(e_type, e_value,
                                                      e_traceback))
        strLabel2.pack(anchor='center')
        window2.mainloop()

    finally:
        pass
Example #38
0
# Create an iterator that will cycle through the first 16 articles and skip a few
listIterator = []
listIterator[:] = range(2, 16)
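# Assumed context for this fragment (Python 2): findPatTitle and findPatLink are lists of
# article titles and URLs collected earlier (e.g. from an RSS feed), urlopen comes from
# urllib2, and cleanHtml/cleanHtmlRegex are small helpers that strip tags from a paragraph.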

# Print out the results to screen
for i in listIterator:
    print '<h3>' + findPatTitle[i] + '</h3><br />'  # The title
    print "<a href ='" + findPatLink[
        i] + "'>Original Article</a><br />"  # The link to the original article
    print '\n'
    articlePage = urlopen(
        findPatLink[i]).read()  # Grab all of the content from the original article

    divBegin = articlePage.find('<div>')  # Locate the div provided
    article = articlePage[divBegin:(
        divBegin + 1000)]  # Copy the first 1000 characters after the div

    # Pass the article to the Beautiful Soup module
    soup = BeautifulSoup(article)

    # Tell Beautiful Soup to locate all of the p tags and store them in a list
    paragList = soup.findAll('p')

    # Print all of the paragraphs to screen
    for paragraph in paragList:
        # paragraph = cleanHtml(paragraph)
        paragraph = cleanHtmlRegex(paragraph)
        print paragraph + '<br />'

print '<br /></body></html>'
Example #39
0
#encoding=utf8
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request
User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
header = {}
header['User-Agent'] = User_Agent

url = 'http://www.kxdaili.com/ipList/1.html#ip'
req = Request(url,headers=header)
res = urlopen(req).read()

soup = BeautifulSoup(res)
ips = soup.findAll('tr')
f = open("../src/proxy.txt", "w")

for x in range(1, len(ips)):
    ip = ips[x]
    tds = ip.findAll("td")
    ip_temp = tds[0].contents[0] + "\t" + tds[4].contents[0] + "\n"
    # print tds[2].contents[0]+"\t"+tds[3].contents[0]
    f.write(ip_temp)
f.close()
Example #40
0
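# A minimal sketch of the imports this class relies on (not included in the snippet):
#   import requests
#   from bs4 import BeautifulSoup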
class Page():
    def __init__(self):
        self.url = ''
        self.html = ''
        self.Ps = []
        self.Spans = []
        self.H1s = []
        self.H2s = []
        self.H3s = []
        self.H4s = []
        self.H5s = []
        self.H6s = []
        self.getalldone = False
        self.links = []
        self.textlist = []

    #run all the main methods in the required order
    def process(self, url):
        self.seturl(url)
        self.extract_html_page()
        self.getAllText()  # getAllText() already calls getLinks(), so links are not collected twice
        self.cleantext()

    #allow to set the url to parse
    def seturl(self, url):
        #add url analysis to pre check
        self.url = url

    #extract HTML content using request and BeautifulSoup
    def extract_html_page(self):
        response = requests.get(self.url)
        self.html = BeautifulSoup(response.content, "html.parser")

    #extract all the <p> paragraphs
    def getP(self):
        AllP = self.html.findAll('p')
        for P in AllP:
            self.Ps.append(P.get_text())

    #extract all the <span> elements
    def getSpan(self):
        Allspan = self.html.findAll('span')
        for span in Allspan:
            self.Spans.append(span.get_text())

    #extract all the <hn> titles
    def getH(self):
        for n in range(1, 7):
            AllH = self.html.findAll('h' + str(n))
            for H in AllH:
                getattr(self, f"H{n}s").append(H.get_text())

    #extract all P, span and Hn
    def getAllText(self):
        if not self.getalldone:
            self.getP()
            self.getSpan()
            self.getH()
            self.getLinks()
            self.getalldone = True

    #combine all the text in one list
    def getAllTextCombined(self):
        self.getAllText()
        Alltext = []

        for n in range(1, 7):
            # exec() cannot rebind a local variable inside a function, so use getattr
            AllH = getattr(self, f"H{n}s")
            for H in AllH:
                Alltext.append(H)

        for span in self.Spans:
            Alltext.append(span)
        for P in self.Ps:
            Alltext.append(P)
        return Alltext

    #extract all the links in the page
    def getLinks(self):
        for link in self.html.findAll(
                'a'):  #attrs={'href': re.compile("^http://")}):
            #print(link.get('href'))
            self.links.append(link.get('href'))

    #put the text into a list of sentence
    def cleantext(self):
        self.textlist = [
            ' '.join(x.split()) for x in self.getAllTextCombined()
        ]
Example #41
0
soup = BeautifulSoup(page.content, 'html.parser')
find_links=soup.find_all('a')
finalarray=[]
finallinkdict={}
for link in find_links:
    if len(link.text)>0:
        if "INTRODUCING AWS" in str(link.text).upper() or "INTRODUCING AMAZON" in str(link.text).upper():
            finallinkdict["https:"+str(link.attrs['href'])]={"title":link.text,"summary":""}

#we want summarization as well. Take the links, iterate through them, get the text on the page, put it
#in to the summarizer, and spit out a dictionary.
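# `page`, `year` and `summarizer` are defined earlier in the full script. Judging by the
# summarizer(...)[0]['summary_text'] call below, `summarizer` is most likely a Hugging Face
# pipeline - an assumption, e.g.:
#   from transformers import pipeline
#   summarizer = pipeline("summarization")
# The snippet also assumes `import requests, csv` and `from bs4 import BeautifulSoup`.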

for key in finallinkdict:
    temppage=requests.get(key)
    tempsoup=BeautifulSoup(temppage.content,'html.parser')
    paralist=tempsoup.findAll('p')
    tempsummary=""
    for paragraph in paralist:
        if paragraph.text.count('.')>0:
            tempsummary+=paragraph.text+" "
    fullsummary=summarizer(tempsummary)[0]['summary_text']
    finallinkdict[key]['summary']=fullsummary


finalarray.append(["Title","Summary","URL"])
for key in finallinkdict:
    temparray=[finallinkdict[key]['title'].replace(","," "),finallinkdict[key]['summary'].replace(","," "),key]
    finalarray.append(temparray)

with open("AWS_"+year+"_NEW_SERVICES.csv","a") as f:
    writer=csv.writer(f)
Example #42
0
    if(len(email) > 1):
        print(email[1])
        return email[1]
    return ""


# driver = webdriver.Firefox()
# driver.get('https://hls.harvard.edu/faculty/directory/?l=l')
# print(driver.find_element_by_class_name("faculty-detail-link"))
# content = driver.page_source
f = open("polis.txt", "a")
res = requests.get("https://www.polis.cam.ac.uk/Staff_and_Students/academic-staff")
content = res.content
# print(content)
soup = BeautifulSoup(content)
divs = soup.findAll('div', attrs={'class': 'emailAddress'})
emails = []
# print(divs)
for div in divs:
    # name = a.find('div', attrs={'class':'sfljd'})
    try:
        text = div.find('a').get('href')
        email = print_mail(text)
        emails.append(email)
        f.write(email + "\n")
    except Exception:
        print("can't read")

# print(emails)

# loop = asyncio.get_event_loop()
Example #43
0
##Alex Gagliano
##10/24/2016
##Script for scraping the major OSS projects on OpenHub and pulling their ActivityFacts Objects

from bs4 import BeautifulSoup
import urllib

wf = open('OpenDuckProjects_ActivityScrape.txt', 'a')
projectNames = list()
r = urllib.urlopen('https://www.openhub.net/').read()
soup = BeautifulSoup(r, "lxml")
Soup1 = soup.findAll("div", {"class": "top_ten_link"})[11:20]
for ana in Soup1:
    projectNames.append(str(ana.a.get('href')).replace("/p/", ""))

for name in projectNames:

    #pull data from webpage
    r = urllib.urlopen(
        'https://www.openhub.net/projects/' + name +
        '/analyses/latest/activity_facts.xml?api_key=d32768dd2ec65efd004d19a9f3c7262d7f30cd8959d9009ce4f9b8e7e19ff0ef&v=1'
    ).read()
    soup = BeautifulSoup(r, "lxml")

    for item in soup.findAll('activity_fact'):
        tempDate = str(item('month')[0].text)
        tempCommentsA = str(item('comments_added')[0].text)
        tempCommentsR = str(item('comments_removed')[0].text)
        tempCodeA = str(item('code_added')[0].text)
        tempCodeR = str(item('code_removed')[0].text)
        tempCommits = str(item('commits')[0].text)
Example #44
0
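# A hedged sketch of the SQS setup this worker assumes (not shown in the snippet):
#   import boto3, requests
#   from bs4 import BeautifulSoup
#   sqs = boto3.client('sqs')
#   queue_url = sqs.get_queue_url(QueueName='planet-urls')['QueueUrl']  # queue name is a guess
# Each message body is expected to contain the URL of a Wikipedia planet page.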
print("Opened queue: %s" % queue_url)

while True:
    print("Attempting to receive messages")
    response = sqs.receive_message(QueueUrl=queue_url,
                                   MaxNumberOfMessages=1,
                                   WaitTimeSeconds=1)
    if 'Messages' not in response:
        print("No messages")
        continue

    message = response['Messages'][0]
    receipt_handle = message['ReceiptHandle']
    url = message['Body']

    # parse the page
    html = requests.get(url)
    bsobj = BeautifulSoup(html.text, "lxml")

    # now find the planet name and albedo info
    planet = bsobj.findAll("h1", {"id": "firstHeading"})[0].text
    albedo_node = bsobj.findAll("a", {"href": "/wiki/Geometric_albedo"})[0]
    root_albedo = albedo_node.parent
    albedo = root_albedo.text.strip()

    # delete the message from the queue
    sqs.delete_message(QueueUrl=queue_url, ReceiptHandle=receipt_handle)

    # print the planet's name and albedo info
    print("%s: %s" % (planet, albedo))
Example #45
0
import requests
from bs4 import BeautifulSoup

sach = {'q': 'Python', 'users': '1000'}
url = 'http://b.hatena.ne.jp/search/text'
req = requests.get(url, params=sach, timeout=15)
print(req)

soup = BeautifulSoup(req.text, 'html.parser')

bookmarks = []

for b in soup.findAll('h3', {'class': ''}):
    title = b.find('a').get('title')
    url = b.find('a').get('href')
    bookmarks.append([title, url])

print(bookmarks)
Example #46
0
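# Assumed setup for this wallhaven.cc crawler (not part of the snippet):
#   import re, time, random, requests, chardet
#   from bs4 import BeautifulSoup
#   header = {'User-Agent': 'Mozilla/5.0'}  # exact contents are an assumption
# Only the names above actually appear in the code below.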
kw = 'pokemon'  # change the search keyword here
limit = 3  # number of images to grab - go easy on your eyes - the later wallpapers are honestly not worth looking at...

root_path = r'.\imgout\\' + str(kw)
j = 1  # global counter

# page loop over the main search results, ordered by favorites
for i in range(1, 50):
    url = 'https://wallhaven.cc/search?q=' + kw + '&categories=111&purity=100&sorting=favorites&order=desc&page=' + str(
        i)
    # URL parameters: sorted by popularity, descending
    html = requests.get(url, headers=header)
    html.encoding = chardet.detect(html.content)['encoding']
    text = html.text
    soup = BeautifulSoup(html.text, "html.parser")
    data = soup.findAll(name='a', attrs={"href": re.compile(r'^https://.*(w\/).*')})
    # first pass: collect the links to every image on the page
    if not data:
        continue  # to save time, skip this page if the search returned nothing

    for sn in data:
        time.sleep(random.randint(1, 5))  # random delay
        url1 = str(sn['href'])
        html1 = requests.get(url1, headers=header)
        html1.encoding = chardet.detect(html1.content)['encoding']
        text1 = html1.text
        soup1 = BeautifulSoup(html1.text, "html.parser")
        data1 = soup1.findAll(name='img', attrs={"src": re.compile(r'^https://.*jpg$')})
        # build the second-level URL from the listing URL to get the image's full address
        if not data1:
            continue
Example #47
0
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'http://www.runoob.com/html/html-intro.html'
response = requests.get(url)
html = response.text.encode(response.encoding).decode()
# print(html)
soup = BeautifulSoup(html, 'lxml')
l = [x.text for x in soup.findAll('h2')]
# print(l)
df = pd.DataFrame(l, columns=[url])
# link = soup.findAll('a')[1]
# link.has_attr('href')
# print(link)
# print(link.attrs['href'])
links = [i for i in soup.findAll('a')
         if i.has_attr('href') and i.attrs['href'][0:5] == '/html']
# print(links)
relative_urls = set([i.attrs['href'] for i in links])
# print(relative_urls)
# relative_urls.to_excel('url.xlsx')
absolute_urls = {'http://www.runoob.com' + i for i in relative_urls}
absolute_urls.discard(url)
# print(absolute_urls)

for i in absolute_urls:
    ri = requests.get(i)
    soupi = BeautifulSoup(ri.text.encode(ri.encoding), "lxml")
    li = [x.text for x in soupi.findAll('h2')]
    dfi = pd.DataFrame(li, columns=[i])
    df = df.join(dfi, how='outer')
Example #48
0
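# Python 2 example. It assumes imports and helpers defined elsewhere in the project:
# urllib2, socket, random, time, BeautifulSoup, a list of header dicts `hds`, plus
# exception_write() and gen_xiaoqu_insert_command(), and `db_xq` being a database wrapper
# with an execute() method. These names are taken from the calls below, not shown here.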
def xiaoqu_page_search(db_xq,url_page=u"https://gz.lianjia.com/zufang/tianhe/pg1/"):

    trytimes = 0
    while 1:
        try:
            req = urllib2.Request(url_page,headers=hds[random.randint(0,len(hds)-1)])
            source_code = urllib2.urlopen(req,timeout=10).read()
            plain_text=unicode(source_code)#,errors='ignore')   
            soup = BeautifulSoup(plain_text)
        except socket.timeout as e:
            if trytimes < 5:
                time.sleep(3)
                trytimes += 1
                continue
            else:
                print e
                exception_write(e, 'xiaoqu_page_search', url_page)
                return 
        except (urllib2.HTTPError, urllib2.URLError) as e:
            print e
            exception_write(e, 'xiaoqu_page_search', url_page)
            return 
        except Exception as e:
            print e
            exception_write(e, 'xiaoqu_page_search', url_page)
            return
        
        human = soup.find('div',{'class':'human'})
        
        if not human:
            break
        else:
            print "block && wait"
            time.sleep(600)
            trytimes = 0
    
    xiaoqu_list = soup.findAll('li',{'data-el':'zufang'})
    
    j = 0
    for j in range(len(xiaoqu_list)):
        xq = xiaoqu_list[j]
        try:
            info_dict = {}

            where = xq.find('div',{'class':'where'})
            href = unicode(where.a['href'])
            longname = where.a.text
                
            info_dict[u'href'] = href
            info_dict[u'name'] = longname.strip()
    
        except Exception as e:
            print e
            exception_write(e, 'xiaoqu_page_search', str(j))
            continue
        
        print j
        try:
            command = gen_xiaoqu_insert_command(info_dict)
            db_xq.execute(command)
        except Exception as e:
            print e
            exception_write(e, 'xiaoqu_page_search_db', str(j))
            continue
Example #49
0
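# Assumed context for this crawler (not shown in the snippet): requests, re, hashlib,
# os and datetime imports, BeautifulSoup with the html5lib parser, a compiled regex
# PATTERN_URL_PAGE_NUMBER, an isRobotDetected() helper, an os.environ['DIR-WORKING']
# dump directory, and `db` behaving like a MongoDB collection (find_one / insert).
# Treat these as inferences from the code, not documented facts.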
def crawlSearchPage(nextPage, db):
    foundItems = set()

    while True:
        pageNumberMatch = PATTERN_URL_PAGE_NUMBER.search(nextPage)
        if pageNumberMatch:
            print("Mining: {}".format(pageNumberMatch.group(1)))
        else:
            print("Mining: 1 (initial)")

        try:
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:48.0) Gecko/20100101 Firefox/48.0'
            }
            html = requests.get(nextPage, headers=headers)
            if isRobotDetected(html.content.decode('utf-8')):
                return
        except Exception as e:
            print("Failed to urlopen with {}".format(e))
            return

        htmlContent = html.content
        hexDigestOfNextPage = hashlib.md5(nextPage.encode('utf-8')).hexdigest()
        print("{}: {}".format(hexDigestOfNextPage, nextPage))
        with open(
                os.environ['DIR-WORKING'] +
                "/{}.html".format(hexDigestOfNextPage), 'w') as outfile:
            outfile.write(htmlContent.decode("utf-8"))

        bsObj = BeautifulSoup(htmlContent, "html5lib")

        # iterate inside of the single search page, over items on the page
        for itemDiv in bsObj.findAll(
                "div", {"class": re.compile(".*x-body--resultitem.*")}):
            # TODO: check with md5 for changes in the block
            summary = itemDiv.findAll(
                "div", {"class": re.compile(".*g-col-9.*")})[0].get_text()

            aList = itemDiv.findAll("a",
                                    {"class": re.compile(".*result-item.*")})

            try:
                a = aList.pop()
            except Exception as e:
                print("Failed on URL: {} with {}".format(nextPage, e))
                continue

            id_ = a.attrs["data-ad-id"]

            if db.find_one({"id": id_}):
                # we assume, that the items on the search page was ordered by creation time.
                # as consequence, break out as soon as we see a known item.
                return foundItems
                # continue

            item = {
                "id": id_,
                "isMined": False,
                "summary": summary,
                "firstSeenOn": datetime.now(),
                "uri": a.attrs["href"]
            }

            db.insert(item)
            foundItems.add(a)

        # get the url to the next page of search results
        nextPageSpan = bsObj.findAll(
            "span", {"class": re.compile(".*next-resultitems-page.*")})
        if len(nextPageSpan) <= 0:
            return foundItems

        nextPage = nextPageSpan[0].attrs["data-href"]

    return foundItems
Example #50
0
    def get_data(self):
        return ''.join(self.fed)
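
# The top of this example is missing; get_data() above is a method of an MLStripper class.
# A plausible sketch of that class, based on the common html.parser recipe (an assumption,
# not the author's original code):
#   from html.parser import HTMLParser
#   class MLStripper(HTMLParser):
#       def __init__(self):
#           super().__init__()
#           self.fed = []
#       def handle_data(self, d):
#           self.fed.append(d)
# Also assumed: import requests and from bs4 import BeautifulSoup.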


def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()


r = requests.get(
    "https://marvelcinematicuniverse.fandom.com/wiki/J.A.R.V.I.S./Quote")

soup = BeautifulSoup(r.text, 'html.parser')

quotes = soup.findAll("div", {"class": "quote"})

quotes_dict = {"quotes": []}

with open("jarvis_quotes.json", "w") as file:
    for quote in quotes:
        inner_quote = quote.dl.dd.span.i

        quotes_list = str(inner_quote).split("<br/>")
        for q in quotes_list:
            quotes_dict["quotes"].append({
                "quote":
                strip_tags(q).replace("\"", ""),
                "type":
                0
            })
Example #51
0
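# Fragment of a larger course-downloader class, apparently built on a mechanize-style
# browser exposed as self.br. It also assumes re and time imports, a self.url dict of
# endpoint templates, self.courses / self.current_course_num / self.cfg state, a `color`
# helper for terminal colours, and the getFile/getTool/getNote/getEssay/printStatus
# methods referenced below. The dict(... .items() + ...) call means this is Python 2 code.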
	def openFolder(self, folder_id, path='', parent='root'):

		response = self.br.open('%s%s' % (self.url['folder'], folder_id)).read()

		soup = BeautifulSoup(response, 'html.parser')
		data = soup.findAll('a', href=re.compile('^/(Folder|File|note|essay|LearningToolElement)/'))

		course_num = self.current_course_num
		course_data = self.courses[course_num]

		items = {}

		for item in data:
			item_link = item.get('href')
			item_name = item.text.strip()
			item_id = re.findall(r'\d+', item_link)[0]
			
			if item_id == parent:
				continue

			item_type = 'unknown'
			if 'Folder' in item_link: item_type = 'folder'
			if 'File' in item_link: item_type = 'file'
			if 'note' in item_link: item_type = 'note'
			if 'essay' in item_link: item_type = 'essay'
			if 'LearningToolElement' in item_link: item_type = 'tool'

			more_info = False

			if item_type == 'folder':
				course_data['num_folders'] = course_data['num_folders'] + 1
				new_path = path + '/' + item_name
				more_info = self.openFolder(item_id, new_path, folder_id)
			elif item_type == 'file':
				more_info = self.getFile(item_id)
			elif item_type == 'tool':
				more_info = self.getTool(item_id)
			elif item_type == 'note':
				more_info = self.getNote(item_id)
			elif item_type == 'essay':
				more_info = self.getEssay(item_id)
			
			act = u'Åpner' if item_type == 'folder' else 'Leser'
			stat = '%s \'%s\' (%s)' % (act, item_name, item_type)

			if not more_info:
				self.printStatus(stat + ' [' + color.yellow + 'uten innhold' + color.end + ']')
				continue
			
			self.printStatus(stat)

			items[item_id] = { 'data': [item_name, item_type], 'path': path }

			if len(more_info) > 0:
				items[item_id] = dict(items[item_id].items() + more_info.items())

			course_data['max_item_len'] = max(course_data['max_item_len'], len(u''+item_name))
			course_data['tree_depth'] = max(course_data['tree_depth'], path.count('/'))
			if item_type in ['file', 'note', 'essay', 'tool']:
				course_data['num_items'] = course_data['num_items'] + 1

			time.sleep(self.cfg['loop_delay'])

		self.courses[course_num] = course_data
		
		return items
Example #52
0
# Naver real-time trending search crawler

from bs4 import BeautifulSoup
import requests
from datetime import datetime

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
url = "https://datalab.naver.com/keyword/realtimeList.naver?age=20s"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

results = soup.findAll('span', 'item_title')

print(response.text)

search_rank_file = open("rankresult.txt", "a")

print(datetime.today().strftime("Real-time trending search rankings for %Y-%m-%d.\n"))

rank = 1
for result in results:
    search_rank_file.write(str(rank) + ": " + result.get_text() + "\n")
    print(rank, ": ", result.get_text(), "\n")
    rank += 1
Example #53
0
import urllib.request
from bs4 import BeautifulSoup

print ("Collecting data from IMDb charts....\n\n\n")
print ("The current top 15 IMDB movies are the following: \n\n")
response = urllib.request.urlopen("http://www.imdb.com/chart/top")
html = response.read()
soup = BeautifulSoup(html, 'html.parser')
mytd = soup.findAll("td", {"class":"titleColumn"})
for titles in mytd[:15]:
    print (titles.find('a').text)
print ("\n\nThank you for using IMDB script ...")
Example #54
0
#twitter crawling
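# Assumed imports for this snippet (not shown):
#   import urllib.request
#   from bs4 import BeautifulSoup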

theurl = "https://twitter.com/realDonaldTrump"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")

# find the <title> tag and print it
#print(soup.title.text) # this works too
print(soup.find("title").text)

# find the <a> tags.
#print(soup.findAll('a'))
""""
# <a> 태그의 href 속성의 값을 출력한다.
for link in soup.find_all('a'):
    #print(link.get('href'))
    print(link.text) # text는 text부분을 가져오는 듯..
"""

# when the markup is <div class="ProfileHeaderCard">
#print(soup.find('div',{"class":"ProfileHeaderCard"})) # select the div whose class attribute value is ProfileHeaderCard like this
#print(soup.find('div',{"class":"ProfileHeaderCard"}).find('p')) # .find('p') locates the <p> tag inside it
#print(soup.find('div',{"class":"ProfileHeaderCard"}).find('p').text) # .text extracts the text of that <p> tag

i = 1
for tweets in soup.findAll('div', {"class": "content"}):
    print(i, ": ", tweets.find('p'))
    #print(i , ": ", tweets.find('p').text)
    i = i + 1
Example #55
0
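# Fragment of a scraper class in the style of a Kodi video add-on provider. Assumed (not
# shown here): imports for re, json, urlparse and log_utils, self.base_link and
# self.search_link templates, and self.scraper being a cloudscraper/cfscrape-style session
# whose .get()/.post() calls return objects with a .content attribute. These are inferences
# from how the names are used below.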
    def sources(self, url, hostDict, hostprDict):
        try:
            sources = []
            if url == None or len(
                    url
            ) == 0:  # if no link returned in movie and tvshow searches, nothing to do here, return out.
                log_utils.log('inif?')
                return sources

            # Grab title and year (cant use imdb code here)
            url = urlparse.parse_qs(url)
            title = url['title'][0]
            year = url['year'][0]

            # Create search link
            searchlink = self.search_link = self.search_link + title + ' ' + year
            url = urlparse.urljoin(self.base_link, searchlink)

            html = self.scraper.get(url).content  # Get the HTML for the page
            soup = BeautifulSoup(html, "html.parser")
            # Find all search results and add to array
            results = soup.findAll("div", {"class": "result-item"})
            result_links = []
            for result in results:
                result_links.append(result.find("a", href=True)['href'])

            # Go over search results and find their sources
            for result_link in result_links:
                html = self.scraper.get(result_link).content
                soup = BeautifulSoup(html, "html.parser")
                javascripts = soup.findAll("script",
                                           {"type": "text/javascript"})
                # Lets keep going until we find the one we need
                ids_b64s = []
                for javascript in javascripts:
                    javascript = str(javascript)
                    if "var Player" in javascript and "LoadPlayer" in javascript:
                        # This is the right script
                        # Get the jwplayer-id
                        jw_id = 'jwplayer-' + re.search(
                            'jwplayer-(\d+)', javascript).groups(0)[0]
                        # Get weird b64 string
                        b64_string = re.search('(?<=jwplayer)(.*)(?="\);)',
                                               javascript).groups(0)[0]
                        # Parse into just the b64
                        b64_string = b64_string.split('","')[1]
                        ids_b64s.append([jw_id, b64_string])
                        break
            # Go get the video links
            for id_b64 in ids_b64s:
                the_id = id_b64[0]
                the_b64 = id_b64[1]
                post = {'id': the_id, 'data': the_b64}  # use this entry's own b64 payload
                html = self.scraper.post(urlparse.urljoin(
                    self.base_link, '/wp-content/plugins/apiplayer/load.php'),
                                         data=post).content
                soup = BeautifulSoup(html, 'html.parser')
                javascripts = soup.findAll("script",
                                           {"type": "text/javascript"})
                links_qual = []
                for javascript in javascripts:
                    javascript = str(javascript)
                    if ").setup({" in javascript:
                        # This script contains the stuff
                        files = re.search('(?<=sources: \[)(.*)(?=])',
                                          javascript).groups()[0]
                        files = "[" + files + "]"
                        files = json.loads(files)
                        for f in files:
                            quality = f['label']
                            link = f['file']
                            links_qual.append([link, quality])

            for l_q in links_qual:
                link = l_q[0]
                quality = l_q[1]
                host = link.split('//')[1].replace('www.', '').split('/')[0]
                info = ''
                sources.append({
                    'source': host,
                    'quality': quality,
                    'language': 'en',
                    'url': link,
                    'info': info,
                    'direct': True,
                    'debridonly': False
                })
            return sources
        except Exception as e:
            log_utils.log('EXCEPTION MSG: ' + str(e))
            return sources
Example #56
0
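# Assumed imports for this example (not included in the snippet):
#   import requests
#   import networkx as nx
#   from bs4 import BeautifulSoup, Comment, NavigableString
#   from plotly.graph_objs import Scatter  # assumption; could also be plotly.graph_objects
#   # import matplotlib.pyplot as plt      # only needed for the commented-out plt.show()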
def init_global_variables(website):
  r = requests.get(website)
  #r = requests.get("https://www.autocarindia.com")
  output = r.text

  soup = BeautifulSoup(output, 'lxml')

  #The below two lines of code will extract the comments out of the code
  for element in soup(text=lambda text: isinstance(text, Comment)):
    element.extract()
  result = soup.findAll("html") #The result will point to the top node <html>

  G = nx.DiGraph() #Empty Graph with no nodes and no edges.

  G.add_node(result[0].name)  # result[0].name --> html
  parent = result[0]      #<html><head></head><body><div></div><p></p></body></html>
  parents = [parent]      #[<html><head></head><body><div></div><p></p></body></html>]
  labels=[parent.name]    #['html']
  edges = []
  i = 0
  for parent in parents:
    if hasattr(parent, 'contents'):
      for child in parent.contents:
        #These 2 lines will take out the extra string present as a node
        if isinstance(child, NavigableString):
          continue

        if child.name != None:
          node_name = child.name+str(i)
        else:
          node_name = 'string'+str(i)
        i = i + 1
        G.add_node(node_name)
        G.add_edge(parent.name,node_name)
        x = (parent.name,node_name)
        #print(parent.name,node_name)
        #print("")
        #print(str(child.name) + " ---> " + str(child.contents))
        toadd = ""
        for abc in child.contents:
          #print(str(type(abc)) +" -----> " + str(abc.string))
          if isinstance(abc, NavigableString):
            #print("TOADD ----> " +  str(abc.string))
            toadd = toadd + str(abc.string)
       
        if child.name != None:
          element1 = str(child.name) + ': ' + node_name
          child.name = node_name
        else:
          element1 = toadd
        if hasattr(child, 'attrs'):
          for item in child.attrs:
            # print(item,child.attrs[item])
            # input()
            element1 = element1 + '<br>' + '&nbsp; &nbsp;' + item+':' + '&nbsp;' + str(child.attrs[item])
        
        if child.string != None:
          element1 = element1 + '<br>' + '&nbsp; &nbsp;' + 'string'+':' + '&nbsp;' + str(child.string)
        elif toadd != "":
          element1 = element1 + '<br>' + '&nbsp; &nbsp;' + 'string'+':' + '&nbsp;' + toadd
        labels.append(element1)
        edges.append(x)
        parents.append(child)
        #print(parent.name,node_name)
  pos = nx.spiral_layout(G)

  # nx.draw(G,pos,with_labels=True, font_weight='bold')
  # print(parents)
  # plt.show()

  g=nx.Graph()
  g.add_nodes_from(parents)
  g.add_edges_from(edges) # E is the list of edges

  pos=nx.fruchterman_reingold_layout(g)

  # This part of the code eliminates the extra nodes that are present in the graph
  # I didn't know if you want those extra nodes or not..

  N = len(parents) # ?
  counter = 0
  Xv = []
  Yv = []
  for k in pos.keys():
    if(counter>=N):
      Xv.append(pos[k][0])
      Yv.append(pos[k][1])
    counter+=1

  Xed=[]
  Yed=[]
  for edge in edges:
    Xed+=[pos[edge[0]][0],pos[edge[1]][0],None]
    Yed+=[pos[edge[0]][1],pos[edge[1]][1],None]




  trace3=Scatter(x=Xed,
                 y=Yed,
                 mode='lines',
                 line=dict(color='rgb(210,210,210)', width=1),
                 hoverinfo='text'
                 )
  trace4=Scatter(x=Xv,
                 y=Yv,
                 mode='markers',
                 name='net',
                 marker=dict(symbol='circle-dot',
                               size=5,
                               color='#6959CD',
                               line=dict(color='rgb(50,50,50)', width=0.5)
                               ),
                 text=labels,
                 hoverinfo='text'
                 )

  globals()['trace3']=trace3
  globals()['trace4']=trace4
  globals()['G']=G
  globals()['g']=g
  globals()['pos']=pos
  globals()['parents']=parents
  globals()['labels']=labels
  globals()['edges']=edges
  globals()['Xv']=Xv
  globals()['Yv']=Yv
Example #57
0
class AbstractScraper:
    class Decorators:
        """
        Define decorators for AbstractScraper methods here.
        """
        @staticmethod
        def schema_org_priority(decorated):
            """
            Use SchemaOrg parser with priority (if there's data in it)
            On exception raised - continue by default.
            If there's no data (no schema implemented on the site) - continue by default
            """
            @functools.wraps(decorated)
            def schema_org_priority_wrapper(self, *args, **kwargs):
                function = getattr(self.schema, decorated.__name__)
                if not function:
                    raise SchemaOrgException(
                        "Function '{}' not found in schema".format(
                            decorated.__name__))

                if not self.schema.data:
                    return decorated(self, *args, **kwargs)

                try:
                    value = function(*args, **kwargs)
                except SchemaOrgException:
                    return decorated(self, *args, **kwargs)
                return value or decorated(self, *args, **kwargs)

            return schema_org_priority_wrapper

        @staticmethod
        def bcp47_validate(decorated):
            @functools.wraps(decorated)
            def bcp47_validate_wrapper(self, *args, **kwargs):
                tag = tags.tag(decorated(self, *args, **kwargs))
                return str(tag) if tag.valid else None

            return bcp47_validate_wrapper

        @staticmethod
        def default_exception_handling(decorated):
            """
            As web scraping is too unpredictable in nature, handle
            whatever exceptions may arise with defaulting values.

            If you wish to handle exceptions on your own you can pass the
            default_exception_handling=False flag.

            Example:
            from recipe_scrapers import scrape_me
            scraper = scrape_me('<recipe_url>', default_exception_handling=False)
            scraper.total_time()  # and etc.
            """
            @functools.wraps(decorated)
            def default_exception_handling_wrapper(self, *args, **kwargs):
                if self.default_exception_handling:
                    try:
                        return decorated(self, *args, **kwargs)
                    except:
                        on_exception_return = {
                            'title': '',
                            'total_time': 0,
                            'yields': '',
                            'image': '',
                            'ingredients': [],
                            'instructions': '',
                            'ratings': -1,
                            'reviews': None,
                            'links': [],
                            'language': 'en',
                        }
                        return on_exception_return.get(decorated.__name__)
                else:
                    return decorated(self, *args, **kwargs)

            return default_exception_handling_wrapper

    def __init__(self,
                 url,
                 test=False,
                 meta_http_equiv=False,
                 default_exception_handling=True):
        if test:  # when testing, we load a file
            with url:
                page_data = url.read()
        else:
            page_data = requests.get(url, headers=HEADERS).content

        self.default_exception_handling = default_exception_handling
        self.meta_http_equiv = meta_http_equiv
        self.soup = BeautifulSoup(page_data, "html.parser")
        self.schema = SchemaOrg(page_data)
        self.url = url
        # if self.schema.data:
        #     print("Class: %s has schema." % (
        #         self.__class__.__name__
        #     ))

    def url(self):
        return self.url

    def host(self):
        """ get the host of the url, so we can use the correct scraper """
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def title(self):
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def total_time(self):
        """ total time it takes to preparate the recipe in minutes """
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def yields(self):
        """ The number of servings or items in the recipe """
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def image(self):
        """
        Image of the recipe

        Try to fetch it from og:image if not implemented.
        """
        try:
            image = self.soup.find('meta', {
                'property': 'og:image',
                'content': True
            })
            return image.get('content')
        except AttributeError:  # if image not found
            raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.bcp47_validate
    @Decorators.schema_org_priority
    def language(self):
        """
        Human language the recipe is written in.

        May be overridden by individual scrapers.
        """
        candidate_languages = set()
        html = self.soup.find('html', {'lang': True})
        candidate_languages.add(html.get('lang'))

        # Deprecated: check for a meta http-equiv header
        # See: https://www.w3.org/International/questions/qa-http-and-lang
        meta_language = self.soup.find(
            'meta', {
                'http-equiv': lambda x: x and x.lower() == 'content-language',
                'content': True
            }) if self.meta_http_equiv else None
        if meta_language:
            for language in meta_language.get('content').split(','):
                candidate_languages.add(language)
                break

        # If other langs exist, remove 'en' commonly generated by HTML editors
        if len(candidate_languages) > 1 and 'en' in candidate_languages:
            candidate_languages.remove('en')

        # Return the first candidate language
        for language in candidate_languages:
            return language

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def ingredients(self):
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def instructions(self):
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    @Decorators.schema_org_priority
    def ratings(self):
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    def reviews(self):
        raise NotImplementedError("This should be implemented.")

    @Decorators.default_exception_handling
    def links(self):
        invalid_href = ('#', '')
        links_html = self.soup.findAll('a', href=True)

        return [
            link.attrs for link in links_html
            if link['href'] not in invalid_href
        ]
Example #58
0
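# Fragment of a GitHub repository scraper: the `html` response text, the `data` result
# object, the colors helper, getting_header() and formated() are all defined earlier in
# the full script and are not part of this snippet.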
    else:
        colors.error("Please enter the correct URL ")
        sys.exit(0)

    soup1=BeautifulSoup(html,"lxml")                    # Parsing the html data using BeautifulSoup

    title=getting_header(soup1)                         # Getting the title of the page
    data.header=title                                   # Storing title of the page as Project Title
    colors.success("Repository Title : "+title)
    time.sleep(1)

    star_value=0
    watch_value = 0
    fork_value =0

    a_tags=soup1.findAll("a")                           # Finding all the 'a' tags in response html data.
    for a_tag in a_tags:                                # Finding total stargazers of the repository
        string=a_tag.get("href")
        if(string.endswith("/watchers")):
            watch_value=(a_tag.get_text()).strip()
            watch_value=formated(watch_value)
            colors.success("Total watchers : "+watch_value)
            time.sleep(1)
            watch_value=int(watch_value)
        if(string.endswith("/stargazers")):
            star_value=(a_tag.get_text()).strip()
            star_value=formated(star_value)
            colors.success("Total stargazers : "+star_value)
            time.sleep(1)
            star_value=int(star_value)
        if(string.endswith("/members")):
Example #59
0
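# Assumed imports for this script (not shown): import requests; from bs4 import BeautifulSoup.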
# url = "https://www.youracclaim.com/user/robert-mapstead"
print("\nThis program lists your badges from the Acclaim Badges Platform \n")
name = input("Enter user name. Example: robert-mapstead >>> ")
url = "https://www.youracclaim.com/user/" + name

# Getting the webpage, creating a Response object.
response = requests.get(url)

# Extracting the source code of the page.
data = response.text

# Passing the source code to Beautiful Soup to create a BeautifulSoup object for it.
soup = BeautifulSoup(data, 'lxml')

# Extracting all the <a> tags whose class name is 'result-title' into a list.
badges = soup.findAll('div', {'class': 'cr-standard-grid-item-content__title'})

# Extracting text from the the <a> tags, i.e. class badges.
print()
print(
    "Copy all of this text below OR copy the text from the badges.txt file on the left sidebar for use in your resume, CV, or Social Media Profiles: \n"
)

fobj = open('badges.txt', 'a')
#fobj.write(url+"\n")
fobj.write(url)
print("\nmy Badges:")
for badge in badges:
    #print(badge.text)
    print(badge.text.rstrip())
    with open('badges.txt', 'a') as f:
Example #60
0
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 9 22:24:08 2018

@author: TS
"""

#!/usr/bin/env python
# -*- coding: utf-8 -*-


import requests
from bs4 import BeautifulSoup
path='/home/tushar/Desktop/BTP/allparagraphs.txt' ####Enter the path to your file here
wiki_url='https://en.wikipedia.org/wiki/Narendra_Modi' #####Enter the wiki url here
source_code = requests.get(wiki_url).text
soup = BeautifulSoup(source_code,'html.parser')
a=soup.findAll('p')
allparagraphs=''
for i in a:
	allparagraphs=allparagraphs+i.text+'\n'
	#print(i.text)
#print(allparagraphs)
allparagraphsFile = open(path,'w')
allparagraphsFile.write(wiki_url[30:]+'\n')

allparagraphsFile.write(allparagraphs+'\n\n--------------------------------------')
allparagraphsFile.close()