Example no. 1
def search_wxc_cooking(url):
        global pages_wxc
        soup=get_soup_obj_ff('http://bbs.wenxuecity.com'+url)
        # print('ref stop='+'http://bbs.wenxuecity.com'+url)

        foodLinks=soup.find_all("a",href=re.compile("^(./1)"))
        for foodLink in foodLinks:
            if 'title' in foodLink.attrs:
                # print(foodLink.attrs['title'])
                # print(foodLink.text)
                if "四川" in foodLink.attrs['title']:
                    print(foodLink.attrs['title'])
                    print('Author='+foodLink.parent.find("a", {"class": "b"}).text)
                    #print('Author=' + foodLink.parent.find("a", id=re.compile("^(n_cooking_)")).__str__())
                    #for child in foodLink.parent.find(""):
                        #if 'text' in child.attrs:
                            #print(child.__str__())

        pageLinks = soup.find_all("a", href=re.compile("^(/cooking/\?page)"))
        for pageLink in pageLinks:
            # print(pageLink.attrs['href'])
            if pageLink.attrs['href'] not in pages_wxc:
                pages_wxc.add(pageLink.attrs['href'])
                print('New page found = '+pageLink.attrs['href'])
                search_wxc_cooking(pageLink.attrs['href'])
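Example no. 1 relies on a get_soup_obj_ff helper and a module-level pages_wxc set that are not shown above; a minimal sketch of those assumed pieces (the fetch details are guesses) could look like this:

import re
import requests
from bs4 import BeautifulSoup

pages_wxc = set()  # pagination links that have already been visited

def get_soup_obj_ff(url):
    # fetch the page with a browser-like User-Agent and return the parsed soup
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')

# a typical entry point for the crawl would be:
# search_wxc_cooking('/cooking/')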
Example no. 2
def openHRef(url, urls, result):
    if (url in urls):
        return

    urls.append(url)

    bsObj = urlToBeautifulSoup(url)
    container = bsObj.find('div', {'id': 'mw-pages'})
    nextPage = container.find('a', {'href': re.compile("/w/.*#mw-pages")})
    anchors = container.findAll('a', {'href': re.compile("/wiki/.*")})

    for a in anchors:
        if a.has_attr('title') and a['title'] not in result:
            wikilink = '{}{}'.format(wikipedia, a['href'])
            bsWikilink = urlToBeautifulSoup(wikilink)
            wikidataUrl = bsWikilink.find(
                'a', {'href': re.compile('https://www.wikidata.org/wiki/Q')})
            if (wikidataUrl is None):
                continue
            wikidataHref = wikidataUrl['href']
            bsWikidata = urlToBeautifulSoup(wikidataHref)

            data = getData(bsWikidata)

            qid = 'Q{}'.format(wikidataHref.split('Q')[1])
            result[a['title']] = {
                'qid': qid,
                'title': a['title'],
                'href': wikilink,
                'data': data
            }

    if nextPage is not None:
        openHRef('https://es.wikipedia.org{}'.format(nextPage['href']), urls,
                 result)
Example no. 3
def crawl_rating(URL):
    movies_list = crawl_moveek(URL)
    for i in range(len(movies_list)):
        movie = movies_list[i]
        soup = get_URL("https://www.google.com/search?q=" + movie["title"])
        try:
            # Google result hrefs look like "/url?q=<target>&..."; remove the
            # prefix and the tracking parameters (str.strip removes characters,
            # not a prefix, so replace() is used instead).
            movie["imdb_URL"] = soup.find(
                href=re.compile("imdb"))["href"].replace("/url?q=", "", 1).split("&")[0]
            movie["rotten_URL"] = soup.find(href=re.compile(
                "rotten"))["href"].replace("/url?q=", "", 1).split("&")[0]
        except (TypeError, KeyError):
            # no matching result or the tag has no href; skip this movie
            pass
    return movies_list
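crawl_rating (and the moveek examples further down) call a get_URL helper that is never defined here; a minimal sketch, assuming it only fetches the page and returns a BeautifulSoup object:

import requests
from bs4 import BeautifulSoup

def get_URL(url):
    # fetch with a browser-like User-Agent and return the parsed page
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")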
Example no. 4
def test_reg_expression():
    url='http://www.pythonscraping.com/pages/page3.html'
    soup=get_soup_obj(url)
    # search <img src="../img/gifts/img3.jpg">
    images=soup.find_all("img", {"src": re.compile("\.\./img/gifts/img.\.jpg")})
    for image in images:
        print(image["src"])
    gifts=soup.find_all("tr",{"id": re.compile("gift.")})
    for gift in gifts:
        print(gift["id"])
    # test Lambda expressions
    # retrieves all tags that have exactly two attributes
    tags = soup.find_all(lambda tag: len(tag.attrs) == 2)
    for tag in tags:
        print(tag.attrs)
Example no. 5
def getData(bs):
    statementHeader = bs.find(id='claims').parent
    divContainer = statementHeader.find_next_sibling()
    divData = divContainer.findAll('div', {'id': re.compile('P[0-9]*')})
    properties = {}
    for data in divData:
        aProperty = data.find('a', {'title': re.compile('Property:P[0-9]*')})
        propertyId = aProperty['title'].split(':')[1]
        propertyName = aProperty.get_text()
        if 'image' in propertyName:
            aValue = [data.find('img')]
        else:
            aValue = data.findAll(
                'div', {
                    'class':
                    'wikibase-snakview-value wikibase-snakview-variation-valuesnak'
                })
        values = {}
        for a in aValue:
            if (a.name == 'img'):
                qValue = 'img'
                textValue = 'https:{}'.format(a['src'])
            else:  # this is a div
                if len(a.find_all()) > 0:
                    innerDataTitle = a.find_all(
                        'a', {'title': re.compile('Q[0-9]*')})
                    for idata in innerDataTitle:
                        if 'ikipedia' in idata.get_text():
                            continue
                        qValue = idata['title']
                        textValue = idata.get_text()
                    innerDataLink = a.find_all(
                        'a', {'class': re.compile('external free')})
                    for idata in innerDataLink:
                        if 'ikipedia' in idata.get_text():
                            continue
                        qValue = 'link'
                        textValue = idata.get_text()
                else:
                    qValue = 'value'
                    textValue = a.get_text()
            values[qValue] = textValue

        properties[propertyId] = {}
        properties[propertyId]['label'] = propertyName
        properties[propertyId]['values'] = values

    return properties
Example no. 6
def verify(url_add, writer):
    current_link = url_add
    next_link = ""
    last_link = ""

    while True:
        html = urlopen(current_link)
        bsObj = BeautifulSoup(html, "html.parser")
        pager_tags = bsObj.findAll("div", {"class": "Pager"})
        for pages_tag in pager_tags:
            if pages_tag:
                page_tags = pages_tag.findAll(
                    "a", {"href": re.compile("\/Shop-[a-zA-Z]{6}\/[0-9]*\/[a-zA-Z0-9]*\?page=[0-9]*")}
                )
                if page_tags:
                    next_page_tag = page_tags[-2]
                    last_page_tag = page_tags[-1]
                    next_link = "http://www.chemistwarehouse.com.au" + next_page_tag["href"]
                    last_link = "http://www.chemistwarehouse.com.au" + last_page_tag["href"]

        save_single_page(current_link, writer)

        if next_link == last_link:
            save_single_page(next_link, writer)
            break
        else:
            current_link = next_link
            next_link = ""
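save_single_page is used above but not defined; a hypothetical sketch, assuming writer is a csv.writer and that each listing page yields one row of product title and price (the /buy/ href pattern and the Price class are the ones that appear in Example no. 14):

import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

def save_single_page(link, writer):
    html = urlopen(link)
    bsObj = BeautifulSoup(html, "html.parser")
    a_tags = bsObj.findAll("a", {"href": re.compile(r"/buy/[0-9]*/[a-zA-Z0-9\-]*")})
    for a in a_tags:
        price_tag = a.find("span", {"class": "Price"})
        price = price_tag.get_text().strip() if price_tag else ""
        writer.writerow([a.get("title", ""), price])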
Example no. 7
def searchCodeAThon():
    searches = []
    page_start = raw_input("Enter start page: ")
    page_end = raw_input("Enter end page: ")
    num_users = raw_input("Enter number of searches: ")
    for num in range(0, int(num_users)):
        search_term = raw_input("Please enter search term: ")
        searches.append(search_term)
    for search in searches:
        for page_num in range(int(page_start), int(page_end)):
            page = requests.get(
                'http://www.tynker.com/tools/community?t=codeathon&v=published&s='
                + str(page_num),
                auth=(username, pswd))
            soup = BeautifulSoup(page.text, 'html.parser')
            projects = soup.find_all('li', class_="card")
            for proj in projects:
                if str(search) in proj.text:
                    print '\n'
                    print "Page: " + str(page_num + 1)
                    link = proj.find('a', href=re.compile('community-details'))
                    print 'http://www.tynker.com/tools/' + str(link['href'])
                    proj_name = proj.find('div', class_="card-title")
                    print proj_name.get_text()
                    print '\n'
Example no. 8
def getExternalLinks(bsObj, includeUrl):
    externalLinks = []
    # find_all (not find) is needed here: the loop expects a list of <a> tags
    for links in bsObj.find_all("a", href=re.compile("^(www|http)((?!" + includeUrl + ").)*$")):
        if links.attrs['href'] is not None:
            if links.attrs['href'] not in externalLinks:
                externalLinks.append(links.attrs['href'])
    return externalLinks
Example no. 9
    def _podcastParser(self, html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        episodes = soup.find_all('div',
                                 attrs={'class': re.compile('episode ')})

        self.__dict__.clear()

        for x, episode in enumerate(episodes):
            episode_num = episode.find('span', attrs={
                'class': 'episode-num'
            }).get_text().strip('#')
            podcast_date = episode.find('h3').get_text()
            desc = episode.find('div', attrs={
                'class': 'podcast-content'
            }).get_text()

            title = episode.find('div', attrs={'class': 'podcast-details'})
            title = title.find('h3').get_text()

            podcast_mp3 = '%s%s.mp3' % (self.mp3_url, 'mmashow' + episode_num
                                        if 'mma' in title.lower() else 'p' +
                                        episode_num)

            dllinks = episode.find('ul', attrs={'class': 'download-links'})
            vimeo_link = dllinks.find('a')['href']

            self.__dict__[x] = {
                'episode': episode_num,
                'title': title,
                'date': podcast_date,
                'desc': desc.encode(),
                'mp3_url': podcast_mp3,
                'vimeo_url': vimeo_link
            }
        return self.__dict__
Example no. 10
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # find_all (not find) is needed here: the loop expects a list of <a> tags
    for links in bsObj.find_all("a", href=re.compile("^(/|.*" + includeUrl + ")")):
        if links.attrs['href'] is not None:
            if links.attrs['href'] not in internalLinks:
                internalLinks.append(links.attrs['href'])
    return internalLinks
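A short usage sketch for getExternalLinks (Example no. 8) and getInternalLinks above; the start URL is only illustrative, and includeUrl is the site address the regexes use to tell internal from external links:

from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup

start_url = "http://www.pythonscraping.com"
bsObj = BeautifulSoup(urlopen(start_url), "html.parser")
site = urlparse(start_url).netloc

print("internal:", getInternalLinks(bsObj, site))
print("external:", getExternalLinks(bsObj, site))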
Example no. 11
    def parse(self):
        soup = BeautifulSoup(self.source, 'html.parser')

        self.name = soup.find('h2').string
        raw_price = str(soup.find('h3'))
        loc = raw_price.index('$')
        self.price = raw_price[loc:loc + 7]
        self.distance = soup.find('span', text=re.compile('mi\)')).string
Example no. 12
 def f(item):
     title, url, lang = item
     page = spider.session.get(url)
     pat = re.compile("scope.code.{} = '(.+)'".format(lang))
     code = pat.findall(page.text)[0]
     jsoncode = json.loads('{"code": "%s"}' % code)
     codepath = set_save_path(title, lang)
     self.print_to_file(jsoncode['code'], codepath)
Example no. 13
 def executor(item):
     title, url, lang = item
     page = spider.session.get(url)
     pat = re.compile("scope.code.{} = '(.+)'".format(lang))
     code = pat.findall(page.text)[0]
     jsoncode = json.loads('{"code": "%s"}' % code)
     codepath = set_save_path(title, lang)
     self.print_to_file(jsoncode['code'], codepath)
Example no. 14
def print_info(url_address):
  html = urlopen(url_address)
  bsObj = BeautifulSoup(html,"html.parser")
  a_tags = bsObj.findAll("a",{"href":re.compile("\/buy\/[0-9]*\/[a-zA-Z0-9\-]*")})
  for a in a_tags:
    print(a["title"])
    price_tags = a.findAll("span",{"class":"Price"})
    for price in price_tags:
      if(price.get_text()):
        print(price.get_text().strip(" \t\n\r"))
Example no. 15
def scrape_links(url, soup):
    first = soup.find('a', attrs={'href': re.compile("/")})
    try:
        l1 = first.get('href')
        a = [first]
        links = [l1]
        n = 0
        length = len(soup.findAll('a',attrs={'href': re.compile("/")}))
        while n<(length-1):
            #extracting the relevant a tags and putting into an array
            blah = a[n].find_next('a', attrs={'href': re.compile("/")})
            a.append(blah)
            #extracting the href links from a and placing them in an array
            link = blah.get('href')
            clean_link = parse.urljoin(url, link)
            links.append(clean_link)
            n+=1
        return links
    except: 
        print("\nThis Link is not scrapable!!!!!")
Example no. 16
def getbirthchart(month, day, year, hour, minute, ampm, town, country, state= None):
    url = 'http://alabe.com/cgi-bin/chart/astrobot.cgi?INPUT1=&INPUT2=&MONTH=%d&DAY=%d&YEAR=%d&HOUR=%d&MINUTE=%d&AMPM=%s&TOWN=%s&COUNTRY=%s&STATE=%s&INPUT9=&Submit=Submit' % (month, day, year, hour, minute, ampm, town, country, state)
    response = urllib2.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "html.parser")
    chart = soup.find_all(string=re.compile("Degree"))
    chartlist = []
    for line in chart:
        planet = line.split(None, 1)[0]
        sign = line.rsplit(None, 1)[-1]
        chartlist.append((planet, sign))
    print chartlist
Example no. 17
    def get_table(self, url):
        soup = self.get_soup(url)
        if soup.find(text=re.compile('available')):
            raise PageError('No Such Page', url)

        if '/tag/' in url:
            pat = re.compile(
                '"id".+?"(\d+)".+?"title".+?"(.+?)".+?"ac_rate".+?"(.+?)".+?"difficulty".+?"(.+?)"',
                re.S | re.X | re.U)
            raw_script = soup.body.find_all('script')[3].text
            table = []
            for data in pat.findall(raw_script):
                num, title, ac_rate, diff = data
                title, diff = BeautifulSoup(title), BeautifulSoup(diff)
                table.append(
                    (num, title.text, ac_rate, diff.text, title.a['href']))
        else:
            t = soup.find(id='problemList').find_all('tr')[1:]
            table = [tuple(i.stripped_strings) + (i.a['href'], ) for i in t]

        return table
Example no. 18
    def get_table(self, url):
        soup = self.get_soup(url)
        if soup.find(text=re.compile('available')):
            raise PageError('No Such Page', url)

        if '/tag/' in url:
            pat = re.compile(r'"id".+?"(\d+)".+?'
                             r'"title".+?"(.+?)".+?'
                             r'"ac_rate".+?"(.+?)".+?'
                             r'"difficulty".+?"(.+?)"', re.S | re.X | re.U)
            raw_script = soup.body.find_all('script')[3].text
            table = []
            for data in pat.findall(raw_script):
                num, title, ac_rate, diff = data
                title, diff = BeautifulSoup(title), BeautifulSoup(diff)
                table.append((num, title.text, ac_rate, diff.text, title.a['href']))
        else:
            tmp = soup.find(id='problemList').find_all('tr')[1:]
            table = [tuple(i.stripped_strings) + (i.a['href'],) for i in tmp]

        return table
Example no. 19
def crawl_moveek(URL):
    soup = get_URL(URL)
    movies = soup.find_all(href=re.compile("/phim/"))
    movies_list = list()

    for movie in movies:
        _movie = {}
        if movie.img:
            _movie["title"] = movie["title"]
            _movie["link"] = movie["href"]
            _movie["img"] = movie.img["data-src"]
            movies_list.append(_movie)
    return movies_list
Example no. 20
def get_links(url):
    global pages, pages_mit
    soup=get_soup_obj('http://www.pythonscraping.com/'+url)
    # search href, start with '/' means internal link
    # links=soup.find_all("a",{'href': re.compile("^(/node/|/blog|/)")})
    links = soup.find_all("a", {'href': re.compile("^(/)")})
    for link in links:
        # print(link.attrs['href'])
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage=link.attrs['href']
                pages.add(newPage)
                print(newPage)
                get_links(newPage)
Example no. 21
def verify(url_add):
  current_link = url_add
  next_link = ""
  last_link = ""
  
  html = urlopen(current_link)
  bsObj = BeautifulSoup(html,"html.parser")
  pager_tags =  bsObj.findAll("div",{"class":"Pager"})
  pages_tag = pager_tags[0]
  print(pages_tag)
  if(pages_tag):
    page_tags = pages_tag.findAll("a",{"href":re.compile("\/Shop-[a-zA-Z]{6}\/[0-9]*\/[a-zA-Z0-9]*\?page=[0-9]*")})
    if(page_tags):
      last_page_tag = page_tags[-1]
      last_link = "http://www.chemistwarehouse.com.au" + last_page_tag["href"]
      print(last_link)
  
  while True:
    html = urlopen(current_link)
    bsObj = BeautifulSoup(html,"html.parser")
    pager_tags =  bsObj.findAll("div",{"class":"Pager"})
    for pages_tag in pager_tags:
      if(pages_tag):
        page_tags = pages_tag.findAll("a",{"href":re.compile("\/Shop-[a-zA-Z]{6}\/[0-9]*\/[a-zA-Z0-9]*\?page=[0-9]*")})
        if(page_tags):
          next_page_tag = page_tags[-2]
          next_link = "http://www.chemistwarehouse.com.au" + next_page_tag["href"]
          
    print(next_link)
    print_info(current_link)
    
    if(next_link == last_link):
      print_info(next_link)
      break
    else:
      current_link = next_link
      next_link = ""
Example no. 22
        def save_defaultcode(soup, pdir, langlist):
            tag = soup.find(lambda x: x.has_attr('ng-init'))
            rawjson = tag['ng-init']
            pat = re.compile(r'(\[.+\])')
            raw = pat.findall(rawjson)[0].replace("'", '"')  # ' -> "
            raw = ''.join(raw.rsplit(',', 1))  # remove the last ',' in json list
            codelist = json.loads(raw)
            codelist = filter(lambda x: x['value'] in langlist, codelist)

            codedict = {i['value']: i['defaultCode'] for i in codelist}

            for lang in codedict.keys():
                codepath = os.path.join(pdir, self.SAVENAME[lang])
                if not os.path.isfile(codepath):
                    self.print_to_file(codedict[lang], codepath)
                elif self.DEBUG:
                    print('{} already exists!'.format(codepath))
Example no. 23
        def save_defaultcode(soup, pdir, langlist):
            tag = soup.find(lambda x: x.has_attr('ng-init'))
            rawjson = tag['ng-init']
            pat = re.compile('(\[.+\])')
            raw = pat.findall(rawjson)[0].replace("'", '"')  # ' -> "
            raw = ''.join(raw.rsplit(',',
                                     1))  # remove the last ',' in json list
            codelist = json.loads(raw)
            codelist = filter(lambda x: x['value'] in langlist, codelist)

            d = {i['value']: i['defaultCode'] for i in codelist}

            for lang in d.keys():
                codepath = os.path.join(pdir, self.SAVENAME[lang])
                if not os.path.isfile(codepath):
                    self.print_to_file(d[lang], codepath)
                elif self.DEBUG:
                    print('{} already exists!'.format(codepath))
Example no. 24
def crawl_rating_moveek(URL):
    movies_list = crawl_moveek(URL)
    for i in range(len(movies_list)):
        movie = movies_list[i]
        soup = get_URL("https://moveek.com" + movie["link"])
        movie["gerne"] = soup.find(class_="mb-0 text-muted text-truncate"
                                   ).string.strip().strip("-").strip()
        try:
            movie["description"] = soup.find(class_="mb-3 text-justify").text
        except AttributeError:
            if "description" not in movie:
                # fall back to the Vietnamese page by dropping a leading "/en"
                # (str.strip("/en") would also eat e/n characters at the ends
                # of the slug, so slice the prefix off explicitly)
                link = movie["link"]
                if link.startswith("/en"):
                    link = link[len("/en"):]
                soup = get_URL("https://moveek.com" + link)
                movie["description"] = soup.find(
                    class_="mb-3 text-justify").text
        movie["rating"] = soup.find(href=re.compile("/review/")).text.strip()
        if movie["rating"] == "Reviews" or movie["rating"] == "Đánh giá":
            movie["rating"] = "No Review"
    return movies_list
Example no. 25
 def get_zones_dict(df):
     selected = df[columns]
     temp_df = df["gate_arrival_actual_timezone_code"]
     for i in range(1, len(columns)):
         temp_df = temp_df.append(selected[columns[i]], ignore_index=True)
     zones = [z.lower() for z in temp_df.unique() if "GMT" not in z]
     zones_dict = {}
     for zone in zones:
         url = 'https://www.timeanddate.com/time/zones/' + zone
         page = rq.get(url)
         content = page.content
         soup = BeautifulSoup(content, 'html.parser')
         scraped_zone = soup.find_all("ul", {"class": "clear"})
         if len(scraped_zone) > 0:
             p = re.compile(r'UTC [+-][0-9]{1,2}\b')
             search = p.search(scraped_zone[0].text)
             group = search.group(0)
             result = re.sub('[\s]', '', group)
             zones_dict[zone] = result.replace("UTC", "Etc/GMT")
     return zones_dict
Example no. 26
def scrape_game_data(url):
    # start browser
    browser = init_browser()
    # navigate to website
    browser.visit(url)
    # prepare soup for scraping
    html = browser.html
    soup = bs(html, 'html.parser')
    # find html text that contains the title of the table we want to scrape from
    target_title = soup.find_all(text=re.compile('Top Live'))
    # find the header that has our target title and extract the text
    target_h4 = soup.find('h4', text=target_title)
    table_title = target_h4.get_text()
    # find tables that are in the same parent tag
    tables = target_h4.parent.findAll('table')

    # extract data from table into lists
    streamers = []
    viewers = []
    for column in tables:
        streamer_list = column.find_all('a')
        for streamer in streamer_list:
            streamer_name = streamer.get_text()
            streamers.append(streamer_name)
        viewers_list = column.find_all('span')
        for streamer in viewers_list:
            viewership = streamer.get_text()
            viewers.append(viewership)
    viewers = [int(viewer.replace(',', '')) for viewer in viewers]

    # close browser
    browser.quit()

    # create dictionary for use in dataframe using lists created
    data_dict = {'Channel': streamers, 'Viewers': viewers}
    # create dataframe using dictionary created
    mytable = pd.DataFrame(data_dict)
    # set the channel as the index
    mytable.set_index('Channel', inplace=True)
    return mytable
Example no. 27
def getLink(link):
# This function returns the sub-links found on the page.
# Fix bug on 7/12/2015: the scraped links are not the real addresses. For example, a scraped
# link such as http://www.chnaus.com/thread-230167-1-1.html, once pasted into a browser,
# actually becomes http://www.chnaus.com/thread-230167-1-1.html?_dsign=15b50450.
# The trailing token is generated by JavaScript, so feeding the plain
# http://www.chnaus.com/thread-230167-1-1.html URL to BeautifulSoup only returns the script stub.

	return_link=[]
	try:	
		html = urlopen(link)
		bsObj= BeautifulSoup(html,'html.parser')
		#fix bug on 07/12/2015
		#thread\-[0-9]{6}\-[0-9]\-[0-9]\.html ----> thread\-[0-9]{6}\-[0-9]\-[0-9]+\.html
		#the page number has one digit for pages 1-9 and two digits for 10-99, hence the trailing '+'
		#also, after crawling several hundred pages the first numeric group can be 5 digits instead of 6,
		#so thread\-[0-9]{6} ---> thread\-[0-9]{4,6}
		a_tags = bsObj.findAll('a',{'class':True,'onclick':True,'href':re.compile('http:\/\/www\.chnaus\.com\/thread\-[0-9]{4,6}\-[0-9]\-[0-9]+\.html')})
		if a_tags:
			for a_tag in a_tags:
				return_link.append(a_tag['href'])		
	finally:	
		return return_link
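A hypothetical driver for getLink; the seed listing URLs are placeholders, since the example does not show where they come from:

seed_pages = ["http://www.chnaus.com/"]  # replace with real forum listing pages
all_threads = set()
for listing in seed_pages:
    all_threads.update(getLink(listing))
print("{} thread links collected".format(len(all_threads)))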
Example no. 28
def verify(url_add):
  current_link = url_add
  next_link = ""
  last_link = ""
  
  while True:
    html = urlopen(current_link)
    bsObj = BeautifulSoup(html,"html.parser")
    # findAll returns a list; grab the first Pager div before searching inside it
    pager_tag = bsObj.find("div", {"class": "Pager"})
    pages_tag = pager_tag.findAll("a", {"href": re.compile("\/Shop-Online\/[0-9]*\/\?page=[0-9]*")})
    next_page_tag = pages_tag[-2]
    last_page_tag = pages_tag[-1]
    next_link = "http://www.chemistwarehouse.com.au" + next_page_tag["href"]
    last_link = "http://www.chemistwarehouse.com.au" + last_page_tag["href"]
    
    print(current_link)
    
    if(next_link == last_link):
      print(next_link)
      break
    else:
      current_link = next_link
      next_link = ""
Example no. 29
def getLinks(pageUrl):
    global pages  # make sure the global name is used
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html)
    # testing the structure of the pages
    try:
        print(bsObj.h1.get_text())
        print(bsObj.find(id="mw-content-text").findAll("p")[0])
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except AttributeError:
        print("Cannot find the main text or edit page")

    for links in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in links.attrs:
            if links.attrs['href'] not in pages:
                newPage = links.attrs['href']
                pages.add(newPage)
                print(newPage)
                getLinks(newPage)
        else:
            print('How is it possible?')
            print(links)
            print('How is it possible?')
Example no. 30
    def get_pic_request(self):
        print('Start the GET')
        r = self.requests(self.web_url)
        print('Start to find all the <img>')
        text = BeautifulSoup(r.text, 'lxml')
        all_images = text.find_all('img', alt=re.compile("keywords"))  # find all the img with a "keywords" 
        print('create file')
        self.create_folder(self.folder_path)
        print('change the current file to it')
        os.chdir(self.folder_path)  # change the path to the target.
        i = 0
        all_pics = self.get_files(self.folder_path)
        for img in all_images:
            # build the file name from the src URL; a Tag object cannot be
            # sliced or searched like a string, so work on img['src']
            src = img['src']
            name_start_pos = src.index('photo')
            name_end_pos = src.index('?')
            name = src[name_start_pos:name_end_pos] + '.jpg'

            if name in all_pics:
                print("this pic is already existing")
                continue
            else:
                print(img)
                self.save_img(img['src'], name)
                i +=1
Example no. 31
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://www.chemistwarehouse.com.au/Shop-Online/587/Swisse")
bsObj = BeautifulSoup(html,"html.parser")
#print(bsObj.prettify())
#nameList = bsObj.findAll("span",{"class":"Price"})
a_tags = bsObj.findAll("a",{"href":re.compile("\/buy\/[0-9]*\/[a-zA-Z0-9\-]*")})
for a in a_tags:
  print(a["title"])
  price_tags = a.findAll("span",{"class":"Price"})
  for price in price_tags:
    if(price.get_text()):
      print(price.get_text().strip(" \t\n\r"))
      
Example no. 32
from bs4 import BeautifulSoup
import re
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""	
soup = BeautifulSoup(html)
'''
#print soup.prettify()
print soup.a
print type(soup.a)
print soup.p.string
print soup.attrs
print soup.body.contents
for child in soup.body.children:
	print child

for child in soup.descendants:
    print child
'''
print soup.find_all(href=re.compile("elsie"), id='link1')
Example no. 33
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("https://en.wikipedia.org/wiki/Kevin_Bacon")
bsObj = BeautifulSoup(html,"html.parser")

for link in bsObj.find("div",{"id":"bodyContent"}).findAll("a",href = re.compile("^(/wiki/)((?!:).)*$")):
  if "href" in link.attrs:
    print(link["href"])
  
Example no. 34
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re


html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html)
images = bsObj.findAll("img", {"src": re.compile("\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])
Example no. 35
def scrape_streamer_data(url):
    # start browser
    browser = init_browser()
    # navigate to website
    browser.visit(url)
    #click on month tab in performace section
    browser.find_by_text('Month').click()
    # prepare soup for scraping
    html = browser.html
    soup = bs(html, 'html.parser')

    # create empty dictionary to store data we scrape
    data_dict = {}

    # scrape for channel_name
    channel_name = soup.find('div', id='mini-profile').find('h4').get_text()
    data_dict['channel_name'] = channel_name

    # scrape for streamer's avg_viewers
    target_text = soup.find_all(text=re.compile('Avg viewers'))
    target_div = soup.find('div', text=target_text)
    avg_viewers = float(
        target_div.parent.find_all('span')[1].get_text().replace(',', ''))
    data_dict['avg_viewers'] = avg_viewers

    # scrape for streamer's time_streamed
    target_text = soup.find_all(text=re.compile('Hours streamed'))
    target_div = soup.find('div', text=target_text)
    time_streamed = float(
        target_div.parent.find_all('div')[2].get_text().replace(',', ''))
    data_dict['time_streamed(hrs)'] = time_streamed

    # scrape for streamer's all_time_peak_viewers
    target_text = soup.find_all(
        text=re.compile('Highest recorded number of concur. viewers'))
    target_div = soup.find('div', text=target_text)
    all_time_peak_viewers = float(
        target_div.parent.find_all('div')[2].get_text().replace(',', ''))
    data_dict['all_time_peak_viewers'] = all_time_peak_viewers

    # scrape for streamer's hours_watched
    target_text = soup.find_all(text=re.compile('Hours watched'))
    target_div = soup.find('div', text=target_text)
    hours_watched = float(
        target_div.parent.find_all('div')[2].get_text().replace(',', ''))
    data_dict['hours_watched'] = hours_watched

    # scrape for streamer's overall_rank
    target_text = soup.find_all(text=re.compile('RANK'))
    target_div = soup.find('span', text=target_text)
    overall_rank = float(
        target_div.parent.find_all('span')[1].get_text().replace(',', ''))
    data_dict['overall_rank'] = overall_rank

    # scrape for streamer's followers_gained
    target_text = soup.find_all(text=re.compile('Followers gained'))
    target_div = soup.find('div', text=target_text)
    followers_gained = float(
        target_div.parent.find_all('div')[2].get_text().replace(',', ''))
    data_dict['followers_gained'] = followers_gained

    # scrape for streamer's total_followers
    target_text = soup.find_all(text=re.compile('Total followers'))
    target_div = soup.find('div', text=target_text)
    total_followers = float(
        target_div.parent.find_all('div')[2].get_text().replace(',', ''))
    data_dict['total_followers'] = total_followers

    # scrape for streamer's total_views
    target_text = soup.find_all(text=re.compile('Total views'))
    target_div = soup.find('div', text=target_text)
    total_views = float(
        target_div.parent.find_all('div')[2].get_text().replace(',', ''))
    data_dict['total_views'] = total_views

    # close browser
    browser.quit()

    # create dataframe using dictionary created
    mytable = pd.DataFrame([data_dict])
    # set the channel as the index
    mytable.set_index('channel_name', inplace=True)
    return mytable
Example no. 36
from pymongo import MongoClient
import re
import requests
from bs4 import BeautifulSoup

DB = 'geipan'



mongo = MongoClient('mongodb://' + HOST)

db = mongo[DB]

cursor=db['Cas'].find({}, no_cursor_timeout=True).batch_size(20)
idCasFail = list()
for _cas in cursor:
    res = requests.get('http://www.cnes-geipan.fr/index.php?id=202&cas=' + _cas['cas_numEtude'])
    if res.status_code == 200:
        resContent = res.content
        soup = BeautifulSoup(resContent, 'html.parser')
        try:
            #links = soup.find("div", {"class": "tx-geipansearch-pi1"}).find_all('a')
            newObject = _cas
            links = soup.find_all(href=re.compile("geipan-doc"))
            newObject['files'] = [{"name": _link.text, "link": _link.get('href')} for _link in links]
            print(str(_cas['_id']) + ' updated')
            db['Cas'].update_one({'_id': _cas['_id']}, {'$set': newObject})
        except:
            print("file not found -> append list")
            idCasFail.append(str(_cas['_id']))
    else:
        print("request fail -> append list")
        idCasFail.append(str(_cas['_id']))
cursor.close()
print(idCasFail)
Example no. 37
import os
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

directory = "downloaded"
if not os.path.exists(directory):
    os.makedirs(directory)

course=input(" enter course you want to search")

html = urlopen("https://www.coursera.org/courses?query="+course)

print(course)
b=BeautifulSoup(html,'html.parser')
col=b.findAll("div",{"class":"card-info"})
cno=0

for i in col:
    course_name=i.find("",{"class":re.compile("\.*card-title\.*")}).get_text()
    print ("name:",course_name)
#    cno=cno+  1
    link="https://www.coursera.org"+i.find_parent("a" ).attrs['href']
    tag=i.find("",{"class":re.compile("\.*product-badge\.*")}).get_text()
    print ("badge:",tag)
    
    course_by=i.find("",{"class":re.compile("\.*card-description\.*")}).get_text()
    print ("by:",course_by)
    print ("link foR MORE  details :",link)
    if tag.lower() == "course":
        newx=BeautifulSoup(urlopen(link),'html.parser')
    
        free=-1
        duration=-1
        cost=-1
Example no. 38
__author__ = 'charlesw'
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html, "html.parser")

nameList = bsObj.findAll("span", {"class":"green"})
#print(bsObj.findAll(id="text")) # this skip the name input, just checking on attr input
for name in nameList:
    print(name.get_text())


html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html)
for child in bsObj.find("table",{"id":"giftList"}).children:
    print(child)

print(bsObj.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())

for i in bsObj.find("table",{"id":"giftList"}).findAll("img"):
    print (i.parent.previous_sibling.previous_sibling.previous_sibling.get_text() +' -- '+i.parent.previous_sibling.get_text())

images = bsObj.findAll("img", {"src":re.compile("\.\.\/img\/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])
    print (image.parent.previous_sibling.previous_sibling.previous_sibling.get_text() +' -- '+image.parent.previous_sibling.get_text())
Example no. 39
def main(argv):
    url = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hu:o:", ["url=", "ofile="])
    except getopt.GetoptError:
        print('wikiannex.py -u <url> -o <outputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if (opt == '-h'):
            print('wikiannex.py -u <url> -o <outputfile>')
            sys.exit()
        elif opt in ("-u", "--url"):
            url = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        else:
            assert False, "unhandled option"

    bsObj = urlToBeautifulSoup(url)
    ids = bsObj.findAll('span', {'class': 'mw-headline'})
    elements = {}
    for id in ids:
        element = {}
        name = id.get_text()
        element['qid'] = ''
        element['title'] = name
        element['href'] = ''
        element['data'] = {}
        if (name == 'Referencias'):
            break
        parent = id.parent
        article = parent.find_next_sibling()
        if (article is not None and article.name == 'div'):
            articleHref = article.find('i').find('a')['href']
            wikilink = '{}{}'.format(wikipedia, articleHref)
            element['href'] = articleHref
        # bind nextNode unconditionally so the scan below never hits an
        # unbound name when the sibling is not a div
        nextNode = parent.find_next_sibling()
        while True:
            if (nextNode.name == 'ul'):
                break
            elif (nextNode.name == 'h2'):
                break
            nextNode = nextNode.find_next_sibling()
            continue

        # if (nextNode.name == 'h2'):
        #     nextNode = parent.find_next_sibling()
        #     element['media'] = {}
        #     counter = 0
        #     while True:
        #         if (nextNode.name == 'p'):
        #             element['media']["p{}".format(counter)] = nextNode.get_text()
        #             counter = counter+1
        #         elif (nextNode.name == 'h2'):
        #             break
        #         nextNode = nextNode.find_next_sibling()
        #         continue

        # else:
        #     li = nextNode.findAll('li')
        #     actives = {}
        #     for l in li:
        #         text = (l.get_text())
        #         if ':' in text:
        #             category, other = text.split(':', 1)
        #             actives[category.strip()] = other
        #     element['media'] = actives

        href = element.get('href', False)
        if href:
            bsWikilink = urlToBeautifulSoup(wikilink)
            wikidataUrl = bsWikilink.find(
                'a', {'href': re.compile('https://www.wikidata.org/wiki/Q')})
            if (wikidataUrl is None):
                continue
            wikidataHref = wikidataUrl['href']
            bsWikidata = urlToBeautifulSoup(wikidataHref)

            data = getData(bsWikidata)

            qid = 'Q{}'.format(wikidataHref.split('Q')[1])
            element['qid'] = qid
            element['data'] = data
        elements[element['title']] = element
    print(elements)
Example no. 40
        outpic.write(imageData)
    if os.stat(ImageUrl.split("/")[-1]).st_size > 0:
        print("saved image:", ImageUrl.split("/")[-1])
    elif retry_count < 3:
        print(ImageUrl.split("/")[-1], "size is 0. imageData size:", len(imageData), ". Retrying...")
        sleepCountdown(2, 1)
        savePic(ImageUrl, retry_count = retry_count + 1)
    else:
        print(ImageUrl.split("/")[-1], "failed. Giving up.")

url = "https://www.onemotoring.com.sg/content/onemotoring/home/driving/traffic_information/traffic-cameras/woodlands.html"

Cameras = {'4703': 'Tuas Second Link',
           '4713': 'Tuas Checkpoint',
           '2701': 'Woodlands Causeway Towards Johor',
           '2702': 'Woodlands Checkpoint (Towards BKE)'}

time_interval = 55 # in seconds

while True:
    print("getting data...")
    r = requests.get(url=url)
    images = BeautifulSoup(r.content, "lxml").find_all(src = re.compile("mytransport"))    # get the image links

    print("Image links obtained. Length", len(images))

    for image in images:
        savePic("http:" + image["src"])    # save the image

    print("Waiting for", time_interval, "seconds... Press Ctrl+C to stop.")
    sleepCountdown(time_interval, 2)
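sleepCountdown is called above but not defined; a minimal sketch, assuming its two arguments are the total wait time and the size of each sleep increment in seconds:

import time

def sleepCountdown(total_seconds, step):
    # sleep in step-sized increments until total_seconds have passed
    remaining = total_seconds
    while remaining > 0:
        time.sleep(min(step, remaining))
        remaining -= step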
Example no. 41
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
b=BeautifulSoup(html,'html.parser')


for child in b.find("table",{"id":"giftList"}).children:
    print(child)
    
print("888888888888888888888888888888888888888888888888888888888888888888888888888888")

for child in b.find("table",{"id":"giftList"}).descendants:
    print(child)
    
print ("1111111111111111111111111111111111111111111111")    
    
for sibling in b.find("table",{"id":"giftList"}).tr.next_siblings:
    print(sibling)    

print ("1111111111111111111111111111111111111111111111")    
    
for sibling in b.find("table",{"id":"giftList"}).tr:
    print(sibling)  
      
print(b.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())

images = b.findAll("img", {"src":re.compile("\.\.\/img\/gifts\/img.*\.jpg")})
for image in images:
    print(image["src"])
    print(image.attrs["src"])
    print(image["src"])
    
b.findAll(lambda tag: len(tag.attrs) == 2)
Example no. 42
# coding: utf-8

# In[ ]:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs_obj = BeautifulSoup(html)

#links = bs_obj.findAll('a')
body_content = bs_obj.find("div", {"id": "bodyContent"})
links = body_content.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

for link in links:
    if 'href' in link.attrs:
        print(link.attrs['href'])