    def wieistmeineip(self):
        result = {}
        # Save original socket
        originalSocket = socket.socket
        # Set TOR SOCKS proxy
        commonutils.setTorProxy()

        try:
            # Load the page and extract the location and IP address
            soup = self.parse("http://www.wieistmeineip.de")
            location = soup.findAll("div", {"class": "location"})[0]
            location = bs(location.text, convertEntities=bs.HTML_ENTITIES)

            ip = soup.findAll('div', id='ipv4')[0]
            raw_ip = bs(ip.text, convertEntities=bs.HTML_ENTITIES)
            pattern = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}')
            ip = re.search(pattern, raw_ip.text)

            result["ipaddress"] = ip.group(0)
            result["country"] = str(location)
        finally:
            # Restore the original socket (remove the SOCKS Tor proxy)
            socket.socket = originalSocket

        return result
    def get_text_from_paragraphs(self, paragraphs_list, bs_doc):
        """
	Returns a list of elements corresponding to list of words from particular section.
	"""
        words_in_paragraph_list = []
        i = 0
        for paragraph in paragraphs_list:
            try:
                pmid_in_bracket = bs(str(bs_doc.findAll(attrs={"pub-id-type": "pmid"})[0])).findAll(text=True)
            except:
                pmid_in_bracket = bs(str(bs_doc.findAll(attrs={"pub-id-type": "pmc"})[0])).findAll(text=True)
            pmid = str(pmid_in_bracket[0].encode("utf-8"))
            words_in_one_paragraph = []
            # print paragraph
            for text in paragraph.findAll(text=True):
                words = text.split()
                encoded = self.encode_list(words)
                cleaned = self.remove_digits(encoded)
                lowered = self.lower_words(cleaned)
                replaced = self.replace_punc(lowered)
                stems = self.into_stems(replaced)
                for word in stems:
                    words_in_one_paragraph.append(word)
            i += 1
            words_in_paragraph_list.append((pmid, i, words_in_one_paragraph))
        return words_in_paragraph_list
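A hypothetical usage sketch for the method above, assuming `extractor` is an instance of the surrounding class and `bs` is the BeautifulSoup constructor used throughout this page:

doc = bs(open('article.xml').read())    # parsed article (placeholder file name)
paragraphs = doc.findAll('p')           # paragraphs of one section
for pmid, index, words in extractor.get_text_from_paragraphs(paragraphs, doc):
    print pmid, index, len(words)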
Example #3
    def wieistmeineip(self):
        result = {}
        # Save original socket
        originalSocket = socket.socket
        # Set TOR Socks proxy
        commonutils.setTorProxy()

        try:
            # Load the page and extract the location and IP address
            soup = self.parse("http://www.wieistmeineip.de")
            location = soup.findAll("div", {"class": "location"})[0]
            location = bs(location.text, convertEntities=bs.HTML_ENTITIES)

            ip = soup.findAll('div', id='ipv4')[0]
            raw_ip = bs(ip.text, convertEntities=bs.HTML_ENTITIES)
            pattern = re.compile(
                '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}')
            ip = re.search(pattern, raw_ip.text)

            result["ipaddress"] = ip.group(0)
            result["country"] = str(location)
        finally:
            # Removing SOCKS Tor Proxy
            socket.socket = originalSocket

        return result
def main(url):
    """
    blogger_image_grab.py
        Downloads all the images on the supplied Blogger blog, and saves them to the
        Downloads directory

    Usage:
        python blogger_image_grab.py http://example.com
    """

    # send the request with a random user agent in the header
    request = urllib2.Request(url, None, randomize_user_agent())
    html = urllib2.urlopen(request)
    soup = bs(html)
    parsed = list(urlparse.urlparse(url))
    download_images(soup, parsed)

    older_posts = soup.find(text='Older Posts')
    while older_posts:
        print 'Navigating to the next page: %s' % older_posts.previous['href']
        soup = bs(urlopen(older_posts.previous['href']))
        parsed = list(urlparse.urlparse(url))
        download_images(soup, parsed)
        older_posts = soup.find(text='Older Posts')
        if not older_posts:
            print 'Downloading complete!'
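A minimal entry point matching the usage string in the docstring; a sketch only, assuming the module is run directly as a script:

if __name__ == '__main__':
    import sys
    # Usage: python blogger_image_grab.py http://example.com
    main(sys.argv[1])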
def porterScrape():
	"""docstring for porterScrape"""
	storer = {}
	
	data = urlopen(jeans).read()

	b = bs(data)
	designers = b.findAll('div',{'class':'designer'})
	
	for d in designers:
		pid = int(d.find('a').get('href')[9:])
		brand = d.find('span',{'class':'product-designer'}).text
		title = d.find('span',{'class':'product-title'}).text
		newd = dict(brand = brand, title = title)
		storer[pid] = newd
		
	data = urlopen(trousers).read()

	b = bs(data)
	designers = b.findAll('div',{'class':'designer'})

	for d in designers:
		pid = int(d.find('a').get('href')[9:])
		brand = d.find('span',{'class':'product-designer'}).text
		title = d.find('span',{'class':'product-title'}).text
		newd = dict(brand = brand, title = title)
		storer[pid] = newd
	return storer
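A short sketch of how the returned dictionary might be consumed (the call below is illustrative and assumes the module-level `jeans` and `trousers` URLs are defined):

products = porterScrape()
for pid, info in products.items():
    print pid, info['brand'], info['title']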
Example #6
File: wenku8.py Project: zqpm/ACG
    def __init__(self, lnv, fec):
        self.lnv      = wenku8.WENKU8_PREFIX + wenku8.LNV_LIST[lnv]
        self.fec      = fec
        self.req      = urllib2.Request(self.lnv,'', wenku8.OPERA_X_H)
        self.response = urllib2.urlopen(self.req)
        # use BeautifulSoup
        self.soup     = bs(self.response)

        # chapter/volume counter
        ctr_chp = 0
        #ctr_vol = -1
        self.chps = list()
        self.tsp = bs()

        fl = open('list' + '_' + self.fec + '.txt','w')
        for atr in self.soup.body.findAll('tr'):
            tsp = bs(atr.text)
            if tsp.a != None:
                ctr_chp += 1
                sn = unicode(ctr_chp).rjust(3,'0')
                lk = tsp.a['href']
                title = unicode(tsp.contents[0])[:-1]
                self.chps.append((sn,lk,title))
                fl.write(sn.encode(fec, 'ignore') + '.txt' + ' ' \
                         + title.encode(fec, 'ignore') + '\n')
        tsp.close()
        fl.close()
        self.soup.close()
        return 
def main():

    # Bloomberg lists all company names across these pages
    PAGES = ['0-9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
             'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'other']

    # Initialize collector
    collectorTitle = ['TICKER', 'NAME', 'EXCHANGE', 'WEBSITE', 'INDUSTRY', 'SECTOR', 'SUMMARY']
    execCollectorTitle = ['TICKER', 'NAME', 'EXCHANGE', ('POSITION TITLE', 'EXECUTIVE\'S NAME')]

    myCSVwriter(CSVNAME, collectorTitle, 1)
    myCSVwriter(EXECSCSVNAME, execCollectorTitle[0:3] + [elt for tup in execCollectorTitle[3] for elt in tup], 1)

    for p in PAGES:
        soup = bs(urllib2.urlopen('http://www.bloomberg.com/markets/companies/a-z/' + p + '/'))

        # Get remaining pages in the $p category, then loop over scraping those
        try:
            rem_pages = [str(i['href']) for i in soup.find('div', {'class': 'dictionary_pagination'}).findAll('a')]
        except:
            rem_pages = []
            getPageData(soup)
            # print 'Finished 1st page of ' + p

        # Collect data on remaining pages of $p
        for r in rem_pages:
            getPageData(bs(urllib2.urlopen('http://www.bloomberg.com' + r)))
Example #8
def classes(text):
    soup = bs(text)
    search = soup.body.findAll('td', attrs={'align': 'right'})
    for row in search:
        if row.br:
            set = str(row)
    soup = bs(set)
    grade = soup.td.contents[0]
    return grade.split('%')[0]
Example #9
def classes(text):
	soup = bs(text)
	search = soup.body.findAll('td',attrs={'align':'right'})
	for row in search:
		if row.br:
			set = str(row)
	soup = bs(set)
	grade = soup.td.contents[0]
	return grade.split('%')[0]
Example #10
	def __search_results(self, page):
		start = time.time()
		if page == 1:
			results = bs(urlopen(baseURL + queryString + self.searchTerm), parseOnlyThese = ss('a','result_primary_link'))
		else:
			results = bs(urlopen(baseURL + queryString + self.searchTerm + searchPageString + str(page)), parseOnlyThese = ss('a','result_primary_link'))
		for link in results.contents:
			if link['result-type'] == 'Talk' and not link['href'] in self.listOfPosts:
				Investigator.__result(self, link['href'])
		print "__search_results Elapsed Time: %s" % (time.time() - start), self.searchTerm, ' page: ', page
Example #11
def make_trial_soup():
    xml = ''.join(f_data)
    soup  = bs(xml)
    ssoup = bss(xml)

    trial_soup = [] #each item of list is BeautifulSoup
    for i in ssoup('trial'):
        j = bs(str(i))
        trial_soup.append(j)
    return trial_soup
    def content_registry_pretty_message(cls, message):
        messageBody = ''
        try:
            if '<html' in message:
                messageBody = bs(message).find('body').text
            elif '<?xml' in message:
                messageBody = bs(message).find('response').text
        except:
            messageBody = message

        return messageBody
def main():
    oldXmlReport = sys.argv[1]
    newXmlReport = sys.argv[2]
    oldSoup = bs(file(oldXmlReport).read(), convertEntities=bs.HTML_ENTITIES)
    newSoup = bs(file(newXmlReport).read(), convertEntities=bs.HTML_ENTITIES)
    
    begin = time.time()
    comparator = create_comparator(oldSoup, newSoup)
    end = time.time()
    print '\n# took {0:.2f} secs to build the stats ...\n'.format(end - begin)
    
    comparator.show_options()
def main():
    oldXmlReport = sys.argv[1]
    newXmlReport = sys.argv[2]
    oldSoup = bs(file(oldXmlReport).read(), convertEntities=bs.HTML_ENTITIES)
    newSoup = bs(file(newXmlReport).read(), convertEntities=bs.HTML_ENTITIES)

    begin = time.time()
    comparator = create_comparator(oldSoup, newSoup)
    end = time.time()
    print "\n# took {0:.2f} secs to build the stats ...\n".format(end - begin)

    comparator.show_options()
Example #15
def schedule(text):
	soup = bs(text)
	l = soup.body.table.findAll('td', attrs={'class':'scheduleBody'})
	final = []
	for row in l:
		if 'portal' in str(row):
			if row:
				sp = bs(str(row))
				url = sp.a['href']
				name = sp.a.b.contents[0]
				final.append({'url':url,'name':name})
	return final
Example #16
def schedule(text):
    soup = bs(text)
    l = soup.body.table.findAll('td', attrs={'class': 'scheduleBody'})
    final = []
    for row in l:
        if 'portal' in str(row):
            if row:
                sp = bs(str(row))
                url = sp.a['href']
                name = sp.a.b.contents[0]
                final.append({'url': url, 'name': name})
    return final
    def get_song_list(self):
        if self.list_of_url:
            for line in open(self.file):
                next_url = line.strip()
                soup = bs(self._get_content(next_url))
                self.log.debug(
                    "Q-length : %d, Parsing URL : %s" %
                    (self.command_queue.qsize(), next_url))
                yield self.parse_html(soup)
        else:
            soup = bs(self._get_content())
            yield self.parse_html(soup)
Example #18
def get_video_from_part_link(part_link):
    reg='file: "(.+?)"'
    pattern=re.compile(reg)



    basic_url='http://nbahd.com/'
    req = urllib2.Request(url=part_link,headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
    request=urllib2.urlopen(req)
    html=request.read()
    soup=bs(html)
    try:
        tag=soup.findAll('div',{'class':'page-content rich-content'})[0]
    except:
        tag=soup.findAll('div',{'class':'entry-content rich-content'})[0]


    tag=tag.findAll('iframe')[0]
    url=tag['src']
    url=basic_url + url
    request=urllib2.urlopen(url)
    html=request.read()
    soup=bs(html)

    try:
        video_tag=re.findall(pattern,html)
        my_addon = xbmcaddon.Addon()
        HD = my_addon.getSetting('quality')
        
        if HD=='false':
            ind=1
        else:
            ind=0
        
        src=video_tag[ind]
    except:
        video_tag=soup.findAll('video')[0]
        my_addon = xbmcaddon.Addon()
        HD = my_addon.getSetting('quality')
        
        if HD=='false':
            ind=1
        else:
            ind=0
        tag=video_tag.findAll('source')[ind]
        src=tag['src']


        
        
    
    return(src)
Example #19
def get_video_from_part_link(part_link):
    reg = 'file: "(.+?)"'
    pattern = re.compile(reg)

    basic_url = 'http://nbahd.com/'
    req = urllib2.Request(
        url=part_link,
        headers={
            'User-Agent':
            ' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'
        })
    request = urllib2.urlopen(req)
    html = request.read()
    soup = bs(html)
    try:
        tag = soup.findAll('div', {'class': 'page-content rich-content'})[0]
    except:
        tag = soup.findAll('div', {'class': 'entry-content rich-content'})[0]

    tag = tag.findAll('iframe')[0]
    url = tag['src']
    url = basic_url + url
    request = urllib2.urlopen(url)
    html = request.read()
    soup = bs(html)

    try:
        video_tag = re.findall(pattern, html)
        my_addon = xbmcaddon.Addon()
        HD = my_addon.getSetting('quality')

        if HD == 'false':
            ind = 1
        else:
            ind = 0

        src = video_tag[ind]
    except:
        video_tag = soup.findAll('video')[0]
        my_addon = xbmcaddon.Addon()
        HD = my_addon.getSetting('quality')

        if HD == 'false':
            ind = 1
        else:
            ind = 0
        tag = video_tag.findAll('source')[ind]
        src = tag['src']

    return (src)
def get_urls_robtex(ip):
    request = urllib2.Request("http://ip.robtex.com/%s.html" % ip)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
    robtex = bs(urllib2.urlopen(request))
    websiteslist = []
    tmp = robtex.findAll("span", {"id": re.compile("dns")})

    a = bs(str(tmp))

    for url in a('a'):
        if url.string is not None:
            websiteslist.append("http://" + url.string)

    return websiteslist
Example #21
    def update(self):
        if self.steamalias:
            data = urllib2.urlopen('http://steamcommunity.com/id/%s' % self.userid).read()
        else:
            data = urllib2.urlopen('http://steamcommunity.com/profiles/%s' % self.userid).read()
            
            
        data = bs(data, convertEntities=bs.HTML_ENTITIES)
        try:
            self.username = data.find(id="mainContents").h1.contents[0].strip()
        except Exception:
            return
        try:
            self.status = data.find(id='statusOnlineText').string
            self.status = 1
        except Exception:
            pass
        if not self.status:
            try:
                self.game = data.find(id='statusInGameText').string.strip()
                self.status = 2
            except Exception:
                pass
        if not self.status:
            try:
                if data.find('p', 'errorPrivate'):
                    self.status = 3
            except Exception:
                pass
        if not self.status:
            try:
                self.lastseen = data.find(id='statusOfflineText').string.replace('Last Online: ',"")
                self.status = 0
            except Exception:
                pass

        if self.status == 2: # The user is in-game, retrieve the ip if possible
            try:
                friendurl = data.find(id='friendBlocks').div.div.div.a['href']
            except Exception:
                return
            
            friendurl = friendurl +'/friends'
            data = data = urllib2.urlopen(friendurl).read()
            data = bs(data, convertEntities=bs.HTML_ENTITIES)
            try:
                self.server = data.find('a', text=self.username).parent.parent.span.find('a')['href'][16:]
            except Exception:
                pass
Example #22
    def __search_results(self, page):
        start = time.time()
        if page == 1:
            results = bs(urlopen(baseURL + queryString + self.searchTerm),
                         parseOnlyThese=ss('a', 'result_primary_link'))
        else:
            results = bs(urlopen(baseURL + queryString + self.searchTerm +
                                 searchPageString + str(page)),
                         parseOnlyThese=ss('a', 'result_primary_link'))
        for link in results.contents:
            if link['result-type'] == 'Talk' and link['href'] not in self.listOfPosts:
                Investigator.__result(self, link['href'])
        print "__search_results Elapsed Time: %s" % (
            time.time() - start), self.searchTerm, ' page: ', page
Example #23
def crs():
    for x in range(0,len(list_subject[0])):
        print str(x)+'|'+list_subject[1][x]+"|"+list_subject[2][x]+"|"+list_subject[3][x]
    select=raw_input("Select the subject by entering its serial number; if you want to choose a faculty other than the one you registered with, add \"d\" at the end (eg: \"2d\" without the quotes): ")
    if select[-1]=='d':
        select=int(select[0])
        ch=facchoice(select)
        ch=ch.split(' - ')
        crsp=crpg%(list_subject[1][select],list_subject[3][select],ch[0])
        fac=ch[1]
    else:        
        crsp=crpg%(list_subject[1][int(select)],list_subject[3][int(select)],list_subject[6][int(select)])
        fac=list_subject[5][int(select)]
    url=baseurl+crsp
    k=br.open(url)
    hh = k.read()
    souph = bs(hh)
    input = souph.findAll('input')
    name2 = input[2].get('name')
    name3 = input[3].get('name')
    name4 = input[4].get('name')
    value2 = input[2].get('value')
    value3 = input[3].get('value')
    value4 = input[4].get('value')
    formdat = { name2 : value2,name3 : value3,name4 : value4}
    data_encoded = urllib.urlencode(formdat)
    response = br.open('https://vtop.vit.ac.in/student/coursepage_view3.asp', data_encoded)
    htmla=response.read()
    soupa=bs(htmla)
    lst=[h for h in soupa.findAll('a')]
    link=[each.get('href') for each in lst]
    for x in link :
        try:
            u = urllib2.urlopen(baseurl+"/"+x) #Testing if link is valid and downloadable
        except:
            link.remove(x)
    
    path_folder=createfold(list_subject[1][int(select)]+'-'+list_subject[2][int(select)],fac)
    for x in range(0,len(link)):
        try:
            url = link[x]
            file_nam = lst[x].text
            ##print file_nam
            down(url,file_nam,path_folder)
        except Exception,e:
            print e
            pass            
        print '*'*79
Example #24
def main():
    # Scrape all the comic data. There are 9 chapters with < 150 pages each
    comic = {}
    for chapter in range(10):
        for page in [str(x).zfill(2) for x in range(150)]:
            try:
                res = requests.get(
                    'http://www.casualvillain.com/Unsounded/comic/ch0%d/ch0%d_%s.html'
                    % (chapter, chapter, page))
                if res.ok:
                    quiet = comic.setdefault(chapter, {})
                    comic[chapter][int(page)] = res.content
            except:
                continue

    # I know that since the indexes are ints, they are likely to be in order,
    # but sorting to be on the safe side.
    chapters = comic.keys()
    chapters.sort()

    for chapter in chapters:
        data = comic[chapter]
        pages = data.keys()
        pages.sort()
        if len(pages) != max(pages):
            print 'Missing pages from chapter %d' % chapter
        for page in pages:
            try:
                page_data = data[page]
                soup = bs(page_data)
                this_page = 'ch%s_%s.html' % (str(chapter).zfill(2),
                                              str(page).zfill(2))
                try:
                    next_link = soup.findAll("a",
                                             {"class": "forward"})[0]['href']
                except:
                    # At the end of chapters there is no link.
                    next_link = 'ch%s_%s.html' % (str(chapter + 1).zfill(2),
                                                  str(1).zfill(2))
                comic_element = soup.find("div", {"id": "comic"}).find('img')
                link_soup = bs('<a href="%s">' % next_link)
                link_soup.find('a').insert(0, comic_element.extract())
                soup.find("div", {"id": "comic"}).insert(0, link_soup)
                prettyHTML = soup.prettify()
                with open(this_page, 'w') as f:
                    print >> f, prettyHTML
            except Exception as exc:
                print 'Bad chapter %d page %d %s' % (chapter, page, repr(exc))
Example #25
def get_urls(br):
    raw_urls = []
    src = br.page_source
    soup = bs(src)
    for a in soup.findAll('a', href=True):
        raw_urls.append(urlparse.urljoin(archive_link, a['href']))
    return raw_urls
Example #26
    def parse_item(self, response):
        '''
        Visit each news page and extract the item field values.
        :param response:
        :return:
        '''
        logUtil.getLog().info('news url :%s' % response.url)

        item = FenghuoItem()
        root = bs(response.body)
        item['topPost'] = "1"
        item["site_id"] = "13"
        item['website_id'] = ''
        item["site_name"] = '通山县机构编制网'
        item["area"] = "958"
        item["site_weight"] = "2"
        item['countryid'] = "1156"
        item['province'] = "1673"
        item['city'] = "136"
        item["ip"] = socket.gethostbyname("www.tsxbb.gov.cn")
        item["site_url"] = "www.tsxbb.gov.cn"
        item["forumurl"] = response.meta['forumurl']
        item["site_cls"] = '1'
        item["url"] = response.url
        item["subname"] = root.find("span", attrs={
            "class": "text14h"
        }).find("a", attrs={
            "href": "../"
        }).text
        item["title"] = root.find("td", attrs={"class": "textbiaoti"}).text
        str = root.find("td", attrs={"class": "text12hui"}).text
        str = str[str.index('20'):]
        item["pubdate"] = str[:str.index('&nbsp;') - 1]
        try:
            str = str[str.index('su = ') + 6:]
            item["website_id"] = str[:str.index(';') - 1]
        except:
            item["website_id"] = ""
        styles = root.find("div", attrs={
            "class": "TRS_Editor"
        }).findAll("style")
        for style in styles:
            style.clear()
        # Replace all image tags with their prettified markup
        imgs = root.find("div", attrs={"class": "TRS_Editor"}).findAll("img")
        for img in imgs:
            img.replaceWith(img.prettify())

        item["txt"] = root.find("div", attrs={
            "class": "TRS_Editor"
        }).text.replace("\r\n",
                        "$*huanhang*$").replace("\n", "$*huanhang*$").replace(
                            "\"", "'").replace("<br />", "$*huanhang*$")
        item["txt_len"] = len(item["txt"])
        item["domain_1"] = "tsxbb.gov.cn"
        item["domain_2"] = ""
        item["snatch_time"] = datetime.datetime.now().__format__("")

        item["task_id"] = response.meta['task_id']
        self.saveData.saveContext(item)
def search_album(singer, song):
	url = 'http://www.allmusic.com/search/song/' + singer + '%20' + song
	i=0
	for x in range(10) :
		try:
			website_html = requests.get(url).text
			soup = bs(website_html)
		except requests.exceptions.RequestException as e:
			i+=1
			continue
		break
	if i>=9:
		return -1

	
	compare = 0
	temp = []
	for line in soup.findAll("li",{'class':"song"}):
		
		for par in line.findAll("div"):
			if(par.get('class')=="title" and par.text.replace('\"','').lower()==song.lower()):
				compare += 1
				temp.append(par)
				continue
			if(par.get('class')=="performers" and par.find('a').contents[0].lower()==singer.lower()):
				compare += 1
				continue
		compare = 0
	if len(temp) != 0:
		for a in temp[0].findAll('a',href = True):
			return  search_album_name(a['href'])
	return -1
Example #28
    def manageregex(self, pattern):
        out_folder = constants.fileloc + "files/"
        try:
            os.mkdir(out_folder)
        except:
            pass

        if self.direct:
            urlretrieve(self.url, out_folder + "/" + self.url.split()[-2:])
        else:
            soup = bs(urlopen(self.url))
            parsed = list(urlparse.urlparse(self.url))
            tota = soup.findAll("a")
            tot = len(tota)
            n = 0
            pat = re.compile(pattern)
            for a in tota:
                n += 1
                try:
                    if pat.match(str(a['href'])):
                        filename = a["href"].split("/")[-1]
                        parsed[2] = a["href"]
                        outpath = os.path.join(out_folder, filename)
                        if a['href'].lower().startswith("http"):
                            urlretrieve(a['href'], outpath)
                        else:
                            urlretrieve(urlparse.urljoin(self.url, a['href']), outpath)
                            yield (n*100)/tot
                except:
                    pass
Example #29
    def manageimages(self):
        out_folder = constants.fileloc + "images/"
        try:
            os.mkdir(out_folder)
        except:
            pass

        if self.direct:
            urlretrieve(self.url, out_folder + "/" + self.url.split()[-2:])
        else:
            soup = bs(urlopen(self.url))
            parsed = list(urlparse.urlparse(self.url))
            totim = soup.findAll("img")
            tot = len(totim)
            n = 0
            for image in totim:
                n += 1
                filename = image["src"].split("/")[-1]
                parsed[2] = image["src"]
                outpath = os.path.join(out_folder, filename)
                try:
                    if image["src"].lower().startswith("http"):
                        urlretrieve(image["src"], outpath)
                    else:
                        urlretrieve(urlparse.urljoin(self.url, image["src"]), outpath)
                        yield (n*100)/tot
                except:
                    pass
Example #30
def get_episodes(season,season_num):
	url=season
	if domain not in url:
		url=domain + season
	html=read_url(url)
	soup=bs(html)
	tag=soup.find('div',{'class':'Episode'})
	reg=re.compile('<a href="(.+?)".+?>')
	links=list(re.findall(reg,str(tag)))
	reg2=re.compile('</strong> (\d+) - (.+?)</a>')
	names=re.findall(reg2,str(tag))
	out=[]
	last_num=0
	spec=addon.get_setting('specials')
	if spec=='false':
		for i in range(len(links)):
			check=int(names[i][0])-last_num
			if 'special:' not in names[i][1].lower() and check==1:
				out+=[[links[i],names[i][1],season_num,names[i][0]]]
				last_num=int(names[i][0])

		imdb=re.compile('[\"\']http://www.imdb.com/title/(.+?)[\"\']')
		imdb=re.findall(imdb,str(soup))[0]
		return imdb,out

	else:
		for i in range(len(links)):
			out+=[[links[i],names[i][1],season_num,names[i][0]]]

		imdb=re.compile('[\"\']http://www.imdb.com/title/(.+?)[\"\']')
		imdb=re.findall(imdb,str(soup))[0]
		return imdb,out
def search_album(singer, song):
    url = 'http://www.allmusic.com/search/song/' + singer + '%20' + song
    i = 0
    for x in range(10):
        try:
            website_html = requests.get(url).text
            soup = bs(website_html)
        except requests.exceptions.RequestException as e:
            i += 1
            continue
        break
    if i >= 9:
        return -1

    compare = 0
    temp = []
    for line in soup.findAll("li", {'class': "song"}):

        for par in line.findAll("div"):
            if (par.get('class') == "title"
                    and par.text.replace('\"', '').lower() == song.lower()):
                compare += 1
                temp.append(par)
                continue
            if (par.get('class') == "performers"
                    and par.find('a').contents[0].lower() == singer.lower()):
                compare += 1
                continue
        compare = 0
    if len(temp) != 0:
        for a in temp[0].findAll('a', href=True):
            return search_album_name(a['href'])
    return -1
Example #32
def PLAY_FULL(name, url, iconimage):
    albumlist = []
    link = client.request(url)
    soup = bs(link)
    threads = []
    album_icon = iconimage
    print("ALBUM ICON", album_icon)
    r = soup.find('div', {'class': 'artist-songs'})
    global count

    reg = re.compile(
        '<div class="song-name"><a href="([^"]+)">(.*?)</a></div>')
    result = re.findall(reg, str(r))
    count = 0
    playlist = xbmc.PlayList(0)
    playlist.clear()
    progressDialog = control.progressDialog
    progressDialog.create('Karma', '')
    progressDialog.update(0)
    for url, title in result:
        if progressDialog.iscanceled(): break
        count += 1
        url = re.sub('/track/', '/download/', url)
        url = base_url + url
        title = client.replaceHTMLCodes(title)
        progress = (float(count) / float(len(result))) * 100
        progressDialog.update(int(progress),
                              'Retrieving and Checking Songs...', title)
        w = workers.Thread(fetch_album, url, title, album_icon)
        w.start()
        w.join()
    xbmc.Player().play(playlist)
def search_album_name(url):
    if url == '':
        return -1
    i = 0
    for x in range(10):
        try:
            website_html = requests.get(url).text
            soup = bs(website_html)
        except requests.exceptions.RequestException as e:
            i += 1
            continue
        break
    if i >= 9:
        return -1

    albums = []
    for a in soup.findAll("td", {'class': "artist-album"}):
        if 'Various Artists' in a.find("span", {'itemprop': "name"}).text:
            continue
        for b in a.findAll("div", {'class': "title"}):
            albums.append(b.find('a').text)
    if len(albums) != 0:
        return albums
    else:
        return -1
Example #34
    def tp(self, url, tweet):
        '''
        api_location = {}
        try:
            json_reply= simplejson.load(urllib.urlopen("http://api.twitpic.com/2/media/show.json?id="+url.path[1:]))
            if 'location' in json_reply :
                api_location['from'] = 'twitpic_api'
                api_location['time'] = json_reply['timestamp']
                api_location['coordinates'] = json_reply['location']
        except simplejson.JSONDecodeError:
            #print "error produced by http://api.twitpic.com/2/media/show.json?id="+url.path[1:]
        '''
        try:
            # Handle some bad HTML in twitpic
            html = urllib.urlopen(url.geturl()).read()
            html = html.replace('</sc"+"ript>', '')
            soup = bs(html)
            # Grabs the photo from cloudfront
            temp_file = os.path.join(self.photo_dir, url.path[1:])
            photo_url = soup.find(attrs={
                "id": "content"
            }).find(src=re.compile("cloudfront"))['src']
            urllib.urlretrieve(photo_url, temp_file)
            return [self.exif_extract(temp_file, tweet)]
        except Exception:
            err = 'Error trying to download photo'
            self.errors.append({
                'from': 'twitpic',
                'tweetid': tweet.id,
                'url': url.geturl(),
                'error': err
            })
            return []
Example #35
    def parse_pages(self, response):

        self.log('Hi, this is the second page %s' % response.url)
        root = bs(response.body)
        forumurl = response.url
        pageid = forumurl[forumurl.index("orum-") +
                          5:forumurl.index("-1.html")]
        print forumurl
        try:
            pageText = root.find("div", attrs={
                "class": "pg"
            }).find("span").text
            pageText = pageText[pageText.index("/") + 2:]
            totalpage = pageText[:pageText.index(" ")]
        except:
            totalpage = 1
        root_url = "http://www.baotuowang.com/forum.php?mod=forumdisplay&fid="
        for i in range(1, int(totalpage) + 1):
            url = root_url + pageid + "&page=" + str(i)
            yield scrapy.Request(url,
                                 self.parse_page,
                                 meta={
                                     'forumurl': forumurl,
                                     'pageid': pageid
                                 })
Example #36
def check_spider():
    new = 0
    page = 1
    while new > 20:
        new = 0
        r = requests.get(url)
        root = bs(r.text)
        page += 1
        url = "http://www.cntongshan.com/News/NewsList-0-2-AA-p" + str(
            page) + ".html"

        uls = root.find("div", attrs={
            "class": "ListMain"
        }).findAll("ul", attrs={"class": "l_l"})
        for ul in uls:
            lis = ul.findAll("li")
            for li in lis:
                div = li.find("div")
                curl = div.find("a").get("href")
                li.find("div").clear()
                datestr = li.text
                month = datestr[:datestr.index("月")]
                day = datestr[datestr.index("月") + 1:datestr.index("日")]
                hour = datestr[datestr.index("日") + 2:datestr.index(":")]
                minute = datestr[datestr.index(":") + 1:]
                date_t = datetime(2015, int(month), int(day), int(hour),
                                  int(minute))
                if date_t > last_t:
                    new += 1
                    print date_t
                    spider_cnts(curl)
Example #37
def get_ttv():
    url='http://www.acesportstream.com'
    url=read_url(url)
    soup=bs(url)
    channels1=soup.find('div',{'id':'hd'}).findAll('a')
    channels2=soup.find('div',{'id':'blue'}).findAll('a')

    
    for channel in channels1:
        link=channel['href']
        img=channel.find('img')['src']
        name=clean(cleanex(channel['title']))

        url = build_url({'mode': 'open_ttv_stream','url':link, 'name':name.encode('ascii','ignore')})
        li = xbmcgui.ListItem('%s'%name, iconImage=img)
        li.setProperty('IsPlayable', 'true')

        xbmcplugin.addDirectoryItem(handle=addon_handle, url=url, listitem=li)
    for channel in channels2:
        link=channel['href']
        img=channel.find('img')['src']
        name=clean(cleanex(channel['title']))

        url = build_url({'mode': 'open_ttv_stream','url':link, 'name':name.encode('ascii','ignore')})
        li = xbmcgui.ListItem('%s'%name, iconImage=img)
        li.setProperty('IsPlayable', 'true')

        xbmcplugin.addDirectoryItem(handle=addon_handle, url=url, listitem=li)

    
    xbmcplugin.endOfDirectory(addon_handle)
Example #38
def download( url ) :
	'''
		Pull the page and parse it into the pieces we need.
	'''
	cookieJar = cookielib.LWPCookieJar()
	if os.path.isfile( kCookieFile ) :
		cookieJar.load( kCookieFile )
	else :
		cookieJar.save( kCookieFile )
	opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookieJar ))

	link = opener.open( url )

	page = link.read()
	soup = bs( page )


	scores = soup.findChildren( 'table', { "class" : "scores" } )

	for i, aSection in enumerate( scores ) :
		scoresArray = []
		away = ""
		home = ""
		teams = aSection.findChildren( None, { "class" : "yspscores team" } )
		for i, aTeam in enumerate( teams ) :
			name = aTeam.findChild( 'a' )
			if 0 == i :
				away = name.text
			else :
				home = name.text
		qtrScores = aSection.findChildren( None, { "class" : "yspscores" } )
		for i, qtr in enumerate( qtrScores ) :
			scoresArray.append( cleanText( qtr.text ))

		printScores( away, home, scoresArray )
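A minimal invocation sketch for the download helper above; the scores-page URL is a placeholder, not taken from the original project:

if __name__ == '__main__':
    # Placeholder URL -- the real scores page address is not shown above.
    download('http://example.com/scoreboard')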
Example #39
def Tureng(word):
    f = urllib.urlopen("http://tureng.com/en/turkish-english/" + str(word))
    soup = bs(f)
    dummy = 0
    emocan = 0
    for string in soup.findAll('a'):
        if int(dummy) > 20:
            wflag = 0
            for x in range(0, len(uList)):
                if string.string == uList[x]:
                    wflag = 1
                    break
            if wflag == 0:
                if string.string != None:
                    for tr in range(0, len(tr_Error)):
                        if tr_Error[tr] in string.string:
                            string.string = string.string.replace(
                                tr_Error[tr], fix[tr])
                    if emocan == 0:
                        try:
                            print string.string.decode('utf-8'), ' ---> ',
                        except:
                            print string.string, ' ---> ',
                        emocan = 1
                    else:
                        try:
                            print string.string.decode('utf-8')
                        except:
                            print string.string
                        emocan = 0
        dummy += 1
Example #40
def get_data_from_page(schedule):
        # get data from each tamu bus schedule page
        for route_page in route_pages:

                bus_page = urllib2.urlopen('http://transport.tamu.edu/busroutes/Route' + route_page + '.aspx')

                data =  bs(bus_page)

                # store schedule for this particular route
                route_bus_stops = []

                # get all bus stops and their schedule
                for table in data.findAll('table'):
                        #get bus stop names
                        for tr in table.findAll('tr')[1]:
                                route_bus_stops.append(tr.string.strip())
                        
                        #get schedule
                        for tr in table.findAll('tr')[2:]:
                                for td in tr.findAll('td'):
                                        if td.string not in blanks and td.nextSibling.string not in blanks:
                                                schedule.append((route_bus_stops[tr.index(td)], td.string, route_bus_stops[tr.index(td.nextSibling)], td.nextSibling.string, route_page))
                                                # print route_bus_stops[tr.index(td)], td.string, route_bus_stops[tr.index(td.nextSibling)], td.nextSibling.string, route_page
                print
                
        print 'Schedule:', len(schedule)
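A driver sketch for the function above, assuming `route_pages` and `blanks` are module-level globals used by the example; the values here are placeholders:

route_pages = ['01', '02']       # placeholder route ids
blanks = [None, '', u'\xa0']     # placeholder "blank cell" markers
schedule = []
get_data_from_page(schedule)
print 'Total entries:', len(schedule)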
Example #41
def save_wordlist(raw_page):
	soup = bs(raw_page)
	wordlist = str.split(soup.__str__())
	f = open(PATH, 'a')
	for word in wordlist:
		f.write(word+'\n')
	f.close()
Example #42
def send_pings(post):
    logger.debug("send_pings entered")
    if settings.DEBUG:
        logger.warn("Not sending pings in debug")
        return
    if post.status == 'publish':
        # check for outgoing links.
        target_urls = []
        logger.debug("post.body")
        soup = bs(post.get_formatted_body())
        logger.debug(str(soup))
        for a in soup.findAll('a'):
            target_url = a.get('href', None)
            if target_url:
                logger.info("Got URL:" + a.get('href'))
                target_urls.append(target_url)

        logger.info("Checking out %d url(s)" % len(target_urls))
        for url in target_urls:
            pb_urls, tb_urls = get_ping_urls(url)
            for pb in pb_urls:
                logger.info("Got pingback URL: %s" % pb)
                pingback_ping(post.get_absolute_url(),
                              pb,
                              post=post,
                              outgoing=True)
            for tb in tb_urls:
                logger.info("Got trackback URL: %s" % url)
                trackback_ping(post.get_absolute_url(),
                               tb,
                               post=post,
                               outgoing=True)
Example #43
def get_links_putlocker(show, season, episode):

    show = show.replace(' 2014', '').replace(' 2015', '')
    show = show.rstrip().replace(' ', '-').replace('!', '').replace(
        '?', '').replace('--', '')

    url = 'http://putlocker.is/watch-%s-tvshow-season-%s-episode-%s-online-free-putlocker.html' % (
        show, season, episode)
    print(url)
    read = read_url(url)

    soup = bs(read)
    table = soup.findAll(
        'table', {
            'class': 'table',
            'border': '0',
            'cellspacing': '0',
            'cellpadding': '0',
            'width': '100%'
        })[2]

    trs = table.findAll('tr')
    results = []

    reg = 'http://www.(.+?)/'
    pat = re.compile(reg)
    for i in range(len(trs)):

        try:
            link = trs[i].find('td', {'width': '100%'}).find('a')['href']
            title = re.findall(pat, link)[0]
            results.append([title, link])
        except:
            pass
    return results
Example #44
    def managefiles(self):
        out_folder = constants.fileloc + "files/"
        try:
            os.mkdir(out_folder)
        except:
            pass

        if self.direct:
            urlretrieve(self.url, out_folder + "/" + self.url.split()[-2:])
        else:
            soup = bs(urlopen(self.url))
            parsed = list(urlparse.urlparse(self.url))
            tota = soup.findAll("a")
            tot = len(tota)
            n = 0
            for a in tota:
                n += 1
                filetype = a['href'].split(".")[-1]
                if filetype in self.format:
                    filename = a["href"].split("/")[-1]
                    parsed[2] = a["href"]
                    outpath = os.path.join(out_folder, filename)
                    try:
                        if a['href'].lower().startswith("http"):
                            urlretrieve(a['href'], outpath)
                        else:
                            urlretrieve(urlparse.urljoin(self.url, a['href']), outpath)
                            yield (n*100)/tot
                    except:
                        pass
Example #45
def get_category(site, page):
    if page == '1':
        pass
    else:
        site = site + '%s/' % page

    reg = 'href="(.+?)"'
    pat = re.compile(reg)

    req = urllib2.Request(
        url=site,
        headers={
            'User-Agent':
            ' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'
        })
    request = urllib2.urlopen(req)
    html = request.read()
    soup = bs(html)
    linksout = []
    tags = soup.findAll('article')
    for i in range(len(tags)):
        names = tags[i].find('h1', {'class': 'entry-title'})
        h = HTMLParser.HTMLParser()
        ime = h.unescape(names.getText())
        link = re.findall(pat, str(names))[0]
        img = tags[i].find('img')['src']
        ps = len(tags[i].findAll('p')) - 2

        linksout += [[link, ime, img]]

    return linksout
def search_genre(singer, album):
	#genre = d.search(title, artist=singer)
	url = 'http://www.allmusic.com/search/albums/' + album + '%20' + singer
	i = 0
	for x in range(10):
		try:
			website_html = requests.get(url).text
			soup = bs(website_html)
		except requests.exceptions.RequestException as e:
			i+=1
			continue
		break
	if i>=9:
		return -1

	
	_genres={}
	for a in soup.findAll("div",{'class':"genres"}):
		for b in a.text.split(','):
			_genres[b] = 1
		break

	if _genres is not None:
		return _genres.keys()
	else:
		return -1
Example #47
def get_iwatch_links(url):
	try:
		links=[]
		hosts=[]
		html=read_url(url)
		soup=bs(html)
		table=soup.find('table',{'id':'streamlinks'})
		trs=table.findAll('tr')
		trs.pop(0)
		for i in range(len(trs)):
			try:
				item=trs[i]

			
				link=item.find('td').find('a')['href']
				host=item.find('td').find('a').getText().lstrip().rstrip().lower().lstrip('.')
				
				ind=host.index('<')
				links.append(link)
				hosts.append(host[:ind])
			except:
				pass
			
		

		return links,hosts
	except:
		return [],[]
def search_album_name(url):
	if url == '':
		return -1
	i=0
	for x in range(10) :
		try:
			website_html = requests.get(url).text
			soup = bs(website_html) 
		except requests.exceptions.RequestException as e:
			i+=1
			continue
		break
	if i>=9:
		return -1

	albums = []
	for a in soup.findAll("td",{'class':"artist-album"}):
		if 'Various Artists' in a.find("span",{'itemprop':"name"}).text:
			continue
		for b in a.findAll("div",{'class':"title"}):
			albums.append(b.find('a').text)
	if len(albums)!=0:
		return albums
	else:
		return -1
Example #49
def discover_feeds(url):
    """
    Returns a list of possible candidate feeds found or an error message.

    Results are returned as a two-tuple of (success, data) where data
    will be an error message if not successful.

    """
    try:
        data = get_resource(url)
        logger.debug("got data")
        soup = bs(data)
        link_list = soup('link')
        candidates = []
        for link in link_list:
            mime = link.get('type')
            if mime in ['application/atom+xml', 'application/rss+xml']:
                uri = complete_uri(url, link.get('href'))
                candidates.append((link.get("title", uri), uri))
        if not candidates:
            return False, "No feeds found."
        return True, candidates
    except urllib2.URLError as e:
        if hasattr(e, 'reason'):
            msg = "I failed to reach a server: %s" % e.reason
            return False, msg
        elif hasattr(e, 'code'):
            msg = "The server couldn't fulfill our request (%s %s)" % (e.code, httplib.responses[e.code])
            return False, msg
    return False, "Unknown error."
Example #50
def warwick_ical():
    """Declare variables"""
    warwickurl = "http://www.eng.warwick.ac.uk/cgi-bin/timetable"

    bot = httpbot.HttpBot()
    bot.POST('https://websignon.warwick.ac.uk/origin/slogin?providerId=urn%3Awww.eng.warwick.ac.uk%3Ageneral%3Aservice&target=http://www.eng.warwick.ac.uk/cgi-bin/timetable', {'userName': warwickusername, 'password':warwickpassword})
    response = bot.GET(warwickurl)

    soup = bs(response)

    try:
        ical_url = soup.findAll('a')[0]['href'].replace('webcal','http')
    except KeyError:
        while 1:
            again = raw_input('Login Error! Try again? (y) ')
            if again == 'y':
                return warwick_ical()
                break
            elif again == 'n':
                exit(0)
            else:
                print 'Please enter y or n'
            
   
    return urllib2.urlopen(urllib2.Request(ical_url)).read()
Example #51
    def __user(self, user):
        try:
            start = time.time()
            inQueue = Queue()
            outQueue = Queue()
            processes = []
            links = bs(urlopen(baseURL + user + '/activity'),
                       parseOnlyThese=ss('a', href=re.compile('/post/a.')))
            for link in links.contents:
                if link['href'] not in self.visitedPosts:
                    inQueue.put(link['href'])
                    self.visitedPosts.append(link['href'])
            for i in range(cpu_count()):
                p = Process(target=Investigator.__posts,
                            args=(self, inQueue, outQueue))
                p.start()
                processes.append(p)
                inQueue.put('STOP')
            for p in processes:
                p.join()
            outQueue.put('STOP')
            for post in iter(outQueue.get, 'STOP'):
                self.listOfPosts.append(post)
            print "__user Elapsed Time: %s" % (time.time() - start), user
        except HTTPError:
            print 'HTTPError:', user
Example #52
def get_livefoot(url,name):
    names,links=[],[]
    html=read_url(url)
    soup=bs(html)
    tag=soup.find('div',{'id':'maininner'})
    tag=tag.find('div',{'class':'content clearfix'})
    trs=tag.findAll('tr')
    for item in trs:
        try:
            language=item.findAll('td')[0].getText()
            txt=item.findAll('td')[1].getText()
        except:
            language='[N/A]'
            txt=''
        if language=='':
            language='[N/A]'
        if 'acestream' in txt.lower() or 'sopcast' in txt.lower():
            link=item.findAll('td')[1].find('a')['href']
            title='%s %s'%(txt,language)
            links+=[link]
            names+=[title]
        else:
            pass

    if links!=[]:
        dialog = xbmcgui.Dialog()
        index = dialog.select('Select a channel:', names)
            
        if index>-1:
            name=names[index]
            url=links[index]
            
            play_livefoot(url,name)
    else:
        xbmcgui.Dialog().ok('No stream','No stream available yet!')   
Example #53
def spider():

    print "start!"

    pwd = "/Users/bohaohan/iss/商务智能/code/img/"
    tail = ".png"
    url = "http://www.yeslux.com/pinpai.html"

    r = requests.get(url)
    r.encoding = 'gb2312'

    with open(pwd + "a" + tail, 'wb') as fd:
        for chunk in r.iter_content():
            fd.write(chunk)
    root = bs(r.text)
    div = root.find("div", attrs={'class': 'brand_main'})
    lis = div.findAll("li")

    for li in lis:
        img = li.find('img')
        name = img.get("alt")
        src = img.get("src")
        ir = requests.get(src, stream=True)
        with open(pwd + name + tail, 'wb') as fd:
            for chunk in ir.iter_content():
                fd.write(chunk)
        print name, src, "has been downloaded"

    print "finished!"
Example #54
def get_reddit_news():

	url = 'https://www.reddit.com/r/svenskpolitik/top/?sort=top&t=hour'

	b = mechanize.Browser()

	b.addheaders = [('User-agent', 'SvenskPolitikReaderBot 1.0')]

	response = b.open(url).read()

	soup = bs(str(response))

	entries = soup.findAll("div", {"class": "entry unvoted"})

	for entry in entries:
		title = entry.a.text
		link = entry.a.get('href')
		domain = entry.span.text

		print title
		print link
		print domain


		if (domain != '(self.svenskpolitik)'):
			databaseConnector.insert_news(title, link)
		else:
			print 'Avoided self post!'
Example #55
def doThreadComments( soup ) :
	'''
		Print the poster, date, and message text for every comment row
		in a forum thread page.
	'''
	commentBlock = soup.findChild( None, { "class" : "posts" })
	commentRows = commentBlock.findAll( None, { "class" : "postbit postbitim postcontainer old" })
	for i, commentRow in enumerate( commentRows ) :
		# print commentRow
		userObj = commentRow.findChild( None, { "class" : "popupmenu memberaction" })
		poster = userObj.findChild( None, { "class" : re.compile( 'username' ) } )
		poster = cleanMsg( poster )

		date = cleanMsg( commentRow.findChild( None, { "class" : "date" }))
		date = date.replace( "&nbsp;", " " )

		print poster
		print date
		print

		# brute force strip all HTML data from message for now
		msgObj = commentRow.findChild( None, { "class" : "postcontent restore" })

		#msg = ''.join( bs( str( msgObj ) ).findAll( text=True )).strip()
		msg = cleanText( ''.join( bs( str( msgObj ) ).findAll( text=True )).strip() )
		
		print msg.encode( 'ascii', 'ignore' )

		print " =============================="
Example #56
def get_links_putlocker(show,season,episode):

    show=show.replace(' 2014','').replace(' 2015','')
    show=show.rstrip().replace(' ','-').replace('!','').replace('?','').replace('--','')
    
    url='http://putlocker.is/watch-%s-tvshow-season-%s-episode-%s-online-free-putlocker.html'%(show,season,episode)
    print(url)
    read=read_url(url)

    soup=bs(read)
    table=soup.findAll('table',{'class':'table', 'border':'0','cellspacing':'0', 'cellpadding':'0', 'width':'100%'})[2]
    
    trs=table.findAll('tr')
    results=[]

    reg='http://www.(.+?)/'
    pat=re.compile(reg)
    for i in range(len(trs)):

        
        try:
            link=trs[i].find('td',{'width':'100%' }).find('a')['href']
            title=re.findall(pat,link)[0]
            results.append([title,link])
        except: pass
    return results
def search_genre(singer, album):
    #genre = d.search(title, artist=singer)
    url = 'http://www.allmusic.com/search/albums/' + album + '%20' + singer
    i = 0
    for x in range(10):
        try:
            website_html = requests.get(url).text
            soup = bs(website_html)
        except requests.exceptions.RequestException as e:
            i += 1
            continue
        break
    if i >= 9:
        return -1

    _genres = {}
    for a in soup.findAll("div", {'class': "genres"}):
        for b in a.text.split(','):
            _genres[b] = 1
        break

    if _genres is not None:
        return _genres.keys()
    else:
        return -1