def get_all_series_links_thread(tot):

    # print 'inside a thread@@@@@@@@@@@@@!!!!!!!!!!!!'
    counter = 0
    for url in tot:
        # print 'getting from: %s' % url
        counter += 1
        if TEST:
            if counter == 2:
                break
        data = opener.fetch(url)["data"]
        # ~ url_to = '%s.html'%i
        # ~ f=open(url_to,'w')
        # ~ f.write(data)
        # ~ return
        soup = BeautifulSoup(data, "lxml")
        l = soup.find_all("a")
        reg = re.compile(r".*/watch-\d+-(.*)")
        for i in l:
            if not i.has_key("href"):
                continue

            link = i.get("href")
            m = reg.match(link)
            if m:
                # ~ series_name=i.get('title')
                # ~ if 'Watch' in series_name:
                # ~ series_name=series_name[6:-7].strip()
                series_link = "http://www.1channel.ch" + m.group(0)
                all_series.append(series_link)
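# The snippets below all lean on a handful of module-level names (opener, TEST,
# base, con, tot, all_series, all_movies) and imports that are defined elsewhere
# in the original module. A minimal sketch of that scaffolding, assuming
# opener.fetch() is a thin wrapper around requests (the real implementation may differ):
import re
import json
from datetime import datetime
from base64 import standard_b64decode

import requests
import MySQLdb as mdb
from bs4 import BeautifulSoup

bs = BeautifulSoup                         # alias used by the greetings view below

try:
    from urlparse import urlparse          # Python 2, as used by these snippets
except ImportError:
    from urllib.parse import urlparse      # Python 3

TEST = False                               # when True, loops stop after a couple of pages
base = 'http://www.1channel.ch'            # site root used to rebuild absolute links

# shared, module-level accumulators filled in by the *_thread helpers
tot, all_series, all_movies = [], [], []

class _Opener(object):
    """Hypothetical stand-in for the real opener: fetch(url) -> {'data': html}."""
    def fetch(self, url):
        return {'data': requests.get(url).text}

opener = _Opener()

# con is assumed to be an open MySQLdb connection used by the database helpers,
# e.g. con = mdb.connect('localhost', 'user', 'password', 'dbname')
con = None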
def generate_all_the_main_page_name():
	l=[]
	l.append('http://www.1channel.ch/?letter=123&tv')
	if TEST:
		return l
	for i in range(ord('a'),ord('z')+1):
		l.append('http://www.1channel.ch/?letter='+str(chr(i))+'&tv')

	#generating all pages with page number	
	all_pages = [] 
	for url in l:
		page_count=1
		data=opener.fetch(url)['data']
		soup=BeautifulSoup(data)
		pag=soup.select('.pagination > a ')

		if len(pag) != 0:
			ref = pag[-1]['href']
			reg=re.compile(r'.*?page=(\d+)')
			#~ print url, l[len(l)-1]['href']
			m=reg.match(ref)
			if m:
				page_count=int(m.group(1))
				#~ print page_count
		for i in range(1,page_count+1):
			all_pages.append(url+"=&page="+str(i))
	return all_pages
Example #3
def greetings(request):
    answer = 'this is purely random text'
    try:
        # response = requests.get('http://quotesondesign.com/api/3.0/api-3.0.json')
        # json_data = json.loads(response.text)
        # answer = json_data['quote']
        url='http://ivyjoy.com/quote.shtml'

        data=opener.fetch(url)['data']

        soup=bs(data)

        # crude extraction: the quote on this page starts at a fixed character offset
        l=soup.text[1878:].split()

        l=l[:len(l)-1]

        t=" ".join(l)
        answer=t

    except:
        response = requests.get('http://quotesondesign.com/api/3.0/api-3.0.json')
        json_data = json.loads(response.text)
        answer = json_data['quote']

    answer = answer.replace('\r', ' ')
    answer = answer.replace('\n', ' ')
    answer = " ".join(answer.split())
    answer = 'Hello, Kitty! ' + answer
    return JsonResponse({"answer": answer})
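# The greetings view above is a Django view (JsonResponse comes from django.http).
# A rough sketch of how it might be wired into a urls.py; the app module name
# ("myapp") and the route are assumptions, not taken from the source:
from django.urls import path
from myapp import views

urlpatterns = [
    path('greetings/', views.greetings, name='greetings'),
]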
def generate_all_the_main_page_name():
    l = []
    l.append("http://www.1channel.ch/?letter=123&tv")
    # ~ if TEST:
    # ~ return l
    for i in range(ord("a"), ord("z") + 1):
        l.append("http://www.1channel.ch/?letter=" + str(chr(i)) + "&tv")

    # generating all pages with page number
    all_pages = []
    for url in l:
        page_count = 1
        data = opener.fetch(url)["data"]
        soup = BeautifulSoup(data, "lxml")
        pag = soup.select(".pagination > a ")

        if len(pag) != 0:
            ref = pag[-1]["href"]
            reg = re.compile(r".*?page=(\d+)")
            # ~ print url, l[len(l)-1]['href']
            m = reg.match(ref)
            if m:
                page_count = int(m.group(1))
                # ~ print page_count
        for i in range(1, page_count + 1):
            all_pages.append(url + "=&page=" + str(i))
    return all_pages
def get_all_series_links(pages):
    (all_series, counter) = ([], 0)

    for url in pages:
        counter += 1
        if TEST:
            if counter == 2:
                break
        data = opener.fetch(url)["data"]
        # ~ url_to = '%s.html'%i
        # ~ f=open(url_to,'w')
        # ~ f.write(data)
        # ~ return
        soup = BeautifulSoup(data, "lxml")
        l = soup.find_all("a")
        reg = re.compile(r".*/watch-\d+-(.*)")
        for i in l:
            if not i.has_key("href"):
                continue
            if not i.has_key("title"):
                continue
            link = i.get("href")
            m = reg.match(link)
            if m:
                series_name = i.get("title")
                if "Watch" in series_name:
                    series_name = series_name[6:-7].strip()
                series_link = "http://www.1channel.ch" + m.group(0)
                all_series.append((series_name, series_link))

    return all_series
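# Hypothetical driver showing how the two helpers above compose: build the
# per-letter listing URLs first, then harvest (series_name, series_link) pairs.
if __name__ == '__main__':
    pages = generate_all_the_main_page_name()
    series = get_all_series_links(pages)
    for series_name, series_link in series[:10]:
        print("%s -> %s" % (series_name, series_link))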
def get_all_movies_links_thread(tot):
	
	#print 'inside a thread@@@@@@@@@@@@@!!!!!!!!!!!!'
	counter=0
	for url in tot:
		#print 'getting from: %s' % url
		counter+=1
		if TEST:
			if counter==2:
				break
		data=opener.fetch(url)['data']
		#~ url_to = '%s.html'%i
		#~ f=open(url_to,'w')
		#~ f.write(data)
		#~ return
		soup=BeautifulSoup(data,'lxml')
		l=soup.find_all('a')
		reg=re.compile(r'.*/watch-\d+-(.*)')
		for i in l:
			if not i.has_key('href'):
				continue;
			if not i.has_key('title'):
				continue;
			link =i.get('href')
			m=reg.match(link)
			if m:
				movie_link="http://www.1channel.ch"+m.group(0)
				if not movie_link in all_movies:
					#~ print 'New Link found: %s' % movie_link
					all_movies.append(movie_link)
def get_all_series_links(pages):
	(all_series, counter)=([], 0)

	for url in pages:
		counter+=1
		if TEST:
			if counter==2:
				break
		data=opener.fetch(url)['data']
		#~ url_to = '%s.html'%i
		#~ f=open(url_to,'w')
		#~ f.write(data)
		#~ return
		soup=BeautifulSoup(data)
		l=soup.find_all('a')
		reg=re.compile(r'.*/watch-\d+-(.*)')
		for i in l:
			if not i.has_key('href'):
				continue;
			if not i.has_key('title'):
				continue;
			link =i.get('href')
			m=reg.match(link)
			if m:
				series_name=i.get('title')
				if 'Watch' in series_name:
					series_name=series_name[6:-7].strip()
				series_link="http://www.1channel.ch"+m.group(0)
				all_series.append((series_name, series_link))

	return all_series
def i_have_got_movies_url((url,con)):
	
	data=opener.fetch(url)['data']
	soup=BeautifulSoup(data, 'lxml')
	
	released_date=datetime.today()
	
	try:
		l=soup.select('.movie_info > table ')
		l=(l[0].find_all('tr'))
		l=l[1].find_all('td')[1].text
		released_date=datetime.strptime(l,'%B %d, %Y')
		if released_date.year < 1900:
			released_date = datetime.today()
			
	except:
		pass
	
	imdb_id="-1"
	try:
		imdb_link=soup.select('.mlink_imdb')[0].find_all('a')
		imdb_link=imdb_link[0].get('href')
		if re.search(r'\d+', imdb_link):
			imdb_id = re.search(r'\d+', imdb_link).group(0)
	except:
		pass
		
	try:
		a=soup.findAll(attrs={"property":"og:title"})
		name = a[0]['content']
		if len(name) == 0:			
			print 'name length is zero : %s Url: %s' % (name, url)
			return
	except:
		# without og:title there is no movie name to work with, so bail out
		return
	
	movie_id=get_movies_id_in_database(name,imdb_id,released_date,con)
	if not movie_id:
		return
	l=soup.find_all('a')
	reg=re.compile(r'.*?url=(.+?)&domain.*')
	reg2=re.compile(r'.*external.php.*')
	
	for i in l:
		if not i.has_key('href'):
			continue
		ref=i['href']
		parsed=urlparse(ref)
		try:
			t1=parsed[2]
			if not reg2.match(t1):
				continue
			m=reg.match(parsed[4])
			final_url=standard_b64decode(m.group(1))
			insert_into_links_table(movie_id,final_url, con)
		except:
			pass
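# The link-extraction loop above relies on the site's external.php redirector,
# whose url= query parameter carries a base64-encoded target. A self-contained
# illustration with a made-up href (the encoded value decodes to "http://example.com"):
import re
from base64 import standard_b64decode
try:
    from urlparse import urlparse          # Python 2
except ImportError:
    from urllib.parse import urlparse      # Python 3

href = "/external.php?title=Example&url=aHR0cDovL2V4YW1wbGUuY29t&domain=example.com"
reg = re.compile(r'.*?url=(.+?)&domain.*')
reg2 = re.compile(r'.*external.php.*')

parsed = urlparse(href)
if reg2.match(parsed[2]):                  # parsed[2] is the path
    m = reg.match(parsed[4])               # parsed[4] is the query string
    print(standard_b64decode(m.group(1)))  # -> http://example.com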
def get_put_unique_eps(url='show.html'):
	#~ print url
	data=opener.fetch(url)['data']
	soup = BeautifulSoup(data, 'lxml')
	title= get_title(soup)
	if len(title) == 0:
		#~ print 'title is null returning'
		return
	
	released_date=datetime.today()
	
	try:
		l=soup.select('.movie_info > table ')
		l=(l[0].find_all('tr'))
		l=l[1].find_all('td')[1].text
		released_date=datetime.strptime(l,'%B %d, %Y')
	except:
		pass
	
	imdb_id="-1"
	try:
		imdb_link=soup.select('.mlink_imdb')[0].find_all('a')
		imdb_link=imdb_link[0].get('href')
		if re.search(r'\d+', imdb_link):
			imdb_id = re.search(r'\d+', imdb_link).group(0)
	except:
		pass
	
	series_id = get_series_id_in_database(title, imdb_id, released_date, con)
	all_eps=soup(attrs={'class':'tv_episode_item'})

	
	# getting all eps except the transparent one 		
	all_eps[:] = [base+x('a')[0].get('href') if not 'transp2' in x['class'] else None for x in all_eps ]

	all_eps = list(set(all_eps))
	if None in all_eps:
		all_eps.remove(None)

	cur=con.cursor(mdb.cursors.DictCursor)
	for link in all_eps:
		#~ print 'episode link: %s' % link
		matches = re.search(r'season-(\d+)-episode-(\d+)', link)
		season = int(matches.group(1))
		episode = int(matches.group(2))		
		cur.execute("SELECT * FROM `vs_series_links` WHERE `series_id`=%s and `season`=%s and `episode`=%s" ,
		(series_id, season, episode))
		if cur.fetchone():
			continue
		#~ print 'Inserting new Episode: series: %s season: %s episode: %s' % (series_id, season, episode)
		i_have_got_series_episode_url(title, series_id, link, season, episode, con)
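# Quick illustration of the season/episode pattern used above, on a made-up
# episode URL in the shape the scraper expects:
import re
link = "http://www.1channel.ch/tv-123-Some-Show/season-2-episode-5"
m = re.search(r'season-(\d+)-episode-(\d+)', link)
print("season %d episode %d" % (int(m.group(1)), int(m.group(2))))   # -> season 2 episode 5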
def get_page_count_and_go_deeper(url):
	data=opener.fetch(url)['data']
	soup=BeautifulSoup(data)
	l=soup.select('.pagination > a ')
	ref = l[len(l)-1]['href']
	reg=re.compile(r'.*?page=(\d+).*?')
	page_count=1
	m=reg.match(ref)
	if m:
		page_count=int(m.group(1))
	
	for i in range(1,page_count+1):
		new_url=url+"&page="+str(i)
		i_have_got_page_number(new_url)
def initiator():
	tv_url = 'http://www.1channel.ch/?tv'
	featured = 'http://www.1channel.ch/index.php?sort=featured'
	data=opener.fetch(tv_url)['data']
	soup = BeautifulSoup(data, 'lxml')

	
	#Inserting latest shows
	latest_shows =  get_latest_ones(soup)
	#~ print latest_shows
	for show in latest_shows:
		get_put_unique_eps(show)
		
	# episodes prime times
	eps =[]	
	eps = fetch_prime_time_episodes(soup)
	for epi in eps:
		put_prime_time_eps(epi)
		
	#insert movies
	del data, soup
	data=opener.fetch(base)['data']
	soup = BeautifulSoup(data, 'lxml')
	latest_movies = get_latest_ones(soup)

	
	#check featured
	del data, soup
	data=opener.fetch(featured)['data']
	soup = BeautifulSoup(data, 'lxml')
	featured_movs = get_latest_ones(soup)
	
	to_parse = set(latest_movies+featured_movs)
	#~ print 'Parsing movies: %s ' % len(to_parse)
	
	for url in to_parse:
		i_have_got_movies_url(url)
Example #12
def generate_main_pages_thread(url):
	print url
	page_count=1
	data=opener.fetch(url)['data']
	soup=BeautifulSoup(data)
	l=soup.select('.pagination > a ')		

	if len(l) != 0:
		ref = l[len(l)-1]['href']
		reg=re.compile(r'.*?page=(\d+)')
		#~ print url, l[len(l)-1]['href']
		m=reg.match(ref)
		if m:
			page_count=int(m.group(1))
			#~ print page_count
	for i in range(1,page_count+1):
		tot.append(url+"=&page="+str(i))
def i_have_got_page_number(url):
	
	data=opener.fetch(url)['data']
	soup=BeautifulSoup(data)
	l=soup.find_all('a')
	reg=re.compile(r'.*/watch-\d+-(.*)')
	for i in l:
		if not i.has_key('href'):
			continue;
		if not i.has_key('title'):
			continue;
		link =i.get('href')
		m=reg.match(link)
		if m:
			series_name=i.get('title')
			series_link="http://www.1channel.ch"+m.group(0)
			i_have_got_series_name(series_link,series_name)
def put_prime_time_eps(link):
	data=opener.fetch(link)['data']
	soup = BeautifulSoup(data, 'lxml')
	title = get_title(soup)
	if len(title) == 0:
		return
		
	series_id = get_series_id(title)
	if not series_id:
		series_link = base + soup(attrs = {'class':'titles'})[1]('a')[0]['href']
		get_put_unique_eps(series_link)
		return
		
	 
	matches = re.search(r'season-(\d+)-episode-(\d+)', link)
	season = int(matches.group(1))
	episode = int(matches.group(2))
	i_have_got_series_episode_url(title, series_id, link, season, episode, con)
Example #15
def i_have_got_series_link(url, con):
    # ~ print 'Series:%s#%s' %(name,url)
    data = opener.fetch(url)["data"]
    soup = BeautifulSoup(data, "lxml")
    name = get_title(soup)
    if len(name) == 0:
        return
    released_date = datetime.today()

    try:
        l = soup.select(".movie_info > table ")
        l = l[0].find_all("tr")
        l = l[1].find_all("td")[1].text
        released_date = datetime.strptime(l, "%B %d, %Y")
    except:
        pass

    imdb_id = "-1"
    try:
        imdb_link = soup.select(".mlink_imdb")[0].find_all("a")
        imdb_link = imdb_link[0].get("href")
        if re.search(r"\d+", imdb_link):
            imdb_id = re.search(r"\d+", imdb_link).group(0)
    except:
        pass

    series_id_in_database = get_series_id_in_database(name, imdb_id, released_date, con)

    # getting all episodes
    all_eps = soup(attrs={"class": "tv_episode_item"})

    # getting all eps except the transparent one
    all_eps[:] = [base + x("a")[0].get("href") if not "transp2" in x["class"] else None for x in all_eps]

    all_eps = list(set(all_eps))
    if None in all_eps:
        all_eps.remove(None)

    for ep_link in all_eps:
        matches = re.search(r"season-(\d+)-episode-(\d+)", ep_link)
        season = int(matches.group(1))
        episode = int(matches.group(2))
        i_have_got_series_episode_url(name, series_id_in_database, ep_link, season, episode, con)
def i_have_got_series_name((url,name,con)):
	#~ print 'Series:%s#%s' %(name,url)
	data=opener.fetch(url)['data']
	soup=BeautifulSoup(data)

	released_date=datetime.today()

	try:
		l=soup.select('.movie_info > table ')
		l=(l[0].find_all('tr'))
		l=l[1].find_all('td')[1].text
		released_date=datetime.strptime(l,'%B %d, %Y')
	except:
		pass

	imdb_link="-1"
	try:
		imdb_link=soup.select('.mlink_imdb')[0].find_all('a')
		imdb_link=imdb_link[0].get('href')
	except:
		pass

	series_id_in_database=get_series_id_in_database(name,imdb_link,released_date,con)

	l=soup.find_all('a')
	t1=url.replace('http://www.1channel.ch/watch','tv')
	t1='/'+t1+r"/season-(\d+)-episode-(\d+).*"
	reg=re.compile(t1)

	for i in l:
		if not i.has_key('href'):
			continue
		m=reg.match(i.get('href'))
		if m:
			episode_link="http://www.1channel.ch"+m.group(0)
			season=m.group(1)
			episode=m.group(2)
			i_have_got_series_episode_url(name,series_id_in_database,episode_link,season,episode,con)
def i_have_got_series_episode_url(name,series_id,url,season,episode,con):
	print '\n\nName: %s\nSeason: %s Episode: %s' % (name, season, episode)
	data=opener.fetch(url)['data']
	soup=BeautifulSoup(data)
	l=soup.find_all('a')
	reg=re.compile(r'.*?url=(.+?)&domain.*')
	reg2=re.compile(r'.*external.php.*')

	for i in l:
		if not i.has_key('href'):
			continue
		ref=i['href']
		parsed=urlparse(ref)
		try:
			t1=parsed[2]
			if not reg2.match(t1):
				continue
			m=reg.match(parsed[4])
			final_url=standard_b64decode(m.group(1))
			set_season_episode(series_id,final_url,season,episode,con)
		except:
			pass
def i_have_got_page_number(url):

	data=opener.fetch(url)['data']
	#~ url_to = '%s.html'%i
	#~ f=open(url_to,'w')
	#~ f.write(data)
	#~ return
	soup=BeautifulSoup(data)
	l=soup.find_all('a')
	reg=re.compile(r'.*/watch-\d+-(.*)')
	for i in l:
		if not i.has_key('href'):
			continue;
		if not i.has_key('title'):
			continue;
		link =i.get('href')
		m=reg.match(link)
		if m:
			series_name=i.get('title')
			if 'Watch' in series_name:
				series_name=series_name[6:-7].strip()
			series_link="http://www.1channel.ch"+m.group(0)
			i_have_got_series_name(series_link,series_name)
Example #19
def i_have_got_series_episode_url(name, series_id, url, season, episode, con):
    # print '\n\nName: %s\nSeason: %s .Episode: %s' %(name,season,episode)
    # data=urllib2.open(url).read()

    data = opener.fetch(url)["data"]
    soup = BeautifulSoup(data, "lxml")
    l = soup.find_all("a")
    reg = re.compile(r".*?url=(.+?)&domain.*")
    reg2 = re.compile(r".*external.php.*")

    for i in l:
        if not i.has_key("href"):
            continue
        ref = i["href"]
        parsed = urlparse(ref)
        try:
            t1 = parsed[2]
            if not reg2.match(t1):
                continue
            m = reg.match(parsed[4])
            final_url = standard_b64decode(m.group(1))
            set_season_episode(series_id, final_url, season, episode, con)
        except:
            pass
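# Hypothetical driver for the *_thread helpers above: split the page URLs into
# chunks and let each thread append results into the shared module-level lists
# (list.append is atomic under CPython's GIL, so no extra locking is added here).
import threading

def run_series_threads(pages, n_threads=4):
    chunks = [pages[i::n_threads] for i in range(n_threads)]
    threads = [threading.Thread(target=get_all_series_links_thread, args=(chunk,))
               for chunk in chunks if chunk]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return all_series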