コード例 #1
0
ファイル: task10.py プロジェクト: virusaman/IMDB-website
def analyse_language_and_directors():
    """Count, for every director in the top-250 list, how many of that
    director's top-250 movies are in each language.

    Returns:
        dict mapping director name -> {language: movie_count}.

    Fix: the original called scrape_top_list() (a full re-scrape) inside
    both nested loops — 250 * 250 times — printed intermediate state, and
    returned None. The list is now scraped and parsed exactly once and the
    result is returned.
    """
    # Parse each movie's detail record once, up front.
    top_list = scrape_top_list()
    movies = [json.loads(find_position(movie)) for movie in top_list[:250]]

    directors_details = {}
    for movie in movies:
        for director in movie['Director']:
            if director in directors_details:
                # Already tallied (the original recomputed the same result).
                continue
            languages = {}
            for other in movies:
                if director in other['Director']:
                    for lang in other['Language']:
                        languages[lang] = languages.get(lang, 0) + 1
            directors_details[director] = languages
    return directors_details
コード例 #2
0
def movies_json():
    """Scrape details for every top-list movie and save each movie to its
    own <imdb-id>.json file.

    Fix: corrected the "succss" typo in the progress message and replaced
    the easily-misread single-letter locals (`o`, `l`, `m`).
    """
    movies = scrape_top_list()
    for movie in movies:
        url = movie["url"]
        details = scrape_movie_details(url)

        # Characters 27-35 of the URL hold the IMDb title id (e.g. tt0111161)
        # — assumes the canonical IMDb URL shape; TODO confirm.
        imdb_id = url[27:36]
        with open(imdb_id + ".json", "w+") as json_file:
            json.dump(details, json_file)
        print("success")
コード例 #3
0
def analyse_movies_genres():
    """Count how often each genre appears across the top 250 movies.

    Returns:
        dict mapping genre name -> occurrence count.

    Fix: the original called scrape_top_list() (a full re-scrape) on every
    one of the 250 loop iterations; it is now scraped once.
    """
    top_list = scrape_top_list()
    genres_count = {}
    for movie in top_list[:250]:
        movie_details = json.loads(find_position(movie))
        for genre in movie_details['Genres']:
            genres_count[genre] = genres_count.get(genre, 0) + 1
    return genres_count
コード例 #4
0
def get_movies_list_details():
    """Return the cast lists of the first five top-list movies, caching
    the result in movies_list.json so later calls skip re-scraping.

    Returns:
        list of cast lists (one per movie).

    Fix: the original re-ran scrape_top_list() on each of the five loop
    iterations and bound json.dump's None return to an unused variable.
    """
    if os.path.exists('movies_list.json'):
        with open('movies_list.json', 'r') as file:
            return json.load(file)

    # Cache miss: scrape the top list once, then fetch each movie's cast.
    top_list = scrape_top_list()
    cast_lists = [scrape_movies_details(movie)['cast'] for movie in top_list[:5]]

    with open('movies_list.json', 'w+') as file:
        json.dump(cast_lists, file)
    return cast_lists
コード例 #5
0
ファイル: task9.py プロジェクト: patel-deepak506/Imdb_scrape
def json_data():
    """Scrape details for every top-list movie, writing one JSON file per
    movie plus a combined all_movies_details.json.

    Appends each movie's details to the module-level `main_list`.

    Fix: the combined file was rewritten from scratch on every loop pass
    (and its handle shadowed this function's name); it is now dumped once
    after the loop. Note this drops the incidental partial-progress saves
    the per-iteration rewrite provided.
    """
    top_list = scrape_top_list()
    for movie in top_list:
        url = movie["url"]
        # Be polite to the server: random 1-4 second pause between requests.
        time.sleep(random.randint(1, 4))
        details = scrape_movie_details(url)
        main_list.append(details)

        # Characters 27-35 of the URL hold the IMDb title id — assumes the
        # canonical IMDb URL shape; TODO confirm.
        imdb_id = url[27:36]
        with open(imdb_id + ".json", "w+") as json_file:
            json.dump(details, json_file)
        print("success")

    with open("all_movies_details.json", 'w+') as combined_file:
        json.dump(main_list, combined_file)
コード例 #6
0
########################## task 7 ################################:
import requests, json
from pprint import pprint
from bs4 import BeautifulSoup
from task5 import get_movie_list_details
from task1 import scrape_top_list


def analyse_movies_directors(movie_dirceter):
    """Tally movies per director entry.

    Each movie's "director" list is concatenated into one string and that
    string is used as the tally key (so co-directed movies count under the
    joined names, matching the original behavior).

    Returns:
        dict mapping joined director name(s) -> movie count.
    """
    dirceters = {}
    for movie in movie_dirceter:
        key = "".join(movie["director"])
        dirceters[key] = dirceters.get(key, 0) + 1
    return dirceters


# Script entry: tally directors for the first five top-list movies and
# pretty-print the result. NOTE(review): runs at import time; an
# `if __name__ == "__main__":` guard would be safer.
dirceter_analyse = analyse_movies_directors(
    get_movie_list_details(scrape_top_list()[0:5]))
pprint(dirceter_analyse)
コード例 #7
0
from task1 import scrape_top_list
from pprint import pprint


def group_by_decade(movies):
    """Group movie dicts by decade.

    Args:
        movies: iterable of dicts each carrying an integer "year".

    Returns:
        dict mapping decade start year (e.g. 1990) -> list of movies.

    Fix: the original's `else` branch was commented out, so every decade
    silently kept only its first movie; all movies are now appended, as
    the commented-out code clearly intended.
    """
    years = {}
    for movie in movies:
        # Integer arithmetic replaces the original str(year)[0:3] + "0"
        # trick, which only worked for 4-digit years.
        decade = (movie["year"] // 10) * 10
        years.setdefault(decade, []).append(movie)
    return years


# Script entry: group the scraped top list by decade and pretty-print it.
byDecade = group_by_decade(scrape_top_list())
pprint(byDecade)
コード例 #8
0
from task1 import scrape_top_list
import requests
import json
from bs4 import BeautifulSoup
from pprint import pprint

# Module-level state shared by group_by_year (top list scraped at import).
movies_name = scrape_top_list()

year_1 = []  # every movie's year, in list order (may contain duplicates)
year_2 = []  # de-duplicated years


def group_by_year():
    # Collect each movie's year into year_1.
    i = 0
    while i < len(movies_name):
        year = movies_name[i]["year"]
        year_1.append(year)
        i = i + 1
    # Keep only the first occurrence of each year in year_2.
    j = 0
    while j < len(year_1):
        if year_1[j] not in year_2:
            year_2.append(year_1[j])
        j = j + 1
    # Sort year_2 ascending by pairwise swapping.
    # NOTE(review): this snippet is truncated — as shown, neither k nor l
    # is ever incremented, so the loops below would not terminate; the
    # original file presumably continued past this point.
    k = 0
    while k < len(year_2):
        l = 0
        while l < len(year_2):
            if year_2[k] < year_2[l]:
                a = year_2[k]
                year_2[k] = year_2[l]
                year_2[l] = a
コード例 #9
0
ファイル: task10.py プロジェクト: madhu20336/web-scraping
        # --- Truncated fragment: body of a per-movie loop; the enclosing
        # function header is not visible in this chunk. ---
        movie_url = requests.get(movie_api)
        soup = BeautifulSoup(movie_url.text, "html.parser")
        # First link inside the credit-summary block — presumably the
        # director credit; TODO confirm against the page layout.
        director_name = soup.find("div",
                                  class_="credit_summary_item").a.get_text()
        director.append(director_name)
        bio = soup.find("div", class_="plot_summary")
        movie_bio = bio.find("div", class_="summary_text").get_text().strip()
        detail = soup.find("div",
                           attrs={
                               "class": "article",
                               "id": "titleDetails"
                           })
        div1 = detail.find_all("div")
        for i in div1:
            run = i.find_all("h4")
            for j in run:
                if "Language:" in j:
                    # Collect every language link under this heading.
                    lan = i.find_all("a")
                    for lang_uage in lan:
                        movie_language = lang_uage.get_text()
                        language.append(movie_language)
        detail_mov["director"] = director
        detail_mov["language"] = language
        List.append(detail_mov.copy())
        # NOTE(review): the JSON file is rewritten on every loop pass;
        # dumping once after the loop would be cheaper.
        with open("get_movie_list_details.json", "w") as movie:
            json.dump(List, movie, indent=4)
    return List


top_movie_list = analyse_language_and_directors(scrape_top_list())
コード例 #10
0
from task1 import scrape_top_list
import requests
import json
from bs4 import BeautifulSoup 
from pprint import pprint

# NOTE(review): `list` shadows the builtin; a module-wide rename would be safer.
list = scrape_top_list()
year = []  # filled with every movie's year by group_by_decade()
def group_by_decade():
    """Group the module-level `list` of movies by decade and dump the
    grouping to " decade.json" (leading space kept from the original
    filename, which existing consumers may rely on).

    Side effect: also fills the module-level `year` list with every
    movie's year and sorts it, preserving the original behavior.

    Fix: the original rewrote the JSON file on every outer-loop pass,
    assigned the dict entry inside the inner loop, and recomputed the
    same decade bucket once per duplicate year; one pass now suffices.
    """
    for movie in list:
        year.append(movie["year"])
    year.sort()

    decades = {}
    for movie in list:
        decade = (movie["year"] // 10) * 10
        decades.setdefault(decade, []).append(movie)

    # Write the file exactly once, after the grouping is complete.
    with open(" decade.json", "w") as saral_data3:
        json.dump(decades, saral_data3, indent=4)
# Script entry: build and persist the decade grouping.
group_by_decade()

コード例 #11
0
		# --- Truncated fragment: tail of a per-movie scraping loop; the
		# enclosing function header is not visible in this chunk. ---
		poster_image_url=soup.find("div",class_="poster").a["href"]
		poster_image = "https://www.imdb.com" + poster_image_url

		sub_div = soup.find("div",class_="article",id="titleDetails")
		details =sub_div.find_all("div",class_="txt-block")
		a=[]
		# NOTE(review): if no block mentions "Language"/"Country", the two
		# variables below stay unbound and the dict assignments would raise
		# NameError.
		for i in details:
			if "Language"in i.text:
				language=i.find("a").text


			if "Country" in i.text:
				country = i.find("a").text

		movie_details_dict = {"name": "","Director":" ","bio":"","runtime":"","gener":"","poster_image_url":"","country":" ","language":""}	
		# movie_details_dict={}
		movie_details_dict["name"] = movie_name
		movie_details_dict["Director"] = Director
		movie_details_dict["bio"] = movie_bio
		movie_details_dict["runtime"] = movie_time
		movie_details_dict["gener"] = genre
		movie_details_dict["poster_image_url"] = poster_image
		movie_details_dict["country"] = country
		movie_details_dict["language"] = language
		movie_details_list.append(movie_details_dict)

	return (movie_details_list)

movie_list=scrape_top_list()
if __name__ == "__main__":
	pprint(scrape_movie(movie_list[0:10]))
コード例 #12
0
ファイル: task3.py プロジェクト: virusaman/IMDB-website
import pprint, string
from task1 import scrape_top_list


def group_by_decades(scrape_top_list):
    """Group movie dicts by decade.

    Args:
        scrape_top_list: iterable of dicts each carrying an integer 'year'
            (parameter name kept for interface compatibility, though it
            shadows the imported function).

    Returns:
        dict mapping decade start year (e.g. 1990) -> list of movies.

    Fix: the original's if/else branches were identical apart from the
    list creation, and the decade was computed via an int -> str -> int
    round-trip; setdefault plus integer arithmetic does the same work.
    """
    group = {}
    for movie in scrape_top_list:
        decade = (movie['year'] // 10) * 10
        group.setdefault(decade, []).append(movie)
    return group


# Script entry: group the scraped top list by decade and pretty-print it.
pprint.pprint(group_by_decades(scrape_top_list()))
コード例 #13
0
import requests
from task1 import scrape_top_list
import json
from pprint import pprint
from bs4 import BeautifulSoup

# Top-250 list scraped once at import; indexed by the user's input below.
name1 = scrape_top_list()
def scrape_movie_cast():
    """Prompt for a top-list index, scrape that movie's cast table, save
    it to artist_name.json and return it.

    Returns:
        list of {"artist": <name>, "imbd_id": <id>} dicts. The misspelled
        "imbd_id" key is kept byte-for-byte: existing JSON consumers may
        depend on it.

    Fix: the original bound the id to a local named `id`, shadowing the
    builtin.
    """
    movie_number = int(input("Enter the movie number: "))
    cast_url = requests.get(name1[movie_number]["url"])
    soup = BeautifulSoup(cast_url.text, "html.parser")
    cast_table = soup.find("table", "cast_list")

    cast_list = []
    for cell in cast_table.find_all("td", class_=""):
        # href looks like /name/nm0000209/... — chars 6-14 are the nm-id
        # (assumes that URL shape; TODO confirm).
        cast_list.append({
            "artist": cell.a.get_text().strip(),
            "imbd_id": cell.a["href"][6:15],
        })

    with open("artist_name.json", "w") as saral:
        json.dump(cast_list, saral, indent=4)
    return cast_list
# Script entry: interactively scrape one movie's cast.
scrape_movie_cast()
コード例 #14
0
        # --- Truncated fragment: tail of a per-movie scraping loop; the
        # enclosing function header is not visible in this chunk. ---
        # print(Movie)
        final_data = {
            'Movie': '',
            'Director': '',
            'Country': '',
            'Language': '',
            'Poster': '',
            'Runtime': '',
            'Genres': ''
        }
        final_data['Movie'] = Movie.strip()

        final_data['Director'] = dir_list
        final_data['Country'] = data_country
        final_data['Language'] = Language
        final_data['Poster'] = poster_url
        final_data['Runtime'] = new_time
        final_data['Genres'] = Genres

        # Add task 13 in task 4
        final_data['cast'] = Scrape_movie_cast(url)
        # print(final_data)
        with open(url_movie['Name'] + '.json', 'w+') as file:
            file_new = json.dump(final_data, file)

            # NOTE(review): returning inside the `with` (and inside the
            # loop) means only the first movie is processed — likely a bug.
            return final_data


pprint(scrape_movies_details(scrape_top_list()[0]))
# scrape_movies_details(value)
コード例 #15
0
ファイル: task6.py プロジェクト: virusaman/IMDB-website
from tast5 import get_movies_list_details
from task1 import scrape_top_list


def Analyse_movies_language(movie_list):
    """Count how many movies list each language.

    Args:
        movie_list: iterable of dicts each carrying a 'Language' list.

    Returns:
        dict mapping language -> number of movies listing it.
    """
    tally = {}
    for movie in movie_list:
        for language in movie['Language']:
            tally[language] = tally.get(language, 0) + 1
    return tally


# Script entry: tally languages for the first ten top-list movies.
print(Analyse_movies_language(get_movies_list_details(
    scrape_top_list()[0:10])))
コード例 #16
0
import json, requests
from pprint import pprint
from bs4 import BeautifulSoup
from task1 import scrape_top_list

url = scrape_top_list()


def scrape_movie_cast():
    """For every movie in the module-level `url` list, scrape the
    full-credits cast table into {"imdb_id", "name"} dicts.

    NOTE(review): this snippet is truncated — the final `with` statement
    has no body here. Also, `main_list` is re-created for each movie, so
    only the last movie's cast would survive the loop.
    """
    for i in url:
        link = (i["url"])
        link_list = (link + "fullcredits?ref_=tt_cl_sm#cast")
        page = requests.get(link_list)
        soup = BeautifulSoup(page.text, "html.parser")
        main_div = soup.find("div", class_="article listo")
        table = main_div.find("table", class_="cast_list")
        table_data = table.find_all("td", class_="")

        main_list = []
        for data in table_data:
            cast_name = (data.text.strip())
            link_name = (data.find("a").get("href"))
            # href looks like /name/nm.../ — chars 6-14 are the nm-id.
            imdb_id = (link_name[6:15])
            dct = {"imdb_id": "", "name": ""}
            dct["imdb_id"] = imdb_id
            dct["name"] = cast_name
            main_list.append(dct)
            dum = json.dumps(main_list)
            # print(main_list)

        with open("main_list.json", "w+") as json_data:
コード例 #17
0
import json, requests
from pprint import pprint
from bs4 import BeautifulSoup
from task4 import scrape_movie_details
from task1 import scrape_top_list

# Scrape the top list at import time and echo it (debug output).
data = scrape_top_list()
print(data)


def movies_json():
    """Scrape details for every top-list movie and save each movie to its
    own <imdb-id>.json file.

    Fix: corrected the "succss" typo in the progress message and replaced
    the easily-misread single-letter locals (`o`, `l`, `m`).
    """
    movies = scrape_top_list()
    for movie in movies:
        url = movie["url"]
        details = scrape_movie_details(url)

        # Characters 27-35 of the URL hold the IMDb title id (e.g. tt0111161)
        # — assumes the canonical IMDb URL shape; TODO confirm.
        imdb_id = url[27:36]
        with open(imdb_id + ".json", "w+") as json_file:
            json.dump(details, json_file)
        print("success")


# Script entry: run the per-movie scrape/dump pass.
movies_json()
コード例 #18
0
from task8 import find_position
import os ,json,requests,time,pprint
from task1 import scrape_top_list
from task4 import scrape_movies_details


# Print each of the 250 top movies' records, pausing 5 seconds between
# iterations. NOTE(review): scrape_top_list() is re-scraped on every
# iteration; hoist it into a variable before the loop.
for i in range(250):
	pprint.pprint(find_position(scrape_top_list()[i]))
	time.sleep(5)





コード例 #19
0
    # --- Truncated fragment: tail of scrape_movie_details; the function
    # header is not visible in this chunk. ---
    div = soup.find('div', class_='article', id='titleDetails')
    div1 = div.find_all("div", class_="txt-block")
    for i in div1:
        try:
            if i.h4.text == "Country:":
                country = (i.a.text)
                # print (country)
            elif i.h4.text == "Language:":
                language_a = i.find_all('a')
                language_lis = []
                for j in language_a:
                    language = ""
                    language += j.text
                    language_lis.append(language)
                # print(language_lis)
        except AttributeError:
            # Blocks with no <h4>/<a> child are skipped deliberately.
            continue
    # NOTE(review): if no Country/Language block matched, `country` /
    # `language_lis` are unbound and the assignments below raise NameError.
    dic = {}
    dic["movie_name"] = movie_name
    dic["director"] = director
    dic["country"] = country
    dic["language"] = language_lis
    dic["poster_url"] = poster_link
    dic["bio"] = movie_bio
    dic["rumtime"] = movie_runtime
    dic["gener"] = gener
    return dic


movieData = scrape_movie_details(scrape_top_list()[0]["url"])
# pprint(movieData)
コード例 #20
0
ファイル: task7.py プロジェクト: virusaman/IMDB-website
from tast5 import get_movies_list_details
from task1 import scrape_top_list


def Analyse_movies_directors(movie_list):
    """Count how many movies credit each director.

    Args:
        movie_list: iterable of dicts each carrying a 'Director' list.

    Returns:
        dict mapping director name -> number of movies crediting them.
    """
    counts = {}
    for movie in movie_list:
        for name in movie['Director']:
            counts[name] = counts.get(name, 0) + 1
    return counts


# Script entry: tally directors for the first five top-list movies.
print(Analyse_movies_directors(get_movies_list_details(
    scrape_top_list()[0:5])))
コード例 #21
0
import requests,pprint,os,json
from bs4 import BeautifulSoup
from task5 import get_movie_list_details
from task1 import scrape_top_list
from pprint import pprint

def analyse_movies_language(movie_list):
	"""Count how many movies list each language.

	Args:
	    movie_list: iterable of dicts each carrying a "language" list.

	Returns:
	    dict mapping language -> number of movies listing it.

	Fix: the counting and return logic was left commented out, so the
	original only printed each language and returned None; the
	commented-out code is restored here (debug print dropped).
	"""
	languages = {}
	for movie in movie_list:
		for lang in movie["language"]:
			if lang not in languages:
				languages[lang] = 1
			else:
				languages[lang] += 1
	return languages

# Script entry: analyse languages for the first five top-list movies.
language_analysis = analyse_movies_language(get_movie_list_details(scrape_top_list()[0:5]))
# pprint(language_analysis)


コード例 #22
0
ファイル: task4.py プロジェクト: madhu20336/web-scraping
    # --- Truncated fragment: tail of scrape_movie_details; the function
    # header is not visible in this chunk. ---
    runtime = time.find("time").get_text().strip()
    # Assumes runtime text like "2h 22min": runtime[0] is the hour digit
    # (single-digit hours only — TODO confirm) and runtime[3:] the minutes.
    hour_to_min = (int(runtime[0])) * 60
    i = 0
    mins = ""
    a = runtime[3:]
    while i < len(a):
        if a[i] == "m":
            break
        mins = mins + a[i]
        i = i + 1
    runtime_of_movie = hour_to_min + int(mins)
    movie_genre = time.find_all("a")
    movie_genre.pop()  # drop the last link — presumably not a genre; confirm
    for i in movie_genre:
        genre_1 = i.get_text()
        genre.append(genre_1)
    detail_mov["movie_name"] = list[movie_number - 1]["name"]
    detail_mov["director"] = director
    detail_mov["country"] = "India"  # hard-coded, not scraped
    detail_mov["poster_url"] = movie_poster
    detail_mov["language"] = language
    detail_mov["movie_bio"] = movie_bio
    detail_mov["runtime"] = runtime_of_movie
    detail_mov["movie_genre"] = genre
    # NOTE(review): `movie_number` is rebound to the file handle here,
    # shadowing the earlier index variable.
    with open("movie_details.json", "w") as movie_number:
        json.dump(detail_mov, movie_number, indent=4)
    return (detail_mov)


scrape_movie_details(scrape_top_list())
コード例 #23
0
        # --- Truncated fragment: body of a per-movie loop inside
        # get_movie_details; the function header is not visible here. ---
        i = 0
        mins = ""
        # Assumes runtime text like "2h 22min"; `runtime` and `hour_to_min`
        # are set above this fragment — TODO confirm.
        a = runtime[3:]
        while i < len(a):
            if a[i] == "m":
                break
            mins = mins + a[i]
            i = i + 1
        runtime_of_movie = hour_to_min + int(mins)
        movie_genre = time.find_all("a")
        movie_genre.pop()  # drop the last link — presumably not a genre; confirm
        for i in movie_genre:
            genre_1 = i.get_text()
            genre.append(genre_1)
        # print(genre_1)
        detail_mov["movie_name"] = name_2["name"]
        detail_mov["director"] = director
        detail_mov["country"] = "India"  # hard-coded, not scraped
        detail_mov["poster_url"] = movie_poster
        detail_mov["language"] = language
        detail_mov["movie_bio"] = movie_bio
        detail_mov["runtime"] = runtime_of_movie
        detail_mov["movie_genre"] = genre
        List.append(detail_mov.copy())
        # NOTE(review): the JSON file is rewritten on every loop pass;
        # dumping once after the loop would be cheaper.
        with open("10_movie_details.json", "w") as movie:
            json.dump(List, movie, indent=4)
    return List


top_movie_list = get_movie_details(scrape_top_list())