/
film_scrape2.py
89 lines (78 loc) · 3.12 KB
/
film_scrape2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 4 22:20:13 2016
@author: Sarick Shah
"""
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
import re
from collections import defaultdict
from pandas import Series, DataFrame
import numpy as np
from bs4 import BeautifulSoup
import re
import sys
sys.setrecursionlimit(20000)
import unidecode
import unicodedata
#%%
def solver(list_of_variables, mpaaRating):
    """Pick the value that follows each label in a flat list of cell texts.

    For every label in ``list_of_variables`` the label is located in
    ``mpaaRating`` with ``.index()`` and the neighbouring entry is taken as
    its value.

    Args:
        list_of_variables: Labels to look up, in the order the results
            should be returned.
        mpaaRating: Flat sequence of cell texts in which each label is
            expected to be followed by its value.

    Returns:
        A list with one entry per label, in the same order:

        * for ``'Rotten Tomatoes'`` the slice of (up to) two entries that
          starts three positions after the label;
        * for any other label the single entry right after the label;
        * ``None`` when the label is absent, when it is the last entry (so
          there is no following value), or when ``mpaaRating`` has no
          ``index`` method (e.g. it is ``None``).
    """
    values = []
    for label in list_of_variables:
        try:
            position = mpaaRating.index(label)
            if label == 'Rotten Tomatoes':
                # Slicing never raises; a short list simply yields a shorter
                # (possibly empty) slice.
                values.append(mpaaRating[position + 3:position + 5])
            else:
                values.append(mpaaRating[position + 1])
        except (ValueError, IndexError, AttributeError):
            # ValueError: label not present.
            # IndexError: label is the final entry, nothing follows it.
            # AttributeError: mpaaRating does not support .index().
            values.append(None)
    return values
#%%
# Download the index page that lists every movie budget; its raw HTML is kept
# in `main_doc` for txt_link_downloader.
response = urllib2.urlopen('http://www.the-numbers.com/movie/budgets/all')
try:
    main_doc = response.read()
finally:
    # urllib2 responses are not context managers on Python 2, so the
    # connection is released explicitly even if read() fails.
    response.close()
# Number of <td> cells that make up one movie row on the index page.
_CELLS_PER_ROW = 6
# Prefix added to the relative links found on the index page.
_SITE_ROOT = 'http://www.the-numbers.com'
# Labels looked up (via solver) among the table cells of each movie page.
_DETAIL_LABELS = ['Genre:','Running Time:','MPAA Rating:','Production Companies:','Domestic Releases:','Domestic DVD Sales','Domestic Blu-ray Sales','Total Domestic Video Sales','Rotten Tomatoes']
# Heading text used to locate the box-office chart on a movie page.
_BOX_OFFICE_HEADING = re.compile('Weekend Box Office Performance')


def _fetch_html(url):
    """Return the raw body of ``url``, always closing the connection."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _detail_cell_texts(detail_soup):
    """Return the stripped, ASCII-transliterated text of every <td> inside a <tr>."""
    return [unidecode.unidecode(cell.get_text()).strip()
            for table_row in detail_soup.find_all('tr')
            for cell in table_row.find_all('td')]


def _box_office_tokens(detail_soup):
    """Return the whitespace-split box-office chart tokens, or None if absent."""
    heading = detail_soup.find(text=_BOX_OFFICE_HEADING)
    if heading is None or heading.parent is None:
        return None
    section = heading.parent.parent
    if section is None:
        return None
    chart = section.find('div', attrs={"id": "box_office_chart"})
    if chart is None:
        return None
    chart_text = unicodedata.normalize('NFKD', chart.get_text())
    tokens = chart_text.encode('utf-8').split()[4:35]
    # A literal 'None' placeholder is inserted as the fourth token.
    # NOTE(review): presumably this keeps the chart columns aligned - confirm.
    tokens.insert(3, 'None')
    return tokens


def txt_link_downloader(html_link, max_movies=2000):
    """Scrape the movie index page and each linked movie page into a DataFrame.

    The index HTML is split into rows of six ``<td>`` cells. The third cell of
    each row is expected to contain the link to the movie's own page; that
    page is downloaded once and mined for the labelled details and, when
    present, the weekend box-office chart.

    Args:
        html_link: HTML source of the index page (despite the name, this is
            the markup itself, not a URL).
        max_movies: Upper bound on the number of movies to scrape. Defaults
            to 2000 (the index held 5230 entries as of 10/9/2016). Pass
            ``None`` to scrape every row.

    Returns:
        A pandas ``DataFrame`` with one row per scraped movie: the six index
        cell texts, the movie page URL, one value per detail label (``None``
        where missing) and, if the page has a box-office chart, its tokens.
        Rows can therefore differ in length. The frame is returned even when
        the index holds fewer than ``max_movies`` rows.

    Raises:
        urllib2.URLError: If a movie page cannot be downloaded.
    """
    index_soup = BeautifulSoup(html_link, 'html.parser')
    cells = index_soup.find_all('td')
    rows = []
    # Stop before a trailing group with fewer than six cells: it cannot be a
    # complete movie row.
    for start in range(0, len(cells) - _CELLS_PER_ROW + 1, _CELLS_PER_ROW):
        if max_movies is not None and len(rows) >= max_movies:
            break
        row_cells = cells[start:start + _CELLS_PER_ROW]
        link = row_cells[2].find('a')
        href = link.get('href') if link is not None else None
        if not href:
            # Without a link there is no movie page to scrape for this row.
            continue
        url = _SITE_ROOT + href
        row = [cell.get_text() for cell in row_cells]
        row.append(url)
        # One download and one parse serve both the labelled details and the
        # box-office chart.
        detail_soup = BeautifulSoup(_fetch_html(url), 'html.parser')
        row.extend(solver(_DETAIL_LABELS, _detail_cell_texts(detail_soup)))
        box_office = _box_office_tokens(detail_soup)
        if box_office is not None:
            row.extend(box_office)
        rows.append(row)
    return DataFrame(rows)
import copy
# Run the scraper on the index page HTML held in `main_doc`; this also
# downloads every movie page the index links to.
list_df = txt_link_downloader(main_doc)
#%%
# Deep-copy the scraped result and write the copy to 'sarickmovies3.csv'
# as UTF-8.
# NOTE(review): presumably the copy exists so `list_df` stays untouched for
# further interactive work - confirm it is still needed.
df_deepcopy = copy.deepcopy(list_df)
df_deepcopy.to_csv('sarickmovies3.csv', encoding= 'utf-8' )