import re

import utils  # shared helpers used throughout this module: download, get_id, processDate, processNumber, convert_runtime


def domestic_summary(soup):
    """Parse the domestic-summary table sections into a flat results dict."""
    # separate by table
    results = {}
    for s in soup:
        subtables = s.select("td")
        header = subtables[0]
        label = header.text[:-1].replace('\xa0', ' ').strip()  # drop trailing colon
        data = subtables[1].text.replace('\xa0', ' ').strip()
        if label == 'Release Dates':
            # expects both a limited and a wide release date, e.g. "June 12, 2015"
            data = re.findall(r"[a-zA-Z]* \d{1,2}, \d{4}", data)
            results['limited_release_date'] = utils.processDate(
                "%B %d, %Y", data[0].strip())
            results['wide_release_date'] = utils.processDate(
                "%B %d, %Y", data[1].strip())
        elif label in [
                'Limited Opening Weekend', 'Wide Opening Weekend',
                'Opening Weekend'
        ]:
            keys = {
                'Limited Opening Weekend': 'ow_limited',
                'Wide Opening Weekend': 'ow_wide',
                'Opening Weekend': 'ow'
            }
            key = keys[label]
            results[key] = {}
            results[key]['id'] = {'year': None, 'weekend_number': None}
            weekend_url = header.find("a")
            if weekend_url:
                results[key]['id']['year'] = utils.get_id(
                    weekend_url['href'], 'yr')
                results[key]['id']['weekend_number'] = utils.get_id(
                    weekend_url['href'], 'wknd')
            results[key]['gross'] = utils.processNumber(
                data, ['$', ','], 'float')
            # third cell: rank and theater count are the first two numbers
            data = subtables[2].text.replace(',', '').strip()
            data = re.findall(r"\d+", data)
            results[key]['rank'] = int(data[0])
            results[key]['theaters'] = int(data[1])
        elif label in ['Widest Release', 'In Release']:
            keys = {
                'Widest Release': 'widest_release',
                'In Release': 'days_in_release'
            }
            key = keys[label]
            data = data.split()[0]  # keep the leading number, drop the unit word
            results[key] = utils.processNumber(data, [','], 'int')
        elif label == 'Close Date':
            results['close_date'] = utils.processDate("%B %d, %Y", data)
    return results
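# A minimal usage sketch (not from the source): the URL and the row selector
# below are assumptions; domestic_summary expects an iterable of table
# sections whose first <td> holds the label and second <td> the value.
summary_page = utils.download(
    "https://www.boxofficemojo.com/movies/?id=avatar.htm")  # hypothetical URL
summary = domestic_summary(
    summary_page.select("div.mp_box_content table tr"))  # hypothetical selector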
def processDateTokens(tags):
    """Normalize every token group tagged DATE; other tag groups pass through."""
    field = ['DATE']
    for key in tags:
        if key in field:
            for i, val in enumerate(tags[key]):
                raw = " ".join(val)  # rejoin tokens, e.g. ['June', '12,', '2015']
                normDate = utils.processDate(raw)
                tags[key][i] = normDate
    return tags
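# Illustrative call (assumed input shape): tags maps NER-style labels to
# lists of token lists; only 'DATE' groups are rewritten, and the normalized
# string format depends on utils.processDate.
example_tags = {
    "DATE": [["June", "12,", "2015"]],
    "PERSON": [["James", "Cameron"]],  # left unchanged
}
normalized = processDateTokens(example_tags)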
def scrape_person_chart(soup):
    """Scrape a person's filmography chart, one list of rows per role tab."""
    results = {}
    keys = [
        "title", "studio", "domestic_gross", "lifetime_theaters", "ow_gross",
        "ow_theaters", "date"
    ]
    h1 = soup.find("h1")
    if h1:
        name = h1.text.strip()
        results[name] = {}
        navtabs = soup.select("ul.nav_tabs li")
        for i, li in enumerate(navtabs):
            role = li.text.lower()
            results[name][role] = []
            if i > 0:
                # every tab after the first lives on its own page
                role_url = "https://www.boxofficemojo.com" + li.find(
                    "a")['href']
                soup = utils.download(role_url)
                navtabs = soup.select("ul.nav_tabs li")
                li = navtabs[i]
            table = li.find_next("table")
            rows = table.find_all("tr", {"bgcolor": ["#ffffff", "#f4f4ff"]})
            for r in rows:
                row_dict = {}
                cols = r.select("td")[1:]  # skip the leading row-number cell
                for j, c in enumerate(cols):
                    key = keys[j]
                    data = c.text.strip()
                    if key == 'title':
                        row_dict["movie"] = {}
                        row_dict["movie"]["id"] = utils.get_id(
                            c.find("a")['href'], 'id')
                        row_dict["movie"]["title"] = data
                    elif key == 'studio':
                        row_dict["studio"] = {}
                        row_dict["studio"]["id"] = utils.get_id(
                            c.find("a")['href'], 'studio')
                        row_dict["studio"]["name"] = data
                    else:
                        if data in ["-", "n/a"]:
                            data = None
                        elif 'gross' in key:
                            data = float(data[1:].replace(',', ''))  # strip "$"
                        elif 'theaters' in key:
                            data = int(data.replace(',', ''))
                        else:
                            data = utils.processDate("%m/%d/%y", data)
                        row_dict[key] = data
                results[name][role].append(row_dict)
    return results
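# Hypothetical usage: the person-chart URL pattern is an assumption, but
# utils.download is the same fetch helper scrape_person_chart itself uses
# for the extra role tabs.
person_page = utils.download(
    "https://www.boxofficemojo.com/people/chart/?id=tomhanks.htm")  # hypothetical URL
filmography = scrape_person_chart(person_page)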
def get_metadata(soup, results):
    """Fill results['title'] and results['metadata'] from the movie info table."""
    results["title"] = None
    results["metadata"] = {}
    keys = [
        "title", "studio", "release_date", "genre", "runtime_mins",
        "mpaa_rating", "budget"
    ]
    table = soup.find(
        "table", {
            "border": "0",
            "cellpadding": "0",
            "cellspacing": "0",
            "width": "100%",
            "style": "padding-top: 5px;"
        })
    if table:
        # drop the centered banner cell so the <b> tags line up with `keys`
        extra = table.find("td", {"align": "center", "colspan": "2"})
        if extra:
            extra.decompose()
        metadata_bold = table.find_all("b")
        for i, key in enumerate(keys):
            m = metadata_bold[i]
            metadata = m.text.strip()
            if metadata.lower() in ['n/a', 'tbd']:
                metadata = None
            if key == 'studio':
                url = m.select_one("a")['href']
                studio_id = utils.get_id(url, "studio")
                metadata = {"id": studio_id, "name": metadata}
            elif key == 'budget' and metadata is not None:
                # scale "$X million" numerically rather than appending zeros,
                # so fractional budgets like "$1.5 million" stay correct
                if ' million' in metadata:
                    metadata = utils.processNumber(
                        metadata.replace(' million', ''), ["$", ","],
                        'float') * 1000000
                else:
                    metadata = utils.processNumber(metadata, ["$", ","],
                                                   'float')
            elif key == 'runtime_mins' and metadata is not None:
                metadata = utils.convert_runtime(metadata)
            elif key == 'release_date' and metadata is not None:
                metadata = utils.processDate("%B %d, %Y", metadata)
            if key == 'title':
                results[key] = metadata
            else:
                results['metadata'][key] = metadata
    return results
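# Sketch: get_metadata mutates and returns the dict it is given, so it can
# seed the per-movie record that the other scrapers extend. Hypothetical URL.
movie_page = utils.download(
    "https://www.boxofficemojo.com/movies/?id=titanic.htm")  # hypothetical URL
movie = get_metadata(movie_page, {})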
def scrape_movie_search_data(movie_row):
    """Parse one search-result table row into a details dict."""
    keys = [
        "title", "studio", "domestic_gross", "max_theaters", "ow_gross",
        "ow_theaters", "release_date"
    ]
    details = {}
    cols = movie_row.find_all("td")
    for i, k in enumerate(keys):
        col = cols[i].text.strip()
        if col.lower() == 'n/a':
            col = None
        if k == "title":
            url = cols[i].select_one("a")['href']
            movie_id = utils.get_id(url, 'id')
            details["id"] = movie_id
        elif k == 'release_date' and col is not None and col.lower() not in [
                'multiple', 'tbd'
        ]:
            col = utils.processDate("%m/%d/%Y", col)
        elif k in ['domestic_gross', 'ow_gross'] and col is not None:
            col = utils.processNumber(col, [",", "$"], 'float')
        elif k in ['max_theaters', 'ow_theaters'] and col is not None:
            col = utils.processNumber(col, [","], 'int')
        details[k] = col
    return details
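# Sketch: scrape_movie_search_data parses a single <tr>, so a search page is
# handled row by row. The search URL and the bgcolor row filter are
# assumptions modeled on the filters used elsewhere in this module.
search_page = utils.download(
    "https://www.boxofficemojo.com/search/?q=batman")  # hypothetical URL
movies = [
    scrape_movie_search_data(r)
    for r in search_page.find_all("tr", {"bgcolor": ["#ffffff", "#f4f4ff"]})
]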
def scrape_daily(soup):
    """Scrape the daily-grosses table, keyed by normalized date."""
    results = {}
    keys = [
        "date", "rank", "gross", "yd_change", "lw_change", "theaters",
        "gross_to_date"
    ]
    container = soup.select_one('table[width="95%"] center')
    if container is not None and container.text.strip(
    ) != "NO DAILY DATA AVAILABLE":
        rows = container.find_all("tr", {"bgcolor": ["#ffffff", "#f4f4ff"]})
        for r in rows:
            all_cols = r.find_all("td")
            # keep columns 1-6 plus 8; columns 0 and 7 are skipped
            cols = [
                c for i, c in enumerate(all_cols) if 1 <= i <= 6 or i == 8
            ]
            datekey = None
            for i, c in enumerate(cols):
                key = keys[i]
                data = c.text.strip()
                if key == 'date':
                    datekey = utils.processDate("%b %d, %Y",
                                                data.replace('.', ''))
                    results[datekey] = {}
                else:
                    if data == '-':
                        data = None
                    elif key in ['yd_change', 'lw_change']:
                        # percentage string -> fractional change, e.g. "+12.5%" -> 0.125
                        data = round(
                            float(data[:-1].replace(',', '')) / 100, 3)
                    elif key in ['gross', 'gross_to_date']:
                        data = float(data[1:].replace(',', ''))  # strip "$"
                    else:
                        data = int(data.replace(',', ''))
                    results[datekey][key] = data
    return results
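# Sketch: the daily chart lives on a ?page=daily view; the URL is a
# hypothetical example. Results come back keyed by the normalized date.
daily_page = utils.download(
    "https://www.boxofficemojo.com/movies/?page=daily&id=avatar.htm")  # hypothetical URL
daily = scrape_daily(daily_page)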
# the other column lists ('id', 'price', 'inDate', 'rentArea', 'houseType')
# are initialized the same way above this excerpt
rentDataFrame['rentType'] = []  # e.g. by sharing or individual

for eachRow in dataList:
    # use id to avoid duplicated rent requests: rows without an _id are skipped
    try:
        rentDataFrame['id'].append(eachRow['_id'])
    except KeyError:
        continue
    try:
        rentDataFrame['price'].append(processPrice(eachRow['jiage']))  # 'jiage' = price
    except Exception:
        rentDataFrame['price'].append(None)
    try:
        rentDataFrame['inDate'].append(processDate(eachRow['indate']))  # move-in date
    except Exception:
        rentDataFrame['inDate'].append(None)
    try:
        rentDataFrame['rentArea'].append(eachRow['rentarea'])
    except Exception:
        rentDataFrame['rentArea'].append(None)
    try:
        rentDataFrame['houseType'].append(eachRow['huxings'])  # 'huxing' = floor plan
    except Exception:
        rentDataFrame['houseType'].append(None)
    try:
        rentDataFrame['rentType'].append(eachRow['fangshi'][0]['type'])  # 'fangshi' = rental mode
    except Exception:
        rentDataFrame['rentType'].append(None)
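# Once every column list has one entry per kept row, the dict of lists
# converts straight to a tabular frame; pandas is an assumption here, as
# the excerpt above never imports it.
import pandas as pd

rentDf = pd.DataFrame(rentDataFrame)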