Esempio n. 1
0
def domestic_summary(soup):
    """Parse the domestic-summary sub-tables into a flat results dict.

    Each element of *soup* is one summary table whose first <td> is the
    label and whose second <td> holds the value text.  Returns a dict with
    release dates, opening-weekend details, release-span numbers and the
    close date, depending on which labels are present.
    """
    results = {}
    # Label -> result-key dispatch tables, hoisted out of the loop.
    weekend_keys = {
        'Limited Opening Weekend': 'ow_limited',
        'Wide Opening Weekend': 'ow_wide',
        'Opening Weekend': 'ow',
    }
    span_keys = {
        'Widest Release': 'widest_release',
        'In Release': 'days_in_release',
    }
    for table in soup:
        cells = table.select("td")
        header = cells[0]
        # Drop the trailing colon and normalise non-breaking spaces.
        label = header.text[:-1].replace('\xa0', ' ').strip()
        data = cells[1].text.replace('\xa0', ' ').strip()
        if label == 'Release Dates':
            dates = re.findall(r"[a-zA-Z]* \d{1,2}, \d{4}", data)
            results['limited_release_date'] = utils.processDate(
                "%B %d, %Y", dates[0].strip())
            results['wide_release_date'] = utils.processDate(
                "%B %d, %Y", dates[1].strip())
        elif label in weekend_keys:
            key = weekend_keys[label]
            entry = {'id': {'year': None, 'weekend_number': None}}
            link = header.find("a")
            if link:
                entry['id']['year'] = utils.get_id(link['href'], 'yr')
                entry['id']['weekend_number'] = utils.get_id(
                    link['href'], 'wknd')
            entry['gross'] = utils.processNumber(data, ['$', ','], 'float')
            # Third cell carries "#rank ... N theaters"; grab the numbers.
            rank_theaters = re.findall(
                r"\d+", cells[2].text.replace(',', '').strip())
            entry['rank'] = int(rank_theaters[0])
            entry['theaters'] = int(rank_theaters[1])
            results[key] = entry
        elif label in span_keys:
            # Only the leading token (the number) matters here.
            results[span_keys[label]] = utils.processNumber(
                data.split()[0], [','], 'int')
        elif label == 'Close Date':
            results['close_date'] = utils.processDate("%B %d, %Y", data)
    return results
Esempio n. 2
0
def processDateTokens(tags):
    """Normalize every DATE token group in *tags* in place; return *tags*.

    Each entry of ``tags['DATE']`` is a sequence of tokens; the tokens are
    joined with spaces and passed through ``utils.processDate``.  The inner
    list is mutated element-by-element so callers holding a reference to it
    still see the update.
    """
    for field_name in tags:
        if field_name not in ('DATE',):
            continue
        groups = tags[field_name]
        for idx in range(len(groups)):
            groups[idx] = utils.processDate(" ".join(groups[idx]))
    return tags
Esempio n. 3
0
def processDateTokens(tags):
    """Replace each raw DATE token sequence in *tags* with its normalized form.

    Only the ``'DATE'`` key is touched; its list is updated in place (element
    by element) and the same *tags* mapping is returned.
    """
    if 'DATE' in tags:
        values = tags['DATE']
        for position, tokens in enumerate(values):
            values[position] = utils.processDate(" ".join(tokens))
    return tags
Esempio n. 4
0
def scrape_person_chart(soup):
    """Scrape a person's filmography chart into {name: {role: [row, ...]}}.

    *soup* is the parsed page for the person's first role tab; additional
    role tabs are downloaded on demand.  Each row dict carries movie/studio
    ids plus grosses, theater counts and a date (keys from ``keys`` below).
    """
    results = {}
    # Column order of the per-role tables; index 0 of each row is skipped.
    keys = [
        "title", "studio", "domestic_gross", "lifetime_theaters", "ow_gross",
        "ow_theaters", "date"
    ]
    h1 = soup.find("h1")
    if h1:
        name = h1.text.strip()
        results[name] = {}
        navtabs = soup.select("ul.nav_tabs li")
        for i, li in enumerate(navtabs):
            role = li.text.lower()
            results[name][role] = []
            if i > 0:
                # Tabs past the first live on separate pages: fetch the page,
                # then re-locate the i-th tab inside it so find_next("table")
                # below lands on that page's data table.  NOTE: `soup` and
                # `navtabs` are rebound here, but the loop keeps iterating the
                # ORIGINAL navtabs list captured by enumerate() at loop start.
                role_url = "https://www.boxofficemojo.com" + li.find(
                    "a")['href']
                soup = utils.download(role_url)
                navtabs = soup.select("ul.nav_tabs li")
                li = navtabs[i]
            table = li.find_next("table")
            # Data rows alternate between these two background colors.
            rows = table.find_all("tr", {"bgcolor": ["#ffffff", "#f4f4ff"]})
            for r in rows:
                row_dict = {}
                cols = r.select("td")[1:]  # drop the leading row-number cell
                for j, c in enumerate(cols):
                    key = keys[j]
                    data = c.text.strip()
                    if key == 'title':
                        row_dict["movie"] = {}
                        row_dict["movie"]["id"] = utils.get_id(
                            c.find("a")['href'], 'id')
                        row_dict["movie"]["title"] = data
                    elif key == 'studio':
                        row_dict["studio"] = {}
                        row_dict["studio"]["id"] = utils.get_id(
                            c.find("a")['href'], 'studio')
                        row_dict["studio"]["name"] = data
                    else:
                        # Placeholder cells become None; otherwise convert by
                        # column type (money -> float, counts -> int, date).
                        if data in ["-", "n/a"]:
                            data = None
                        else:
                            if 'gross' in key:
                                data = float(data[1:].replace(',', ''))
                            elif 'theaters' in key:
                                data = int(data.replace(',', ''))
                            else:
                                data = utils.processDate("%m/%d/%y", data)
                        row_dict[key] = data
                results[name][role].append(row_dict)
    return results
Esempio n. 5
0
def get_metadata(soup, results):
    """Extract title and metadata from a movie page into *results*.

    Fills ``results['title']`` and ``results['metadata']`` (studio id/name,
    release date, genre, runtime, MPAA rating, budget).  Values shown as
    "N/A"/"TBD" become None.  Returns the mutated *results* dict.
    """
    results["title"] = None
    results["metadata"] = {}
    # Order of the <b> tags inside the metadata table.
    keys = [
        "title", "studio", "release_date", "genre", "runtime_mins",
        "mpaa_rating", "budget"
    ]
    table = soup.find(
        "table", {
            "border": "0",
            "cellpadding": "0",
            "cellspacing": "0",
            "width": "100%",
            "style": "padding-top: 5px;"
        })
    if table:
        # Drop the centered banner cell so it doesn't shift the <b> order.
        extra = table.find("td", {"align": "center", "colspan": "2"})
        if extra:
            extra.decompose()
        metadata_bold = table.find_all("b")
        for i, key in enumerate(keys):
            m = metadata_bold[i]
            metadata = m.text.strip()
            if metadata.lower() in ['n/a', 'tbd']:
                metadata = None
            if key == 'studio':
                url = m.select_one("a")['href']
                studio_id = utils.get_id(url, "studio")
                metadata = {"id": studio_id, "name": metadata}
            elif key == 'budget' and metadata is not None:
                # BUGFIX: the old `' million' -> '000000'` splice corrupted
                # fractional budgets ("$2.5 million" became "2.5000000",
                # i.e. 2.5).  Multiply by 1e6 instead; integer millions
                # ("$100 million") yield the same value as before.
                if ' million' in metadata:
                    metadata = utils.processNumber(
                        metadata.replace(' million', ''), ["$", ","],
                        'float') * 1000000
                else:
                    metadata = utils.processNumber(metadata, ["$", ","],
                                                   'float')
            elif key == 'runtime_mins' and metadata is not None:
                metadata = utils.convert_runtime(metadata)
            elif key == 'release_date' and metadata is not None:
                metadata = utils.processDate("%B %d, %Y", metadata)
            if key == 'title':
                results[key] = metadata
            else:
                results['metadata'][key] = metadata
    return results
Esempio n. 6
0
def scrape_movie_search_data(movie_row):
    """Parse one search-result table row into a details dict.

    *movie_row* is a <tr> element; its <td> cells follow the order in
    ``keys``.  "N/A" cells become None; money/theater/date cells are
    converted to float/int/date.  Also extracts the movie id from the
    title link.  Returns the details dict.
    """
    keys = [
        "title", "studio", "domestic_gross", "max_theaters", "ow_gross",
        "ow_theaters", "release_date"
    ]
    details = {}
    cols = movie_row.find_all("td")
    for i, k in enumerate(keys):
        col = cols[i].text.strip()
        if col.lower() == 'n/a':
            col = None
        if k == "title":
            url = cols[i].select_one("a")['href']
            movie_id = utils.get_id(url, 'id')
            details["id"] = movie_id
        # BUGFIX: guard against col being None (cell was "N/A") before
        # calling .lower() — the old code raised AttributeError here.
        elif k == 'release_date' and col is not None and col.lower() not in [
                'multiple', 'tbd'
        ]:
            col = utils.processDate("%m/%d/%Y", col)
        elif k in ['domestic_gross', 'ow_gross'] and col is not None:
            col = utils.processNumber(col, [",", "$"], 'float')
        elif k in ['max_theaters', 'ow_theaters'] and col is not None:
            col = utils.processNumber(col, [","], 'int')
        details[k] = col
    return details
Esempio n. 7
0
def scrape_daily(soup):
    """Build {date_key: {metric: value}} from the daily box-office table.

    Returns an empty dict when the page shows "NO DAILY DATA AVAILABLE".
    Each day's dict also keeps the raw date text under the 'date' key,
    alongside rank/gross/change/theater metrics converted by type.
    """
    results = {}
    column_keys = [
        "date", "rank", "gross", "yd_change", "lw_change", "theaters",
        "gross_to_date"
    ]
    container = soup.select_one('table[width="95%"] center')
    if container.text.strip() == "NO DAILY DATA AVAILABLE":
        return results
    for row in container.find_all("tr", {"bgcolor": ["#ffffff", "#f4f4ff"]}):
        all_cols = row.find_all("td")
        # Keep only columns 1-6 and 8 (skip the leading cell and column 7).
        wanted = [c for pos, c in enumerate(all_cols)
                  if 1 <= pos <= 6 or pos == 8]
        datekey = None
        for idx, cell in enumerate(wanted):
            key = column_keys[idx]
            value = cell.text.strip()
            if key == 'date':
                datekey = utils.processDate("%b %d, %Y",
                                            value.replace('.', ''))
                results[datekey] = {}
                # Note: the raw date text is still stored below under 'date'.
            elif value == '-':
                value = None
            elif key in ('yd_change', 'lw_change'):
                # Percentage like "+12.3%" -> 0.123 (rounded to 3 places).
                value = round(float(value[:-1].replace(',', '')) / 100, 3)
            elif key in ('gross', 'gross_to_date'):
                value = float(value[1:].replace(',', ''))
            else:
                value = int(value.replace(',', ''))
            results[datekey][key] = value
    return results
Esempio n. 8
0
rentDataFrame['rentType'] = []  # e.g. by sharing or individual
for eachRow in dataList:

    # use id to avoid duplicated rent request
    try:
        rentDataFrame['id'].append(eachRow['_id'])
    except:
        continue

    try:
        rentDataFrame['price'].append(processPrice(eachRow['jiage']))
    except:
        rentDataFrame['price'].append(None)

    try:
        rentDataFrame['inDate'].append(processDate(eachRow['indate']))
    except:
        rentDataFrame['inDate'].append(None)

    try:
        rentDataFrame['rentArea'].append(eachRow['rentarea'])
    except:
        rentDataFrame['rentArea'].append(None)

    try:
        rentDataFrame['houseType'].append(eachRow['huxings'])
    except:
        rentDataFrame['houseType'].append(None)

    try:
        rentDataFrame['rentType'].append(eachRow['fangshi'][0]['type'])