Ejemplo n.º 1
0
    def execute(self, data):
        """Look up a movie and report the model's verdict on it.

        ``data["movie"]`` is converted to a string and passed to
        ``le.get_movie``. When at least one match is reported, the reviews of
        the first returned id are fetched and turned into a matrix by
        ``le.build_matrix``; every column but the last is fed to the
        module-level ``RF`` model and the last column is used as the target.
        (Presumably ``RF`` is a fitted classifier and the target column holds
        Ebert's own rating -- confirm against the training code.)

        Returns a list of five single-key dicts, in this order: ``Count``,
        ``info`` (title and date string joined by a space), ``EbertReviewed``,
        ``great`` and ``prob``. The last three keep their defaults
        (``False``, ``False``, ``0``) when no movie matched or the matrix
        has no entries.
        """
        client = RT.RT()
        query = str(data["movie"])
        count, title, date_string, fresh_score, id_list = le.get_movie(client, query)

        # Defaults reported when there is nothing to score.
        ebert_reviewed, is_great, probability = False, False, 0

        if count > 0:
            # Only the review list is needed; the stated total and last page
            # number returned alongside it are ignored here.
            _, _, reviews = le.get_reviews(client, id_list[0], fresh_score)
            entries, matrix = le.build_matrix(
                reviews, id_list, critic_list, fillzeros=True
            )
            if entries > 0:
                features = matrix[:, :-1]
                target = matrix[:, -1]
                ebert_reviewed = target[0] > 0
                is_great = RF.predict(features)[0] == 1
                # Column 1 of predict_proba is reported as the probability.
                probability = RF.predict_proba(features)[0, 1]

        return [
            {"Count": count},
            {"info": title + " " + date_string},
            {"EbertReviewed": ebert_reviewed},
            {"great": is_great},
            {"prob": probability},
        ]
Ejemplo n.º 2
0
def scrape_reviews(df, collection, starting_row=0):
    '''
    Iterate over movie ids in dataframe and write reviews to a Mongo collection
    Dataframe will be updated in place with progress on the scraping

    For every row from position starting_row onward, the row's 'movie_id' and
    'freshness' (surrounding whitespace and '%' stripped) are passed to
    le.get_reviews, the returned reviews are handed to add_to_mongo, and the
    row's 'total', 'found' and 'pages' cells are overwritten with the stated
    total, the number of reviews actually returned and the final page number.

    Input/Output:   df (Pandas Dataframe)
                    collection (Mongo collection)
                    starting_row - if restarting set to first unscraped row (int)
    '''
    rt = RT.RT()
    # Only the labels are needed, so walk the index instead of building a
    # Series per row with iterrows().
    for index in df.index[starting_row:]:
        if index % 50 == 0:
            # Single-argument print emits the same text on Python 2 and 3.
            print('Scraping movie: %s' % (index,))
        m_id = df.at[index, 'movie_id']
        m_fresh = df.at[index, 'freshness'].strip().strip('%')
        total_stated, final_page, review_list = le.get_reviews(rt, m_id, m_fresh)
        add_to_mongo(review_list, collection)
        total_items = len(review_list)
        # NOTE(review): this fires when MORE reviews were collected than the
        # stated total (with a tolerance of one). Confirm that is the intended
        # check rather than "found fewer than stated".
        if total_stated < total_items - 1:
            print('For %s found %s out of %s' % (m_id, total_items, total_stated))
        # Write through a single indexer: chained df[col][index] = value may
        # assign to a temporary copy and silently leave df unchanged.
        df.at[index, 'total'] = total_stated
        df.at[index, 'found'] = total_items
        df.at[index, 'pages'] = final_page