Esempio n. 1
0
    def graph_features(self):
        """Return the per-movie graph feature table, building it on first use.

        The table is cached in ``self._graph_features``. While the cache is
        empty the method first tries to read
        ``{DATA_PATH}/bechdel_features.csv``. If that read fails, the table is
        rebuilt from ``../temp/graph_features.csv`` joined with the results of
        ``get_female_in_top_10_roles()`` and ``triangles()``, extended with a
        ``total_tri`` column and the ``0%``..``3%`` ratio columns, and saved
        back to the features CSV.

        Returns:
            The cached feature table.
        """
        # The cache lives in ``_graph_features``; reading ``self.graph_features``
        # here would refer back to this accessor rather than the stored table.
        if not self._graph_features:
            try:
                self._graph_features = SFrame.read_csv(
                    f"{DATA_PATH}/bechdel_features.csv")
            except Exception:
                # ``Exception`` instead of a bare ``except`` so that
                # KeyboardInterrupt / SystemExit still propagate.
                t = triangles()
                features = SFrame.read_csv("../temp/graph_features.csv")

                features = features.join(
                    SFrame(get_female_in_top_10_roles()),
                    on={
                        "movie_name": "movie_name",
                        "year": "year"
                    })
                features = features.join(
                    SFrame(t), on={
                        "movie_name": "movie",
                        "year": "year"
                    })
                features["total_tri"] = (features["0"] + features["1"] +
                                         features["2"] + features["3"])
                for i in range(4):
                    features[f"{i}%"] = features[str(i)] / features["total_tri"]

                # Cache only the fully built table, then persist it so the
                # next run can take the fast path above.
                self._graph_features = features
                features.save(f"{DATA_PATH}/bechdel_features.csv", "csv")
        return self._graph_features
Esempio n. 2
0
 def rating(self):
     """Return the ratings table joined with ``self.title``, loading it once.

     On first use the ratings archive is fetched via ``download_file`` into
     ``{OUTPUT_PATH}/title.ratings.tsv.gz``, parsed as a tab-separated file
     (``\\N`` treated as missing) and joined with ``self.title`` on the
     columns the two tables share. The result is cached in ``self._rating``.

     Returns:
         The cached, joined ratings table.
     """
     if self._rating is None:
         ratings_path = f"{OUTPUT_PATH}/title.ratings.tsv.gz"
         download_file(IMDB_RATING_URL, ratings_path, False)
         ratings = SFrame.read_csv(ratings_path, delimiter="\t", na_values=["\\N"],
                                   verbose=self._verbose)
         # Assign the cache only after the join succeeded; otherwise a failure
         # while loading ``self.title`` would leave the un-joined ratings
         # cached and silently returned by every later call.
         self._rating = ratings.join(self.title)
     return self._rating
Esempio n. 3
0
def get_relationship_triangles():
    """Load the actor triangles and annotate them with gender and title data.

    Reads columns ``"0"``..``"4"`` of ``{OUTPUT_PATH}/triangles.csv``. Columns
    ``"0"``-``"2"`` are passed to ``imdb_data.get_actor_gender``; columns
    ``"3"`` and ``"4"`` become ``movie`` and ``year``. Rows with missing values
    are dropped, the result is joined with ``imdb_data.title`` on
    (``movie`` = ``primaryTitle``, ``year`` = ``startYear``), and the flag
    columns ``"1"``..``"3"`` (member gender equals ``"M"``), ``total_men`` and a
    comma-split ``genres`` column are added.

    Returns:
        The annotated SFrame.
    """
    raw = SFrame.read_csv(f"{OUTPUT_PATH}/triangles.csv",
                          usecols=["0", "1", "2", "3", "4"])

    # One list of three genders per triangle, unpacked into X.0 / X.1 / X.2.
    genders = raw.apply(lambda row: [
        imdb_data.get_actor_gender(row[member]) for member in ("0", "1", "2")
    ]).unpack()
    genders["movie"] = raw["3"]
    genders["year"] = raw["4"]

    genders = genders.dropna().join(imdb_data.title, {
        "movie": "primaryTitle",
        "year": "startYear"
    })

    # Flag columns: does each triangle member have gender "M"?
    for flag_col, gender_col in (("1", "X.0"), ("2", "X.1"), ("3", "X.2")):
        genders[flag_col] = genders[gender_col] == "M"
    genders["total_men"] = genders["1"] + genders["2"] + genders["3"]

    genders["genres"] = genders["genres"].apply(
        lambda joined: joined.split(","))
    return genders
Esempio n. 4
0
 def crew(self):
     """Return the crew table with one row per (title, director), loaded once.

     On first use the crew archive is fetched via ``download_file`` into
     ``{OUTPUT_PATH}/title.crew.tsv.gz`` and parsed as a tab-separated file
     (``\\N`` treated as missing). The comma-separated ``directors`` column is
     split into a list and stacked so each director gets its own row. The
     result is cached in ``self._crew``.

     Returns:
         The cached, stacked crew table.
     """
     if self._crew is None:
         crew_path = f"{OUTPUT_PATH}/title.crew.tsv.gz"
         download_file(IMDB_CREW_URL, crew_path, False)
         crew_sf = SFrame.read_csv(crew_path, delimiter="\t", na_values=["\\N"],
                                   verbose=self._verbose)
         # Work on the local frame rather than re-entering ``self.crew`` while
         # it is still being built, and cache only the finished result.
         crew_sf["directors"] = crew_sf["directors"].apply(lambda c: c.split(","))
         self._crew = crew_sf.stack("directors", "directors")
     return self._crew
Esempio n. 5
0
    def popular_actors(self):
        """Return per-actor rating statistics with names and gender, loaded once.

        On first use the principals archive is fetched via ``download_file``
        and reduced to the ``tconst``/``nconst`` pairs whose ``category`` is
        ``actor`` or ``actress``. These are joined with the rows of
        ``self.rating`` that have ``titleType == "movie"`` and more than 1000
        votes, then aggregated per ``nconst`` into the mean ``averageRating``
        and a row ``count``. The aggregate is joined with
        ``{OUTPUT_PATH}/name.basics.tsv.gz`` and given a ``gender`` column via
        ``self.add_actor_gender``. The result is cached in ``self._actors``.

        Returns:
            The cached actors table.
        """
        if self._actors is None:
            principals_path = f"{OUTPUT_PATH}/title.principals.tsv.gz"
            download_file(IMDB_PRINCIPALS_URL, principals_path, False)
            actors = SFrame.read_csv(principals_path, delimiter="\t", na_values=["\\N"],
                                     verbose=self._verbose)
            actors = actors.filter_by(["actor", "actress"], "category")["tconst", "nconst"]

            actors = actors.join(
                self.rating[(self.rating["titleType"] == "movie") & (self.rating["numVotes"] > 1000)])
            actors = actors.groupby("nconst", operations={'averageRating': agg.AVG("averageRating"),
                                                          'count': agg.COUNT()})
            # NOTE(review): the join below may not preserve this ordering - confirm.
            actors = actors.sort("averageRating", ascending=False)
            names = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz", delimiter="\t")

            actors = actors.join(names)
            actors["gender"] = actors.apply(lambda p: self.add_actor_gender(p))
            # Publish to the cache only once every step succeeded, so a failure
            # part-way through cannot leave a half-processed table cached.
            self._actors = actors

        return self._actors
Esempio n. 6
0
 def actors_movies(self):
     """Return actor/movie pairings enriched with title and actor data, loaded once.

     On first use the principals archive is fetched via ``download_file`` and
     reduced to the ``tconst``/``nconst``/``characters`` columns of rows whose
     ``category`` is ``actor`` or ``actress``. These are joined with the
     ``titleType == "movie"`` rows of ``self.title`` and then with
     ``self.all_actors``. The result is cached in ``self._actors_movies``.

     Returns:
         The cached actors-movies table.
     """
     if self._actors_movies is None:
         principals_path = f"{OUTPUT_PATH}/title.principals.tsv.gz"
         download_file(IMDB_PRINCIPALS_URL, principals_path, False)
         pairs = SFrame.read_csv(principals_path, delimiter="\t",
                                 na_values=["\\N"], verbose=self._verbose)
         pairs = pairs.filter_by(["actor", "actress"], "category")[
             "tconst", "nconst", "characters"]
         pairs = pairs.join(self.title[self.title["titleType"] == "movie"])
         # Cache only the fully joined table so a failure while loading
         # ``self.title`` or ``self.all_actors`` cannot leave a partial result.
         self._actors_movies = pairs.join(self.all_actors)
     return self._actors_movies
 def paper_fields_of_study(self):
     """
     Load ``~/mag/PaperFieldsOfStudy.txt.gz`` as an SFrame with named columns.

     The file is read as a header-less, tab-separated table and its
     auto-generated columns ``X1``..``X3`` are renamed to ``PaperId``,
     ``FieldOfStudyId`` and ``Score``.
     """
     column_names = ["PaperId", "FieldOfStudyId", "Score"]
     fields_sf = SFrame.read_csv("~/mag/PaperFieldsOfStudy.txt.gz",
                                 header=False,
                                 sep="\t")
     renames = {
         f"X{position}": name
         for position, name in enumerate(column_names, start=1)
     }
     return fields_sf.rename(renames)
Esempio n. 8
0
 def all_actors(self):
     """Return every person whose profession is actor/actress, loaded once.

     On first use the names archive is fetched via ``download_file`` into
     ``{OUTPUT_PATH}/name.basics.tsv.gz`` and parsed as a tab-separated file
     (``\\N`` treated as missing). The comma-separated ``primaryProfession``
     column is split and stacked to one row per profession, filtered to
     ``actor``/``actress``, and a ``gender`` column is added via
     ``self.add_actor_gender``. The result is cached in ``self._all_actors``.

     Returns:
         The cached actors table.
     """
     if self._all_actors is None:
         names_path = f"{OUTPUT_PATH}/name.basics.tsv.gz"
         download_file(IMDB_NAMES_URL, names_path, False)
         people = SFrame.read_csv(names_path, delimiter="\t",
                                  na_values=["\\N"], verbose=self._verbose)
         people["primaryProfession"] = people["primaryProfession"].apply(lambda x: x.split(","))
         people = people.stack("primaryProfession", "primaryProfession")
         people = people.filter_by(["actor", "actress"], "primaryProfession")
         people["gender"] = people.apply(lambda p: self.add_actor_gender(p))
         # Publish to the cache only after all steps succeeded, so an error in
         # any of them cannot leave an unfiltered or gender-less table cached.
         self._all_actors = people
     return self._all_actors
 def references(self):
     """Load ``PaperReferences.txt.gz`` from the dataset directory.

     The file is read as a header-less, tab-separated table; columns ``X1``
     and ``X2`` are renamed to ``PaperId`` and ``PaperReferenceId``.
     """
     source = str(self._dataset_dir / "PaperReferences.txt.gz")
     column_names = {"X1": "PaperId", "X2": "PaperReferenceId"}
     table = SFrame.read_csv(source, header=False, delimiter="\t")
     return table.rename(column_names)
 def papers_fields_of_study(self):
     """Load ``PapersFieldsOfStudy.txt.gz`` from the dataset directory.

     The file is read as a header-less, tab-separated table; columns
     ``X1``..``X3`` are renamed to ``PaperId``, ``FieldOfStudyId`` and
     ``Score``.

     Returns:
         The renamed fields-of-study SFrame.
     """
     fos = SFrame.read_csv(str(self._dataset_dir /
                               "PapersFieldsOfStudy.txt.gz"),
                           header=False,
                           delimiter="\t")
     # Rename the frame that was just read; ``references`` is not a name
     # defined in this method.
     return fos.rename({
         "X1": "PaperId",
         "X2": "FieldOfStudyId",
         "X3": "Score"
     })
Esempio n. 11
0
 def __init__(self):
     """Load the Bechdel data set and prepare the classifier.

     Reads ``{DATA_PATH}/bechdel.csv`` (keeping ``imdbid`` as a string),
     orders it by ``year`` descending, derives ``tconst`` as ``"tt"`` +
     ``imdbid``, and joins it with ``imdb_data.title`` into
     ``self.bechdel_imdb``. Also creates the random-forest classifier and an
     empty ``_graph_features`` cache.
     """
     self.bechdel = SFrame.read_csv(f"{DATA_PATH}/bechdel.csv",
                                    column_type_hints={"imdbid": str})
     # ``sort`` returns a new SFrame rather than sorting in place, so the
     # result has to be assigned back for the ordering to take effect.
     self.bechdel = self.bechdel.sort("year", False)
     self.bechdel["tconst"] = "tt" + self.bechdel["imdbid"]
     self.bechdel_imdb = imdb_data.title.join(self.bechdel)
     self.clf = RandomForestClassifier(n_jobs=-1,
                                       n_estimators=100,
                                       max_depth=5,
                                       random_state=1)
     self._graph_features = SFrame()
 def field_of_study_children(self):
     """
     Load ``FieldOfStudyChildren.txt.gz`` from the dataset directory.

     The file is read as a header-less, tab-separated table; columns ``X1``
     and ``X2`` are renamed to ``FieldOfStudyId`` and ``ChildFieldOfStudyId``.
     """
     source = str(self._dataset_dir / "FieldOfStudyChildren.txt.gz")
     column_names = {"X1": "FieldOfStudyId", "X2": "ChildFieldOfStudyId"}
     hierarchy = SFrame.read_csv(source, header=False, delimiter="\t")
     return hierarchy.rename(column_names)
Esempio n. 13
0
def generate_blacklist_roles():
    """Build a list of character names and export it as the roles blacklist.

    Reads first names and surnames from ``DATA_PATH``, the principals table
    from ``OUTPUT_PATH`` and the movie titles from ``imdb_data.title``, filters
    the character names through the steps below, and writes the remaining
    ``character`` column to ``{OUTPUT_PATH}/blacklist_roles.csv``. An
    intermediate table is also written to ``{TEMP_PATH}/roles3.csv``.

    Returns nothing; the results are the two exported CSV files.
    """
    # Name lists used further down to exclude characters whose last word is
    # a known name; surnames are title-cased first.
    firstnames = SFrame.read_csv(f"{DATA_PATH}/firstnames.csv",
                                 verbose=False)["Name"]
    surenames = SFrame.read_csv(f"{DATA_PATH}/surenames.csv",
                                verbose=False)["name"]
    surenames = surenames.apply(lambda n: n.title())
    # Principals table, with the ``characters`` column parsed as a list.
    sf = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz",
                         delimiter="\t",
                         column_type_hints={"characters": list},
                         na_values=["\\N"])
    # Keep only actor/actress rows and the four columns needed below.
    sf = sf.filter_by(["actor", "actress"], "category")["tconst", "ordering",
                                                        "characters", "nconst"]
    # Restrict to titles whose titleType is "movie".
    sf = sf.join(imdb_data.title[imdb_data.title["titleType"] == "movie"])
    # One row per character name, title-cased.
    sf = sf.stack("characters", "character")
    sf["character"] = sf["character"].apply(lambda c: c.title())
    sf.export_csv(f"{TEMP_PATH}/roles3.csv")

    # Characters that occur more than once for the same person (nconst) ...
    whitelist = sf.groupby(key_column_names=['character', "nconst"],
                           operations={'count': agg.COUNT()})
    whitelist = whitelist[whitelist["count"] > 1]['character']
    # ... are removed from the candidates (third argument is ``exclude``).
    sf = sf.filter_by(whitelist, "character", True)
    # Per character: mean billing ``ordering`` and number of occurrences.
    sf = sf.groupby(key_column_names=['character'],
                    operations={
                        'ordering': agg.AVG("ordering"),
                        'count': agg.COUNT()
                    })
    # ``name`` is the last space-separated word of the character name.
    sf["name"] = sf["character"].apply(lambda c: c.split(" ")[-1].strip())
    # Drop characters whose last word appears in any of the name lists.
    # NOTE(review): ``names`` / ``wordnet`` are presumably NLTK corpora - confirm.
    sf = sf.filter_by(names.words(), "name", exclude=True)
    sf = sf.filter_by(surenames, "name", exclude=True)
    sf = sf.filter_by(firstnames, "name", exclude=True)
    # Most frequent characters first (second argument is ``ascending``).
    sf = sf.sort("count", False)
    # Keep characters whose mean ordering is above 3.
    sf = sf[sf['ordering'] > 3]
    # Title-cased wordnet words that are not also in ``names.words()``.
    w = {x.replace("_", " ").title()
         for x in wordnet.words()} - set(names.words())
    # ``set`` holds the words of the character name that are in ``w``.
    sf["set"] = sf["character"].apply(lambda x: x.split(" "))
    sf["set"] = sf["set"].apply(lambda x: w & set(x))
    # Final selection: count > 11, plus 1 < count < 10 with a non-empty ``set``.
    # NOTE(review): counts of exactly 10 and 11 match neither branch - confirm
    # whether that gap is intended.
    sf = sf[sf['count'] > 11].append(sf[(sf['count'] > 1) & (sf['count'] < 10)
                                        & (sf["set"] != [])])
    sf[["character"]].export_csv(f"{OUTPUT_PATH}/blacklist_roles.csv")
def createFrame(file):
    """Load one label file from ``args.labels`` as an SFrame of boxes.

    The file is read as a header-less, space-separated table. Columns
    ``X1``..``X5`` are renamed to ``name``, ``xMin``, ``yMin``, ``xMax`` and
    ``yMax``, and an ``image`` column is added holding ``file`` without its
    extension.
    """
    column_names = {
        'X1': 'name',
        'X2': 'xMin',
        'X3': 'yMin',
        'X4': 'xMax',
        'X5': 'yMax'
    }
    label_path = '/'.join((args.labels, file))
    boxes = SFrame.read_csv(label_path, delimiter=' ', header=False)
    boxes = boxes.rename(column_names)
    image_name, _extension = os.path.splitext(file)
    boxes['image'] = image_name
    return boxes
Esempio n. 15
0
    def get_directors_data(self):
        """Return per-director rating statistics, highest average first.

        Joins ``self.crew`` with the rows of ``self.rating`` that have more
        than 10000 votes and with the ``titleType == "movie"`` rows of
        ``self.title``. The result is aggregated per ``directors`` value into
        the mean ``averageRating`` and a row ``count``; only directors with a
        count above 5 are kept. These are joined with
        ``{OUTPUT_PATH}/name.basics.tsv.gz`` (``directors`` = ``nconst``) and
        sorted by ``averageRating`` descending.
        """
        well_voted = self.rating[self.rating["numVotes"] > 10000]
        directed = self.crew.join(well_voted)

        movies = self.title[self.title["titleType"] == "movie"]
        per_director = directed.join(movies).groupby(
            key_column_names='directors',
            operations={'averageRating': agg.AVG("averageRating"), 'count': agg.COUNT()})

        frequent = per_director[per_director["count"] > 5]

        names = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz", delimiter="\t")
        named = frequent.join(names, {"directors": "nconst"})
        return named.sort("averageRating", ascending=False)
Esempio n. 16
0
    def sjr_to_csv(self, regex):
        """Combine the yearly CSV files matching ``regex`` into one SFrame.

        Every ``.csv`` path returned by ``self._dataset_dir.glob(regex)`` is
        read with ``;`` as delimiter. The year is taken from a four-digit
        number in the file name, stored in a ``Year`` column, and the
        year-specific ``Total Docs. (<year>)`` column is renamed to
        ``Total Docs.``. A missing ``Categories`` column is added as empty
        strings. After appending all files, every 8-digit run in ``Issn`` is
        extracted and the table is stacked to one row per value in a new
        ``ISSN`` column.

        Args:
            regex: Glob pattern passed to ``self._dataset_dir.glob``.

        Returns:
            The combined, stacked SFrame.
        """
        combined = SFrame()
        for path in self._dataset_dir.glob(regex):
            if path.suffix != ".csv":
                continue
            year = int(re.match(r'.*([1-3][0-9]{3})', path.name).group(1))
            yearly = SFrame.read_csv(str(path), delimiter=';')
            yearly['Year'] = year
            yearly = yearly.rename({"Total Docs. (%s)" % year: "Total Docs."})
            # Make sure every yearly table has the same optional columns
            # before appending.
            for optional in ("Categories",):
                if optional not in yearly.column_names():
                    yearly[optional] = ''
            combined = combined.append(yearly)

        issn_pattern = re.compile('(\\d{8})')
        combined['Issn'] = combined['Issn'].apply(
            lambda raw: issn_pattern.findall(raw))
        return combined.stack('Issn', new_column_name='ISSN')
 def papers(self):
     """
     Load ``Papers.txt.gz`` from the dataset directory with named columns.

     The file is read as a header-less, tab-separated table. Its
     auto-generated columns ``X1``..``X22`` are renamed to the paper fields
     listed below, and ``Year`` is cast to ``int``.
     """
     column_names = [
         "PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle",
         "BookTitle", "Year", "Date", "Publisher", "JournalId",
         "ConferenceSeriesId", "ConferenceInstanceId", "Volume", "Issue",
         "FirstPage", "LastPage", "ReferenceCount", "CitationCount",
         "EstimatedCitation", "OriginalVenue", "CreatedDate"
     ]
     source = str(self._dataset_dir / "Papers.txt.gz")
     papers_sf = SFrame.read_csv(source, header=False, sep="\t")
     renames = {
         f"X{position}": name
         for position, name in enumerate(column_names, start=1)
     }
     papers_sf = papers_sf.rename(renames)
     papers_sf["Year"] = papers_sf["Year"].astype(int)
     return papers_sf
Esempio n. 18
0
def get_bechdel_movies():
    """Generate movie graphs for the movies in ``bechdel_imdb.csv``.

    Reads ``{DATA_PATH}/bechdel_imdb.csv``, sorts it by ``year`` descending,
    keeps the rows whose ``titleType`` is ``"movie"`` and passes them to
    ``generate_movies_graphs``.
    """
    bechdel = SFrame.read_csv(f"{DATA_PATH}/bechdel_imdb.csv")
    only_movies = bechdel.sort("year", False).filter_by("movie", "titleType")
    generate_movies_graphs(only_movies)
Esempio n. 19
0
# API key placeholder: the real value was redacted, so this line is not valid
# Python until a key string is substituted here.
zwsid = <REDACTED ADD YOUR OWN KEY>


# ``key`` and ``api`` are the module-level values used for the lookups below.
key = zwsid
api = zillow.ValuationApi()

def getSearchResults(key,row):
    """Look up one listing through ``api.GetDeepSearchResults``.

    Args:
        key: API key passed through to the lookup.
        row: Mapping with an ``'ADDRESS'`` string and a ``'ZIP CODE'`` value.

    Returns:
        The result of ``get_dict()`` on the lookup response, or ``None`` when
        the row is malformed or the lookup fails (best-effort per row).
    """
    try:
        address = row['ADDRESS'].strip()
        zipCode = row['ZIP CODE']
        data = api.GetDeepSearchResults(key,address,zipCode)
        return data.get_dict()
    except Exception:
        # Best-effort: a failed lookup yields no data for this row. Catching
        # ``Exception`` rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit stop a long-running batch.
        return None

# Load the listings and attach the lookup result for every row.
sf = SFrame.read_csv('trulia11566.csv', verbose=False)
sf['zillowData'] = sf.apply(lambda listing: getSearchResults(key, listing))

# Flatten the nested result: first the top-level dict, then its zestimate entry.
sf = sf.unpack('zillowData')
sf = sf.unpack('zillowData.zestimate')


sf['ADDRESS',
   'LOCALITY',
   'STATE',
   'ZIP CODE',
   'COUNTY',
   'STREET',
   'TYPE',
   'PRICE',
   'zillowData.zestimate.amount',
   'zillowData.zestimate.valuation_range_high',
Esempio n. 20
0

def row_to_bbox_coordinates(row):
    """
    Convert a corner-based box into centre/size form.

    Reads ``xMin``, ``yMin``, ``xMax`` and ``yMax`` from ``row`` and returns a
    dictionary with the box centre and extent,
    e.g. {'x': 100, 'y': 120, 'width': 80, 'height': 120}
    """
    left, top = row['xMin'], row['yMin']
    width = row['xMax'] - left
    height = row['yMax'] - top
    bbox = {}
    bbox['x'] = left + width / 2
    bbox['y'] = top + height / 2
    bbox['width'] = width
    bbox['height'] = height
    return bbox


# Load the annotations and use the column names required for Create ML.
sf = SFrame.read_csv(args.input).rename({'name': 'label', 'image': 'imagefilename'})

# Convert the corner coordinates into centre + size.
sf['coordinates'] = sf.apply(row_to_bbox_coordinates)

# Drop the columns that are no longer needed.
del sf['xMin']
del sf['xMax']
del sf['yMin']
del sf['yMax']
del sf['id']

# Nest label and coordinates into a single dict column.
sf = sf.pack_columns(['label', 'coordinates'], new_column_name='bbox', dtype=dict)
Esempio n. 21
0
 def title(self):
     """Return the title-basics table, downloading and parsing it on first use.

     The archive is fetched via ``download_file`` into
     ``{OUTPUT_PATH}/title.basics.tsv.gz`` and read as a tab-separated file
     with ``\\N`` treated as missing. The result is cached in ``self._title``.
     """
     if self._title is not None:
         return self._title

     download_file(IMDB_TITLES_URL, f"{OUTPUT_PATH}/title.basics.tsv.gz", False)
     self._title = SFrame.read_csv(
         f"{OUTPUT_PATH}/title.basics.tsv.gz",
         delimiter="\t",
         na_values=["\\N"],
         verbose=self._verbose,
     )
     return self._title