Beispiel #1
0
    def graph_features(self):
        """
        Lazily load (or build) the movie graph-features SFrame.

        Tries the cached csv under DATA_PATH first; if that fails, rebuilds
        the features from the raw graph-features csv, the female-in-top-10
        roles data and the triangle counts, then caches the result.

        :return: SFrame of per-movie graph features
        """
        # BUG fix: the original tested/assigned `self.graph_features`, which
        # is this accessor itself; use the backing attribute `_graph_features`
        # (initialized to an empty SFrame in __init__).
        if self._graph_features is None or len(self._graph_features) == 0:
            try:
                self._graph_features = SFrame.read_csv(
                    f"{DATA_PATH}/bechdel_features.csv")
            except Exception:
                # Cache miss or unreadable file: rebuild from the raw pieces.
                t = triangles()
                self._graph_features = SFrame.read_csv(
                    "../temp/graph_features.csv")

                self._graph_features = self._graph_features.join(
                    SFrame(get_female_in_top_10_roles()),
                    on={
                        "movie_name": "movie_name",
                        "year": "year"
                    })
                self._graph_features = self._graph_features.join(
                    SFrame(t), on={
                        "movie_name": "movie",
                        "year": "year"
                    })
                # Columns "0"-"3" hold per-type triangle counts; derive the
                # total and each type's share of it.
                self._graph_features["total_tri"] = self._graph_features["0"] + self._graph_features["1"] + \
                                                    self._graph_features["2"] + self._graph_features["3"]
                for i in range(4):
                    self._graph_features[f"{i}%"] = self._graph_features[
                        str(i)] / self._graph_features["total_tri"]

                self._graph_features.save(f"{DATA_PATH}/bechdel_features.csv",
                                          "csv")
        return self._graph_features
 def urls(self):
     """
     Build the paper-URLs SFrame from PaperUrls.txt.gz, one row per paper
     with its URLs collected into a list column.
     """
     columns = ["PaperId", "SourceType", "SourceUrl", "LanguageCode"]
     raw = pd.read_csv(self._dataset_dir / "PaperUrls.txt.gz",
                       sep="\t",
                       names=columns)
     urls_sf = SFrame(raw.replace({pd.NA: None}))
     return urls_sf.groupby("PaperId", {"Urls": agg.CONCAT("SourceUrl")})
Beispiel #3
0
def _get_sframes(features_train, features_test, labels_train, labels_test):
    """
    Join each feature split with its labels and wrap both as SFrames.

    :param features_train: training features DataFrame
    :param features_test: test features DataFrame
    :param labels_train: training labels DataFrame
    :param labels_test: test labels DataFrame
    :return: (train SFrame, test SFrame) tuple
    """
    # Fix: the debug message had no placeholders, so the f-prefix was noise.
    logging.debug("turi._get_sframes()")

    # Index-aligned join of features with their labels.
    train_data: pandas.DataFrame = features_train.join(labels_train)
    test_data: pandas.DataFrame = features_test.join(labels_test)

    train_data_sf = SFrame(data=train_data)
    test_data_sf = SFrame(data=test_data)

    return train_data_sf, test_data_sf
Beispiel #4
0
 def __init__(self):
     """
     Load the Bechdel-test dataset, align it with IMDb titles and set up
     the random-forest classifier used downstream.
     """
     self.bechdel = SFrame.read_csv(f"{DATA_PATH}/bechdel.csv",
                                    column_type_hints={"imdbid": str})
     # BUG fix: SFrame.sort returns a new frame; the original call discarded
     # its result, leaving the data unsorted.
     self.bechdel = self.bechdel.sort("year", False)
     # IMDb title ids ("tconst") are the imdbid with a "tt" prefix.
     self.bechdel["tconst"] = "tt" + self.bechdel["imdbid"]
     self.bechdel_imdb = imdb_data.title.join(self.bechdel)
     self.clf = RandomForestClassifier(n_jobs=-1,
                                       n_estimators=100,
                                       max_depth=5,
                                       random_state=1)
     # Backing store for the lazily-built graph features.
     self._graph_features = SFrame()
    def fields_of_study_papers_ids(self, levels=(1, 2, 3)):
        """
        Collect the PaperIds belonging to each requested field-of-study level
        into a single SFrame.

        :param levels: iterable of field-of-study levels to include
        """
        collected = SFrame()
        for fos_level in tqdm(levels):
            level_sf = self._create_field_of_study_paper_ids(fos_level)
            collected = collected.append(level_sf)
        return collected
 def rating(self):
     """Lazily load the IMDb title-ratings table, joined with title data."""
     if self._rating is None:
         download_file(IMDB_RATING_URL, f"{OUTPUT_PATH}/title.ratings.tsv.gz", False)
         ratings = SFrame.read_csv(f"{OUTPUT_PATH}/title.ratings.tsv.gz",
                                   delimiter="\t",
                                   na_values=["\\N"],
                                   verbose=self._verbose)
         self._rating = ratings.join(self.title)
     return self._rating
Beispiel #7
0
    def data(self):
        """
        Load the AMiner papers as an SFrame from the AMiner/*.txt
        JSON-lines files under the dataset directory.
        """
        aminer_glob = self._dataset_dir.joinpath("AMiner/*.txt")
        return SFrame.read_json(aminer_glob, orient='lines')
Beispiel #8
0
def get_relationship_triangles():
    """
    Build a gender-annotated SFrame of actor relationship triangles.

    Reads the precomputed triangles csv (actor ids in columns "0"-"2",
    movie name and year in "3"/"4"), maps each vertex to a gender, joins
    IMDb title metadata by (title, year), and adds per-vertex is-male flags
    plus a total-men count for each triangle.
    """
    triangles = SFrame.read_csv(f"{OUTPUT_PATH}/triangles.csv",
                                usecols=["0", "1", "2", "3", "4"])
    # One gender per triangle vertex; apply() over rows yields a list column.
    triangles_gender = triangles.apply(lambda x: [
        imdb_data.get_actor_gender(x["0"]),
        imdb_data.get_actor_gender(x["1"]),
        imdb_data.get_actor_gender(x["2"])
    ])
    # unpack() expands the list column into "X.0", "X.1", "X.2".
    triangles_gender = triangles_gender.unpack()
    triangles_gender["movie"] = triangles["3"]
    triangles_gender["year"] = triangles["4"]
    triangles_gender = triangles_gender.dropna()
    # Attach IMDb title metadata (including "genres") by title name and year.
    triangles_gender = triangles_gender.join(imdb_data.title, {
        "movie": "primaryTitle",
        "year": "startYear"
    })

    # Re-purpose columns "1"-"3" as boolean is-male flags per vertex.
    triangles_gender["1"] = triangles_gender["X.0"] == "M"
    triangles_gender["2"] = triangles_gender["X.1"] == "M"
    triangles_gender["3"] = triangles_gender["X.2"] == "M"
    triangles_gender["total_men"] = triangles_gender["1"] + triangles_gender[
        "2"] + triangles_gender["3"]

    # Genres arrive comma-separated; split into a list column.
    triangles_gender["genres"] = triangles_gender["genres"].apply(
        lambda x: x.split(","))

    return triangles_gender
Beispiel #9
0
    def sjr_to_csv(self, regex):
        """
        Merge the SJR csv files matched by *regex* into one SFrame.

        Each file is tagged with the 4-digit year parsed from its name, the
        year-specific "Total Docs." column is normalized, and the Issn field
        is exploded into one row per 8-digit ISSN.
        """
        merged = SFrame()
        for path in self._dataset_dir.glob(regex):
            if path.suffix != ".csv":
                continue
            year = int(re.match(r'.*([1-3][0-9]{3})', path.name).group(1))
            year_sf = SFrame.read_csv(str(path), delimiter=';')
            year_sf['Year'] = year
            year_sf = year_sf.rename({"Total Docs. (%s)" % year: "Total Docs."})
            # Some years lack these columns entirely; pad with empty strings.
            for extra in ["Categories"]:
                if extra not in year_sf.column_names():
                    year_sf[extra] = ''
            merged = merged.append(year_sf)

        issn_pattern = re.compile('(\\d{8})')
        merged['Issn'] = merged['Issn'].apply(lambda i: issn_pattern.findall(i))
        return merged.stack('Issn', new_column_name='ISSN')
Beispiel #10
0
 def __init__(self):
     """Load the persisted image SFrame and similarity model, and
     initialize empty per-query result holders."""
     # Artefacts produced by the training pipeline (fixed relative paths).
     self.imgframe = tc.load_sframe('model/final/final.sframe')
     self.model = tc.load_model('model/final/final_model')
     # Placeholders populated when a query image is processed.
     self.sample = tc.Image()
     self.results = SFrame()
     self.rows = SArray()
     self.pathlist = []
     self.distance_list = []
 def crew(self):
     """
     Lazily load the IMDb crew table, exploded to one row per
     (title, director) pair.
     """
     if self._crew is None:
         download_file(IMDB_CREW_URL, f"{OUTPUT_PATH}/title.crew.tsv.gz", False)
         self._crew = SFrame.read_csv(f"{OUTPUT_PATH}/title.crew.tsv.gz", delimiter="\t", na_values=["\\N"],
                                      verbose=self._verbose)
         # BUG fix: the original read `self.crew["directors"]`, re-entering
         # this accessor; operate on the freshly loaded frame instead.
         self._crew["directors"] = self._crew["directors"].apply(lambda c: c.split(","))
         self._crew = self._crew.stack("directors", "directors")
     return self._crew
    def popular_actors(self):
        """
        Lazily build an SFrame of actors/actresses ranked by the average
        rating of their movies (movies with more than 1000 votes only),
        joined with name data and annotated with gender.

        NOTE(review): reads name.basics.tsv.gz from OUTPUT_PATH without
        downloading it here — presumably all_actors() ran first; confirm
        the call order.
        """
        if self._actors is None:
            download_file(IMDB_PRINCIPALS_URL, f"{OUTPUT_PATH}/title.principals.tsv.gz", False)
            self._actors = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz", delimiter="\t", na_values=["\\N"],
                                           verbose=self._verbose)
            # Keep acting credits only; project down to title/person ids.
            self._actors = self._actors.filter_by(["actor", "actress"], "category")["tconst", "nconst"]

            # Restrict to movies with a meaningful number of votes.
            self._actors = self._actors.join(
                self.rating[(self.rating["titleType"] == "movie") & (self.rating["numVotes"] > 1000)])
            self._actors = self._actors.groupby("nconst", operations={'averageRating': agg.AVG("averageRating"),
                                                                      'count': agg.COUNT()})
            self._actors = self._actors.sort("averageRating", ascending=False)
            names = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz", delimiter="\t")

            self._actors = self._actors.join(names)
            self._actors["gender"] = self._actors.apply(lambda p: self.add_actor_gender(p))

        return self._actors
 def all_actors(self):
     """
     Lazily build an SFrame of every IMDb name credited as actor or
     actress, annotated with gender.
     """
     if self._all_actors is None:
         download_file(IMDB_NAMES_URL, f"{OUTPUT_PATH}/name.basics.tsv.gz", False)
         people = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz", delimiter="\t",
                                  na_values=["\\N"], verbose=self._verbose)
         # Explode the comma-separated professions to one row each, then
         # keep only the acting professions.
         people["primaryProfession"] = people["primaryProfession"].apply(lambda x: x.split(","))
         people = people.stack("primaryProfession", "primaryProfession")
         people = people.filter_by(["actor", "actress"], "primaryProfession")
         people["gender"] = people.apply(lambda p: self.add_actor_gender(p))
         self._all_actors = people
     return self._all_actors
 def paper_fields_of_study(self):
     """
     Build the paper -> field-of-study score SFrame from .txt.gz files.
     """
     column_names = ["PaperId", "FieldOfStudyId", "Score"]
     fos_sf = SFrame.read_csv("~/mag/PaperFieldsOfStudy.txt.gz",
                              header=False,
                              sep="\t")
     # Headerless read yields X1..Xn; map them onto the known schema.
     renaming = {f"X{i + 1}": name for i, name in enumerate(column_names)}
     return fos_sf.rename(renaming)
 def actors_movies(self):
     """
     Lazily build the actor-to-movie credits SFrame (movies only),
     joined with the full actors table.
     """
     if self._actors_movies is None:
         download_file(IMDB_PRINCIPALS_URL, f"{OUTPUT_PATH}/title.principals.tsv.gz", False)
         credits = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz", delimiter="\t",
                                   na_values=["\\N"], verbose=self._verbose)
         credits = credits.filter_by(["actor", "actress"], "category")[
             "tconst", "nconst", "characters"]
         movies_only = self.title[self.title["titleType"] == "movie"]
         credits = credits.join(movies_only)
         self._actors_movies = credits.join(self.all_actors)
     return self._actors_movies
 def references(self):
     """Build the paper-references SFrame from PaperReferences.txt.gz."""
     path = str(self._dataset_dir / "PaperReferences.txt.gz")
     refs = SFrame.read_csv(path, header=False, delimiter="\t")
     # Headerless read yields X1/X2; map onto the known schema.
     return refs.rename({"X1": "PaperId", "X2": "PaperReferenceId"})
 def papers_fields_of_study(self):
     """
     Build the paper -> field-of-study score SFrame from
     PapersFieldsOfStudy.txt.gz.
     """
     fos = SFrame.read_csv(str(self._dataset_dir /
                               "PapersFieldsOfStudy.txt.gz"),
                           header=False,
                           delimiter="\t")
     # BUG fix: the original returned `references`, an undefined name in
     # this scope (copy-paste from the references() method); rename and
     # return the frame that was actually read.
     return fos.rename({
         "X1": "PaperId",
         "X2": "FieldOfStudyId",
         "X3": "Score"
     })
 def field_of_study_children(self):
     """
     Build the parent-to-child field-of-study hierarchy SFrame from
     FieldOfStudyChildren.txt.gz.
     """
     path = str(self._dataset_dir / "FieldOfStudyChildren.txt.gz")
     hierarchy = SFrame.read_csv(path, header=False, delimiter="\t")
     return hierarchy.rename({"X1": "FieldOfStudyId",
                              "X2": "ChildFieldOfStudyId"})
 def fields_of_study(self):
     """
     Build the fields-of-study SFrame from FieldsOfStudy.txt.gz.
     """
     column_names = [
         "FieldOfStudyId", "Rank", "NormalizedName", "DisplayName",
         "MainType", "Level", "PaperCount", "CitationCount", "CreatedDate"
     ]
     raw = pd.read_csv(self._dataset_dir / "FieldsOfStudy.txt.gz",
                       sep="\t",
                       names=column_names)
     # Normalize pandas missing values to None before wrapping.
     return SFrame(raw.replace({pd.NA: None}))
 def paper_resources(self):
     """
     Build the paper-resources SFrame from PaperResources.txt.gz.

     ResourceType codes: 1 = Project, 2 = Data, 4 = Code.
     """
     column_names = [
         "PaperId", "ResourceType", "ResourceUrl", "SourceUrl",
         "RelationshipType"
     ]
     raw = pd.read_csv(self._dataset_dir / "PaperResources.txt.gz",
                       sep="\t",
                       names=column_names)
     return SFrame(raw.replace({pd.NA: None}))
Beispiel #21
0
def generate_blacklist_roles():
    """
    Build and export a blacklist of character "roles" that are likely generic
    labels (e.g. dictionary words) rather than personal names.

    Filters IMDb principal credits down to movie acting roles, drops roles
    whose last word looks like a first/last name, keeps low-billing
    (ordering > 3) frequent roles, and writes the result to
    blacklist_roles.csv.
    """
    # Reference name lists used to exclude real personal names.
    firstnames = SFrame.read_csv(f"{DATA_PATH}/firstnames.csv",
                                 verbose=False)["Name"]
    surenames = SFrame.read_csv(f"{DATA_PATH}/surenames.csv",
                                verbose=False)["name"]
    surenames = surenames.apply(lambda n: n.title())
    sf = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz",
                         delimiter="\t",
                         column_type_hints={"characters": list},
                         na_values=["\\N"])
    sf = sf.filter_by(["actor", "actress"], "category")["tconst", "ordering",
                                                        "characters", "nconst"]
    sf = sf.join(imdb_data.title[imdb_data.title["titleType"] == "movie"])
    # One row per character; normalize casing for matching.
    sf = sf.stack("characters", "character")
    sf["character"] = sf["character"].apply(lambda c: c.title())
    sf.export_csv(f"{TEMP_PATH}/roles3.csv")

    # Characters played more than once by the same actor are presumed to be
    # recurring named roles; exclude them from the blacklist candidates.
    whitelist = sf.groupby(key_column_names=['character', "nconst"],
                           operations={'count': agg.COUNT()})
    whitelist = whitelist[whitelist["count"] > 1]['character']
    sf = sf.filter_by(whitelist, "character", True)
    sf = sf.groupby(key_column_names=['character'],
                    operations={
                        'ordering': agg.AVG("ordering"),
                        'count': agg.COUNT()
                    })
    # Last word of the character string, used against the name lists.
    sf["name"] = sf["character"].apply(lambda c: c.split(" ")[-1].strip())
    sf = sf.filter_by(names.words(), "name", exclude=True)
    sf = sf.filter_by(surenames, "name", exclude=True)
    sf = sf.filter_by(firstnames, "name", exclude=True)
    sf = sf.sort("count", False)
    # Keep only low-billed roles (average credit position > 3).
    sf = sf[sf['ordering'] > 3]
    # Title-cased wordnet vocabulary minus personal names.
    w = {x.replace("_", " ").title()
         for x in wordnet.words()} - set(names.words())
    sf["set"] = sf["character"].apply(lambda x: x.split(" "))
    sf["set"] = sf["set"].apply(lambda x: w & set(x))
    # Very frequent roles pass unconditionally; mid-frequency roles only if
    # they contain a dictionary word. NOTE(review): counts of exactly 10 or
    # 11 fall through both branches — confirm this gap is intentional.
    sf = sf[sf['count'] > 11].append(sf[(sf['count'] > 1) & (sf['count'] < 10)
                                        & (sf["set"] != [])])
    sf[["character"]].export_csv(f"{OUTPUT_PATH}/blacklist_roles.csv")
def createFrame(file):
    """
    Read one space-delimited label file into an SFrame of bounding boxes.

    Relies on the module-level ``args.labels`` directory; the image column
    is derived from the file name without its extension.
    """
    column_map = {'X1': 'name',
                  'X2': 'xMin',
                  'X3': 'yMin',
                  'X4': 'xMax',
                  'X5': 'yMax'}
    box_frame = SFrame.read_csv(args.labels + '/' + file,
                                delimiter=' ',
                                header=False)
    box_frame = box_frame.rename(column_map)
    box_frame['image'] = os.path.splitext(file)[0]
    return box_frame
 def journals(self):
     """
     Build the journals SFrame from Journals.txt.gz, one row per journal.
     """
     column_names = [
         "JournalId", "Rank", "NormalizedName", "DisplayName", "Issn",
         "Publisher", "Webpage", "PaperCount", "CitationCount",
         "CreatedDate"
     ]
     raw = pd.read_csv(self._dataset_dir / "Journals.txt.gz",
                       sep="\t",
                       names=column_names)
     return SFrame(raw.replace({pd.NA: None}))
    def paper_author_affiliations(self):
        """
        Build the paper/author/affiliation link SFrame from
        PaperAuthorAffiliations.txt.gz.

        :return: SFrame with one row per (paper, author, affiliation)
        """
        column_names = [
            "PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber",
            "OriginalAuthor", "OriginalAffiliation"
        ]
        raw = pd.read_csv(self._dataset_dir / "PaperAuthorAffiliations.txt.gz",
                          sep="\t",
                          names=column_names)
        return SFrame(raw.replace({pd.NA: None}))
    def get_user_preferences(user_id, self):
        """
        Train a cosine item-similarity recommender on the merged
        preference/combination data and return the top-10 combination
        recommendations for *user_id* as a numpy array.

        NOTE(review): the (user_id, self) parameter order is unusual for a
        method — calls via an instance would bind the instance to user_id.
        Kept as-is to preserve the call interface; confirm with callers.
        """
        pref_df = self.get_data_frame('pref')
        comb_df = self.get_data_frame('comb')

        ratings_frame = SFrame(pd.merge(pref_df, comb_df, on='combination_id'))

        item_sim_model = item_similarity_recommender.create(
            ratings_frame,
            user_id='user_id',
            item_id='combination_id',
            target='rating',
            similarity_type='cosine')

        # BUG fix: `[].append(user_id)` returns None (list.append mutates in
        # place), so no user was ever passed to recommend().
        return item_sim_model.recommend(users=[user_id],
                                        k=10).to_dataframe().values
Beispiel #26
0
def create_and_save_model(data: turicreate.SFrame):
    """
    Train a one-shot object detector on *data*, report its accuracy, and
    export both the native model and a CoreML model to the current directory.

    :param data: The SFrame that was created using the training data
    """
    train_data, test_data = data.random_split(TRAIN_TEST_SPLIT)
    detector = turicreate.one_shot_object_detector.create(train_data,
                                                          target=CARD_NAME_LABEL,
                                                          batch_size=32)
    # Smoke-test inference before evaluating.
    _ = detector.predict(test_data)
    print("Ran model.predict")
    evaluation = detector.evaluate(test_data)
    print(evaluation[ACCURACY_LABEL])
    detector.save(MODEL_NAME)
    detector.export_coreml(COREML_MODEL_NAME)
    def affiliations(self):
        """
        Build the affiliations SFrame from Affiliations.txt.gz, one row per
        affiliation.
        """
        column_names = [
            "AffiliationId", "Rank", "NormalizedName", "DisplayName", "GridId",
            "OfficialPage", "WikiPage", "PaperCount", "CitationCount",
            "CreatedDate"
        ]
        raw = pd.read_csv(self._dataset_dir / "Affiliations.txt.gz",
                          sep="\t",
                          names=column_names)
        return SFrame(raw.replace({pd.NA: None}))
    def get_directors_data(self):
        """
        Rank movie directors by the average rating of their movies,
        considering only titles with more than 10000 votes and directors
        with more than 5 such movies.
        """
        popular = self.rating[self.rating["numVotes"] > 10000]
        directors_sf = self.crew.join(popular)

        movies = self.title[self.title["titleType"] == "movie"]
        directors_sf = directors_sf.join(movies)
        directors_sf = directors_sf.groupby(
            key_column_names='directors',
            operations={'averageRating': agg.AVG("averageRating"),
                        'count': agg.COUNT()})

        directors_sf = directors_sf[directors_sf["count"] > 5]

        names = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz", delimiter="\t")
        directors_sf = directors_sf.join(names, {"directors": "nconst"})
        return directors_sf.sort("averageRating", ascending=False)
 def authors(self):
     """
     Build the authors SFrame from Authors.txt.gz, adding first- and
     last-name columns derived from the normalized name.
     """
     column_names = [
         "AuthorId", "Rank", "NormalizedName", "DisplayName",
         "LastKnownAffiliationId", "PaperCount", "CitationCount",
         "CreatedDate"
     ]
     raw = pd.read_csv(self._dataset_dir / "Authors.txt.gz",
                       sep="\t",
                       names=column_names)
     authors = SFrame(raw.replace({pd.NA: None}))
     # First/last whitespace-separated token of the normalized name.
     authors['First name'] = authors['NormalizedName'].apply(
         lambda s: s.split()[0])
     authors['Last name'] = authors['NormalizedName'].apply(
         lambda s: s.split()[-1])
     return authors
 def papers(self):
     """
     Build the papers SFrame from Papers.txt.gz, one row per paper, with
     the Year column cast to int.
     """
     column_names = [
         "PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle",
         "BookTitle", "Year", "Date", "Publisher", "JournalId",
         "ConferenceSeriesId", "ConferenceInstanceId", "Volume", "Issue",
         "FirstPage", "LastPage", "ReferenceCount", "CitationCount",
         "EstimatedCitation", "OriginalVenue", "CreatedDate"
     ]
     papers_sf = SFrame.read_csv(str(self._dataset_dir / "Papers.txt.gz"),
                                 header=False,
                                 sep="\t")
     # Headerless read yields X1..Xn; map them onto the known schema.
     papers_sf = papers_sf.rename(
         {f"X{i + 1}": name for i, name in enumerate(column_names)})
     papers_sf["Year"] = papers_sf["Year"].astype(int)
     return papers_sf