def read_movie_features(
    titles=False,
    genres=False,
    genome_tag_threshold=1.0,
    tag_popularity_threshold=30,
):
    """Build the movie feature set from ``movies.dat`` and the genome tags.

    Every line of ``movies.dat`` is split on ``SEPARATOR`` into an item id, a
    title and a ``|``-separated genre list. The item is registered, optional
    features are attached, and the title is recorded via ``add_title``.
    Genome tags from ``read_genome_tags()`` are attached afterwards.

    Args:
        titles: If true, add a ``title:<lower-cased title>`` feature per movie.
        genres: If true, add one ``genre:<lower-cased genre>`` feature for
            each genre listed for a movie.
        genome_tag_threshold: Minimum relevance (inclusive) a genome tag needs
            in order to be added as a ``genome:<lower-cased tag>`` feature.
        tag_popularity_threshold: Currently has no effect; it is referenced
            only by the disabled user-tag loading near the end of the body.

    Returns:
        The populated ``Features`` object, after ``set_shape()`` has been
        called on it.
    """
    # NOTE(review): this name is defined a second time right after this
    # function in the file; the later definition is the one left bound.
    movie_features = Features()
    movies_path = os.path.join(DATA_DIR, "movies.dat")

    with open(movies_path, "r") as handle:
        for record in handle:
            movie_id, movie_title, genre_field = record.split(SEPARATOR)
            movie_genres = genre_field.split("|")

            movie_features.add_item(movie_id)

            if genres:
                for raw_genre in movie_genres:
                    # The genre field is the last one on the line, so its
                    # final entry still carries the trailing newline.
                    genre_name = raw_genre.lower().replace("\n", "")
                    movie_features.add_feature(movie_id, "genre:" + genre_name)

            if titles:
                lowered_title = movie_title.lower()
                movie_features.add_feature(movie_id, "title:" + lowered_title)

            # NOTE(review): the source formatting was collapsed; add_title is
            # assumed to run for every movie, independent of `titles` - confirm.
            movie_features.add_title(movie_id, movie_title)

    for movie_id, tag, relevance in read_genome_tags():
        relevant_enough = relevance >= genome_tag_threshold
        # Do not include any tags for movies not in the 10M dataset
        if relevant_enough and movie_id in movie_features.item_ids:
            movie_features.add_feature(movie_id, "genome:" + tag.lower())

    # Tags applied by users are disabled; this is the only place that would
    # consult tag_popularity_threshold:
    ## for iid, tag, count in read_tags():
    ##     if count >= tag_popularity_threshold and iid in features.item_ids:
    ##         features.add_feature(iid, 'tag:' + tag)

    movie_features.set_shape()
    return movie_features
def read_movie_features(
    titles=False,
    genres=False,
    genome_tag_threshold=1.0,
    tag_popularity_threshold=30,
):
    """Collect per-movie features from ``movies.dat`` plus genome tags.

    Each line of ``movies.dat`` is split on ``SEPARATOR`` into an item id, a
    title and a ``|``-separated genre list. The item is added, the optional
    genre/title features are attached, and the title is stored with
    ``add_title``. Genome tags yielded by ``read_genome_tags()`` are added in
    a second pass.

    Args:
        titles: When true, each movie gets a ``title:<lower-cased title>``
            feature.
        genres: When true, each movie gets a ``genre:<lower-cased genre>``
            feature for every genre on its line.
        genome_tag_threshold: A genome tag is added (as
            ``genome:<lower-cased tag>``) only when its relevance is greater
            than or equal to this value.
        tag_popularity_threshold: Not used by the active code; only the
            commented-out user-tag loop refers to it.

    Returns:
        The ``Features`` object built here, returned after its
        ``set_shape()`` method has been invoked.
    """
    # NOTE(review): an earlier definition with the same name precedes this
    # one in the file; this later definition is the one that stays bound.
    catalogue = Features()

    with open(os.path.join(DATA_DIR, 'movies.dat'), 'r') as source:
        for row in source:
            (item_id, name, genre_column) = row.split(SEPARATOR)
            genre_names = genre_column.split('|')

            catalogue.add_item(item_id)

            if genres:
                for entry in genre_names:
                    # Only the newline is removed (the last genre on a line
                    # ends with it); other whitespace is left untouched.
                    cleaned = entry.lower().replace('\n', '')
                    catalogue.add_feature(item_id, 'genre:' + cleaned)

            if titles:
                catalogue.add_feature(item_id, 'title:' + name.lower())

            # NOTE(review): indentation was lost in the source; storing the
            # title is assumed to be unconditional - confirm.
            catalogue.add_title(item_id, name)

    for item_id, tag, relevance in read_genome_tags():
        if relevance >= genome_tag_threshold:
            # Do not include any tags for movies not in the 10M dataset
            if item_id in catalogue.item_ids:
                catalogue.add_feature(item_id, 'genome:' + tag.lower())

    # Tags applied by users (disabled; sole consumer of
    # tag_popularity_threshold):
    ## for iid, tag, count in read_tags():
    ##     if count >= tag_popularity_threshold and iid in features.item_ids:
    ##         features.add_feature(iid, 'tag:' + tag)

    catalogue.set_shape()

    return catalogue
def read_post_features(tags, post_ids, post_text):
    """Build the feature set for the question posts from ``read_post_data()``.

    Rows whose ``parent_post_id`` is not ``None`` are treated as answers and
    contribute nothing. Every remaining post is added as an item and then
    receives whichever optional features are switched on.

    Args:
        tags: If true, add a ``tag:<tag>`` feature for each element of the
            post's tags.
        post_ids: If true, add a ``post_id:<post_id>`` feature for the post.
        post_text: If true, add a ``body:<token>`` feature for each element
            of the post's body.

    Returns:
        The populated ``Features`` object, after ``set_shape()`` has been
        called on it.
    """
    post_features = Features()

    for post_id, _user_id, post_tags, parent_post_id, body in read_post_data():
        # Only get features for questions, not answers.
        # NOTE(review): indentation was lost in the source; all feature
        # additions below are assumed to apply to questions only - confirm.
        if parent_post_id is not None:
            continue

        post_features.add_item(post_id)

        if post_ids:
            post_features.add_feature(post_id, 'post_id:' + post_id)

        if tags:
            for tag in post_tags:
                post_features.add_feature(post_id, 'tag:' + tag)

        if post_text:
            for token in body:
                post_features.add_feature(post_id, 'body:' + token)

    post_features.set_shape()
    return post_features