def funnyshortjokes_test():
    """Smoke-test the FunnyShortJokes processing pipeline.

    Runs the first raw joke file through the full pipeline
    (clean -> nouns -> lowercase -> GloVe embeddings), writes the
    processed output, reads it back, and prints each joke with its
    extracted nouns and embeddings.  Only the first raw file is
    processed (``break`` at the end of the loop).
    """
    joke_parser = RawFunnyShortJokesJokeReader()

    # Build the pipeline once; stages run in insertion order.
    pipeline = Pipeline()
    pipeline.add(Clean())
    pipeline.add(AddNouns())
    pipeline.add(Lowercase())
    pipeline.add(AddGloveEmbeddings())

    for file in get_raw_funnyshortjokes_joke_files():
        filename = file.split("/")[-1]
        output = os.path.join(get_project_data_path(),
                              "funnyshortjokes_processed", filename)

        # Generator expression streams jokes through the pipeline lazily.
        writer = ProcessedFunnyShortJokesJokeWriter()
        jokes = joke_parser.read(file)
        jokes = (pipeline.process(joke) for joke in jokes)
        writer.write(jokes, output)

        # Round-trip check: read the processed file back and inspect it.
        reader = ProcessedFunnyShortJokesJokeReader()
        for joke in reader.read(output):
            print(joke)
            print("Nouns:", joke.nouns_)
            print("Embeddings:", joke.embeddings_)

        # Smoke test only: stop after the first file.
        break
def spider_closed(self, reason):
    """Persist every scraped category's jokes to its own JSON file.

    Invoked by Scrapy when the spider shuts down; ``reason`` is
    accepted for the signal signature but not used.
    """
    output_dir = os.path.join(get_project_data_path(), "funnyshortjokes_raw")
    for category, jokes in self.jokes.items():
        filename = os.path.join(
            output_dir, "%s.json" % category.replace(" ", "_"))
        with open(filename, "w", encoding="utf-8") as outfile:
            json.dump(jokes, outfile, indent=4, sort_keys=True)
def _write_jokes_to_file(self, jokes, filename):
    """Serialize scraped Reddit jokes to a JSON file in ``reddit_raw``.

    Args:
        jokes: iterable of ``(id, premise, punchline, subreddit)`` tuples.
        filename: basename of the output file inside the reddit_raw
            data directory.
    """
    # enumerate(start=1) replaces the hand-rolled counter; `reddit_id`
    # is renamed from `id`, which shadowed the builtin.  `joke_id` is a
    # sequential file-local key, distinct from the joke's own id.
    jokes_dict = {
        joke_id: {
            "id": reddit_id,
            "premise": premise,
            "punchline": punchline,
            "subreddit": subreddit,
        }
        for joke_id, (reddit_id, premise, punchline, subreddit)
        in enumerate(jokes, start=1)
    }
    file = os.path.join(get_project_data_path(), "reddit_raw", filename)
    with open(file, "w", encoding="utf-8") as outfile:
        json.dump(jokes_dict, outfile, indent=4, sort_keys=True)
def start_requests(self):
    """Yield a scrapy request for every category not yet scraped.

    A category counts as already scraped when a file named after it
    (spaces as underscores) exists in the funnyshortjokes_raw data
    directory.
    """
    data_directory = os.path.join(get_project_data_path(),
                                  "funnyshortjokes_raw")
    parsed_categories = {
        name.split(".")[0].replace("_", " ")
        for name in os.listdir(data_directory)
    }
    for category in self.categories:
        if category in parsed_categories:
            continue  # skip categories that already have a raw file
        url = self.base_url + "/c/" + category.lower().replace(" ", "-")
        yield scrapy.Request(url=url, callback=self.parse)
def run_joke_scraper():
    """Scrape jokes from every configured subreddit lacking a raw file.

    Subreddits that already have a file in the ``reddit_raw`` data
    directory are skipped; the rest are scraped sequentially.
    """
    base_url = "https://www.reddit.com/r/"
    subreddits = ["Jokes", "DirtyJokes", "cleanjokes", "AntiJokes",
                  "Antihumor", "darkjokes", "MeanJokes", "AntiAntiJokes",
                  "dadjokes", "ProgrammerHumor", "MathJokes", "MommaJokes",
                  "3amjokes", "ShortCleanFunny", "badjokes",
                  "deadbabyjokes", "DarkHumor", "Punny", "pun",
                  "ScienceJokes", "chemistryjokes", "intellectualdadjokes",
                  "ProgrammerDadJokes", "nsfwdadjokes", "dadjokesinhistory",
                  "Hearthstonedadjokes", "dadsouls", "warcraftdadjokes",
                  "dota2dadjokes", "DestinyDadJokes", "FFXIVDadjokes",
                  "Falloutdadjokes", "DMDadJokes", "skyrimdadjokes",
                  "OverwatchDadjokes", "DarkDadJokes", "CivDadJokes",
                  "TrahearneJokes", "StarWarsDadJokes", "eu4dadjokes",
                  "shubreddit", "momjokes"]

    raw_dir = os.path.join(get_project_data_path(), "reddit_raw")
    subreddits_scraped = {name.split(".")[0] for name in os.listdir(raw_dir)}
    start_urls = [
        base_url + subreddit
        for subreddit in subreddits
        if subreddit not in subreddits_scraped
    ]

    scraper = RedditJokeScraper(subreddits, 1000)
    # n_jobs=1 keeps scraping sequential; Parallel is used for its
    # uniform dispatch API rather than for actual parallelism.
    jokes = Parallel(n_jobs=1)(
        delayed(scraper.scrape)(start_url) for start_url in start_urls
    )
def main():
    """Scrape every kickasshumor category that has no raw file yet.

    Compares the known category-id map against files already present in
    the kickasshumor_raw data directory, then scrapes each missing
    category and prints how many jokes it yielded.
    """
    id_to_category = {
        1: "funny_chuck_norris_jokes",
        2: "funny_yo_momma_jokes",
        3: "funny_blonde_jokes",
        4: "funny_one_liner_jokes",
        5: "funny_short_jokes",
        6: "funny_long_jokes",
        7: "funny_redneck_jokes",
        9: "funny_dirty_jokes",
        10: "funny_racial_jokes",
        12: "funny_comebacks",
        14: "funny_pick_up_lines",
        15: "funny_celebrity_jokes",
        16: "funny_anti_humor_jokes",
        17: "funny_animal_jokes",
        18: "funny_puns",
    }

    scraped_categories = {
        name.split(".")[0]
        for name in os.listdir(
            os.path.join(get_project_data_path(), "kickasshumor_raw"))
    }

    base_url = "https://www.kickasshumor.com/c/%i"
    for category_id, category in id_to_category.items():
        if category in scraped_categories:
            continue  # already on disk — skip
        scraper = KickassHumorJokeScraper()
        jokes = scraper.scrape(base_url % category_id)
        print(len(jokes))
def get_raw_reddit_joke_files():
    """Yield the paths of all raw Reddit joke files.

    Delegates to ``_get_joke_files`` on the ``reddit_raw`` data
    directory; ``yield from`` replaces the manual re-yield loop.
    """
    directory = os.path.join(get_project_data_path(), "reddit_raw")
    yield from _get_joke_files(directory)
def get_raw_kickasshumor_joke_files():
    """Yield the paths of all raw KickassHumor joke files.

    Delegates to ``_get_joke_files`` on the ``kickasshumor_raw`` data
    directory; ``yield from`` replaces the manual re-yield loop.
    """
    directory = os.path.join(get_project_data_path(), "kickasshumor_raw")
    yield from _get_joke_files(directory)
def get_raw_funnyshortjokes_joke_files():
    """Yield the paths of all raw FunnyShortJokes joke files.

    Delegates to ``_get_joke_files`` on the ``funnyshortjokes_raw`` data
    directory; ``yield from`` replaces the manual re-yield loop.
    """
    directory = os.path.join(get_project_data_path(), "funnyshortjokes_raw")
    yield from _get_joke_files(directory)
def _write_jokes_to_file(self):
    """Dump this scraper's collected jokes to a category-named JSON file.

    The file lands in the ``kickasshumor_raw`` data directory, named
    after ``self.category`` with dashes converted to underscores.
    """
    basename = "%s.json" % self.category.replace("-", "_")
    path = os.path.join(get_project_data_path(), "kickasshumor_raw", basename)
    with open(path, "w", encoding="utf-8") as outfile:
        json.dump(self.jokes, outfile, indent=4, sort_keys=True)