class ThreadScraper:
    """Scrape Reddit submissions for a set of generated search terms and
    export the cleaned results to ``data/results.csv``.

    Search terms are produced by a project-local ``SearchGenerator`` built
    from *url*; text cleaning is delegated to a project-local ``Cleaner``.
    """

    def __init__(self, url):
        # Local import so this fix needs no change to the (unseen) file header.
        import threading

        # Parallel lists forming the columns of the result DataFrame.
        self.res_dict = {
            'Title': [],
            'Content': [],
            'Title + Content': [],
            'URL': [],
            'ID': [],
        }
        self.url = url
        self.sg = SearchGenerator(self.url)
        # extract_keywords() presumably yields (keyword, score) pairs — the
        # workers index term[0] below. TODO confirm against SearchGenerator.
        self.search_terms = np.asarray(self.sg.extract_keywords())
        self.df = ''  # replaced by the exported DataFrame in export_submission()
        self.cleaner = Cleaner()
        # get_submissions() runs on multiple threads (see export_submission);
        # the dedupe check plus the five appends must be atomic, otherwise
        # rows can interleave and the columns fall out of alignment.
        self._lock = threading.Lock()

    def get_submissions(self, term):
        """Search the subreddit for ``term[0]`` and accumulate matching posts.

        Skips image links, very short posts, posts containing links, and
        submissions already collected (deduped by submission id). Appends
        cleaned, tokenized text into ``self.res_dict`` under a lock.
        """
        submissions = self.sg.get_reddit().subreddit(
            str(self.sg.get_subreddit())).search(term[0],
                                                 time_filter='year',
                                                 syntax='plain')
        for sub in submissions:
            title = sub.title
            content = sub.selftext
            link = sub.url
            sub_id = sub.id  # renamed from `id` — avoid shadowing the builtin
            # Filter: no bare images, require substantive text, no embedded links.
            if link.endswith(('.jpg', '.png', '.gif')):
                continue
            if len(content) <= 50 or 'http' in content:
                continue
            with self._lock:
                if sub_id in self.res_dict['ID']:
                    continue  # already collected via another search term
                self.res_dict['Title'].append(
                    self.cleaner.clean_text(title).split())
                self.res_dict['Content'].append(
                    self.cleaner.clean_text(content).split())
                self.res_dict['Title + Content'].append(
                    self.cleaner.clean_text(title + ' ' + content).split())
                self.res_dict['URL'].append(link)
                self.res_dict['ID'].append(sub_id)

    def export_submission(self):
        """Fan out the searches across threads, then write results to CSV.

        Returns the assembled :class:`pandas.DataFrame` (also stored on
        ``self.df``).
        """
        with concurrent.futures.ThreadPoolExecutor(8) as executor:
            executor.map(self.get_submissions, self.search_terms)
        df = pd.DataFrame(self.res_dict)
        df.dropna(inplace=True)
        # Bug fix: the original called df.reset_index() and discarded the
        # result (a no-op); rebuild a clean 0..n-1 index after dropna.
        df.reset_index(drop=True, inplace=True)
        self.df = df
        if not os.path.exists('data'):
            os.makedirs('data')
        print("Writing to CSV")
        df.to_csv('data/results.csv')
        print("Done...")
        return df
class Content:
    """Hold a scraped DataFrame plus its source URL and clean its text columns."""

    def __init__(self, df, url):
        self.df = df
        # Bug fix: `url` was accepted by the original signature but silently
        # dropped; store it so callers can retrieve the source.
        self.url = url
        self.cleaner = Cleaner()

    def clean_frame(self):
        """Replace ``self.df`` with the cleaned, tokenized Title/Content columns.

        Bug fix: the original passed each whole column ``Series`` to
        ``clean_text`` (which, per its other call sites, expects a single
        string). Apply the cleaner element-wise instead.
        """
        self.df = self.df[['Title', 'Content']].apply(
            lambda col: col.map(
                lambda text: self.cleaner.clean_text(text).split()))
def get_cleantext(self, text):
    """Return *text* run through a fresh ``Cleaner`` instance."""
    return Cleaner().clean_text(text)
# NOTE(review): this chunk arrived with all statements collapsed onto one line;
# conventional formatting is restored below. The opening of the enclosing
# function (presumably `def create_reddit_object():` with an open JSON file `f`
# and a `json_key` in scope — TODO confirm, it lies outside this chunk) is not
# visible, so its trailing statements are shown at function-body indent.

    # Load Reddit API credentials from the already-open JSON handle `f`.
    data = json.load(f)
    user_values = data[json_key]
    # Build an authenticated PRAW client from the stored credentials.
    reddit = praw.Reddit(client_id=user_values['client_id'],
                         client_secret=user_values['client_secret'],
                         user_agent=user_values['user_agent'],
                         username=user_values['username'],
                         password=user_values['password'])
    return reddit


# --- top-level script: scrape one thread and print its keywords ---
reddit = create_reddit_object()
url = "https://www.reddit.com/r/uwaterloo/comments/h9874q/is_it_really_a_sunday_unless_you_waste_the_day/"
submission = reddit.submission(url=url)
sublist = submission.selftext
# Expand every "MoreComments" placeholder so the full comment tree is walked.
submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
    sublist += ' ' + comment.body
cleaner = Cleaner()
sublist = cleaner.clean_text(sublist)
# First extraction pass over the body text alone — results currently unused
# (the print loop below it is commented out).
simple_kwextractor = yake.KeywordExtractor(n=2)
keywords = simple_kwextractor.extract_keywords(sublist)
#for kw in keywords:
#print(kw)
# Second pass over title + body; these are the keywords actually printed.
simple_kwextractor = yake.KeywordExtractor(n=2)
keywords = simple_kwextractor.extract_keywords(cleaner.clean_text(submission.title + ' ' + sublist))
for kw in keywords:
    print(kw)