import concurrent.futures
import os

import numpy as np
import pandas as pd

# SearchGenerator and Cleaner are defined elsewhere in this project.


class ThreadScraper:
    def __init__(self, url):
        # One list per output column; get_submissions appends a row per accepted post.
        self.res_dict = {
            'Title': [],
            'Content': [],
            'Title + Content': [],
            'URL': [],
            'ID': []
        }
        self.url = url
        self.sg = SearchGenerator(self.url)
        # extract_keywords() returns (keyword, score) pairs; keep them as an array so
        # ThreadPoolExecutor.map can fan out one search per keyword.
        self.search_terms = np.asarray(self.sg.extract_keywords())
        self.df = None  # populated by export_submission()
        self.cleaner = Cleaner()

    def get_submissions(self, term):
        # term is a (keyword, score) pair from YAKE, so term[0] is the search phrase.
        submissions = self.sg.get_reddit().subreddit(
            str(self.sg.get_subreddit())).search(
                term[0], time_filter='year', syntax='plain')
        for sub in submissions:
            title = sub.title
            content = sub.selftext
            url = sub.url
            sub_id = sub.id
            # Keep only text posts: skip image links, posts that embed URLs, very short
            # bodies, and submissions already collected under another search term.
            if (not url.endswith(('.jpg', '.png', '.gif')) and len(content) > 50
                    and 'http' not in content
                    and sub_id not in self.res_dict['ID']):
                self.res_dict['Title'].append(
                    self.cleaner.clean_text(title).split())
                self.res_dict['Content'].append(
                    self.cleaner.clean_text(content).split())
                self.res_dict['Title + Content'].append(
                    self.cleaner.clean_text(title + ' ' + content).split())
                self.res_dict['URL'].append(url)
                self.res_dict['ID'].append(sub_id)

    def export_submission(self):
        # Run one search per keyword in parallel; results accumulate in res_dict.
        with concurrent.futures.ThreadPoolExecutor(8) as executor:
            executor.map(self.get_submissions, self.search_terms)
        df = pd.DataFrame(self.res_dict)
        df.dropna(inplace=True)
        df.reset_index(drop=True, inplace=True)
        self.df = df
        if not os.path.exists('data'):
            os.makedirs('data')
        print("Writing to CSV")
        df.to_csv('data/results.csv')
        print("Done...")
        return df
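
# Usage sketch (not part of the original source): assuming SearchGenerator and Cleaner
# are importable from this project, a thread can be scraped and exported like this:
#
#     scraper = ThreadScraper("https://www.reddit.com/r/<subreddit>/comments/<id>/<slug>/")
#     results_df = scraper.export_submission()   # writes data/results.csv and returns the frame
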
class Content:
    def __init__(self, df, url):
        self.df = df
        self.url = url
        self.cleaner = Cleaner()

    def clean_frame(self):
        # Clean and tokenize each cell of the Title and Content columns
        # (assumes the columns contain raw text, not pre-tokenized lists).
        self.df = self.df[['Title', 'Content']].apply(
            lambda col: col.map(
                lambda text: self.cleaner.clean_text(text).split()))
    def get_cleantext(self, text):
        cleaner = Cleaner()
        cleaned = cleaner.clean_text(text)
        return cleaned
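
# Usage sketch (not from the original source; assumes the 'Title' and 'Content'
# columns hold raw text):
#
#     content = Content(results_df, url)
#     content.clean_frame()
#     cleaned = content.get_cleantext("some raw post text")
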
Example 4
import json
import praw
import yake


def create_reddit_object(credentials_path='credentials.json', json_key='reddit'):
    # NOTE: header reconstructed; the original is truncated, and the default path/key are placeholders.
    with open(credentials_path) as f:
        data = json.load(f)
        user_values = data[json_key]
        reddit = praw.Reddit(client_id=user_values['client_id'],
                             client_secret=user_values['client_secret'],
                             user_agent=user_values['user_agent'],
                             username=user_values['username'],
                             password=user_values['password'])
        return reddit


reddit = create_reddit_object()

url = "https://www.reddit.com/r/uwaterloo/comments/h9874q/is_it_really_a_sunday_unless_you_waste_the_day/"
submission = reddit.submission(url=url)

# Collect the submission body plus every comment into one block of text.
thread_text = submission.selftext
submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
    thread_text += ' ' + comment.body

cleaner = Cleaner()
thread_text = cleaner.clean_text(thread_text)

# Extract two-word key phrases from the cleaned body text with YAKE.
simple_kwextractor = yake.KeywordExtractor(n=2)
keywords = simple_kwextractor.extract_keywords(thread_text)
# for kw in keywords:
#     print(kw)

# Re-run the extraction with the submission title prepended to the body text.
simple_kwextractor = yake.KeywordExtractor(n=2)
keywords = simple_kwextractor.extract_keywords(cleaner.clean_text(submission.title + ' ' + thread_text))
for kw in keywords:
    print(kw)
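
# Sketch (not from the original source): extract_keywords() returns (keyword, score)
# pairs, which is why ThreadScraper.get_submissions searches on term[0]:
#
#     terms = np.asarray(keywords)   # shape (k, 2); column 0 is the phrase
#     for term in terms:
#         print(term[0])             # the phrase passed to subreddit.search()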