Exemple #1
0
class Loader(Thread, Logger, Store):
    """Base class for background data loaders bound to a single subreddit.

    Combines a worker thread (Thread) with project logging (Logger) and
    persistence (Store). Subclasses must override run(); the stop()/running()
    protocol coordinates shutdown via two events.
    """

    def __init__(self, name, root, config, subreddit):
        # initialize each base explicitly (multiple inheritance, no super chain)
        Thread.__init__(self, name=name)
        Logger.__init__(self, name=name, context=f'r/{subreddit}', plain=False)
        Store.__init__(self, name=name, root=root, config=config, subreddit=subreddit)

        # thread events
        self.runevent = Event()   # presumably set/cleared by subclass run() — confirm
        self.stopevent = Event()  # set by stop() to request shutdown

        # time helpers
        # interruptible sleeper; stop() calls self.time.wake() to cut a wait short
        self.time = Sleep(10, immediate=False)

    def running(self):
        # True while the loader's work loop is active
        return self.runevent.is_set()

    def stopped(self):
        # True once shutdown has been requested
        return self.stopevent.is_set()

    def alive(self):
        # thin wrapper over Thread.is_alive()
        return self.is_alive()

    def run(self):
        # subclasses provide the actual work loop
        raise NotImplementedError()

    def stop(self, timeout=None):
        """Request shutdown and wait for the work loop to wind down.

        NOTE(review): Sleep(0.1) appears to block on construction — confirm;
        this is a busy-wait until runevent clears, then a bounded join.
        """
        self.stopevent.set()
        self.time.wake()

        while self.running():
            Sleep(0.1)

        if self.alive():
            self.join(timeout)
Exemple #2
0
    def fetch(self, file_type, ids):
        """Download full reddit metadata for the given fullname ids.

        Parameters:
            file_type: kind of item to fetch; only 'submission' is handled here.
            ids: list of fullnames to resolve via the reddit info endpoint.

        Returns:
            List of row lists ready for the store, or [] when a request fails
            (the error is logged and the caller is expected to retry).
        """
        try:
            data = []

            # chunk id's into batches of size 100
            # (presumably the info endpoint's per-call limit — TODO confirm)
            self.log(f'download {len(ids)} {file_type}s')
            batches = [ids[i:i + 100] for i in range(0, len(ids), 100)]
            for fullnames in tqdm(batches,
                                  desc=self.text() + 'fetching',
                                  unit_scale=100):
                # recorded as the 'retrieved' timestamp for this batch
                now = datetime.now(timezone.utc).timestamp()

                # process submissions
                if file_type == 'submission':

                    # request data
                    submissions = self.reddit.info(fullnames=fullnames)

                    # parse submissions; deleted authors come back as None
                    data += [[
                        str(x.id),
                        str(self.subreddit),
                        str(x.author.name if x.author else '[deleted]'),
                        int(x.created_utc),
                        int(now),
                        int(x.edited),
                        int(x.pinned),
                        int(x.archived),
                        int(x.locked),
                        # removed flag: placeholder body or moderator removal
                        # (fixed: identity comparison with None, was `!= None`)
                        int(x.selftext == '[removed]'
                            or x.removed_by_category is not None),
                        int(x.selftext == '[deleted]'),
                        int(x.is_self),
                        int(x.is_video),
                        int(x.is_original_content),
                        str(x.title),
                        str(x.link_flair_text),
                        float(x.upvote_ratio),
                        int(x.score),
                        int(x.gilded),
                        int(x.total_awards_received),
                        int(x.num_comments),
                        int(x.num_crossposts),
                        str(x.selftext),
                        str(x.thumbnail),
                        str(x.shortlink)
                    ] for x in submissions]

                # wait for next request (client-side rate limiting)
                Sleep(0.35)

            return data

        except Exception as e:
            # best-effort boundary: log, pause, fall through to empty result
            self.log(f'...request error {repr(e)}, retry')
            Sleep(1)

        return []
Exemple #3
0
    def __init__(self, name, root, config, subreddit):
        """Wire up thread, logger and store bases for one subreddit loader."""
        # initialize each base explicitly (multiple inheritance, no super chain)
        Thread.__init__(self, name=name)
        Logger.__init__(self, name=name, context=f'r/{subreddit}', plain=False)
        Store.__init__(self, name=name, root=root, config=config, subreddit=subreddit)

        # thread events
        self.runevent = Event()   # presumably set/cleared by run() — confirm
        self.stopevent = Event()  # set to request shutdown

        # time helpers
        # interruptible sleeper; woken early on stop()
        self.time = Sleep(10, immediate=False)
Exemple #4
0
def fetch(config, subreddit):
    """Build all loaders for one subreddit and run them to completion.

    Parameters:
        config: parsed configuration passed through to each loader.
        subreddit: subreddit name the loaders operate on.

    Raises:
        KeyboardInterrupt: re-raised after stopping all loaders so the
        caller can abort its own loop.
    """
    logger = Logger('main', 'fetch', plain=True)

    loaders = []
    try:
        # NOTE(review): `root` is not defined in this function — presumably a
        # module-level global; verify before refactoring.

        # pushshift
        pushshift = Pushshift(root, config, subreddit)
        loaders.append(pushshift)

        # crawler
        crawler = Crawler(root, config, subreddit)
        loaders.append(crawler)

        # praw
        praw = Praw(root, config, subreddit)
        loaders.append(praw)

        # start loader threads
        background = False  # TODO thread implementation
        for loader in loaders:
            if background:
                loader.start()
            else:
                # foreground mode: run each loader synchronously in turn
                loader.run()

        # wait until abort
        while background:
            Sleep(1)

    except KeyboardInterrupt:
        # stop loaders with a short join timeout, then re-raise the ORIGINAL
        # interrupt (bare raise keeps the traceback; was `raise KeyboardInterrupt()`)
        for loader in loaders:
            loader.stop(1)
        raise
    except Exception as e:
        logger.log(f'...fetch error {repr(e)}')
Exemple #5
0
    def fetch(self, url, file_type, data=None):
        """Recursively crawl listing pages, accumulating submission rows.

        Parameters:
            url: listing page to request.
            file_type: item kind; used to index self.last_run.
            data: accumulator for rows across recursive calls; defaults to a
                fresh list (fixed: was a mutable default argument `data=[]`,
                which leaked rows between top-level calls).

        Returns:
            Rows newer than self.last_run[file_type]; partial/empty on error.
        """
        # fresh accumulator per top-level call (avoid shared mutable default)
        data = [] if data is None else data
        try:
            now = datetime.now(timezone.utc).timestamp()

            # terminate: already paged back past the last-run cursor
            created = [x[3] for x in data]
            if len(created) and created[-1] <= self.last_run[file_type]:
                return [x for x in data if x[3] > self.last_run[file_type]]

            # request data
            response = requests.get(url,
                                    headers={
                                        'User-Agent': Env.USER_AGENT()
                                    }).content
            content = html.fromstring(response)

            # parse submissions (t3_ fullnames only)
            things = content.xpath(
                './/div[contains(@class,"thing") and @data-fullname]')
            data += [[
                x.get('data-fullname').partition('_')[2].strip(),
                self.subreddit,
                x.get('data-author'),
                int(x.get('data-timestamp')) // 1000,
                int(now)
            ] for x in things if x.get('data-fullname').startswith('t3_')]

            # fetched data
            # NOTE(review): created[-1] raises IndexError when no rows exist
            # yet; the broad except below absorbs it as a "request error" —
            # confirm this is the intended abort path.
            created = [x[3] for x in data]
            self.log(
                f'fetched {len(data)} {file_type}s after {datetime.fromtimestamp(created[-1]).strftime("%Y-%m-%d %H:%M:%S")}'
            )

            # wait for next request (client-side rate limiting)
            Sleep(0.35)

            # parse next url and recurse into the next page
            url_next = content.xpath(
                './/a[contains(@rel,"next") and @href]/@href')
            if len(url_next):
                return self.fetch(url_next[0], file_type, data)

        except Exception as e:
            self.log(f'...request error {repr(e)}, retry')
            Sleep(1)

        # keep only rows newer than the last-run cursor
        return [x for x in data if x[3] > self.last_run[file_type]]
Exemple #6
0
    def stop(self, timeout=None):
        """Request shutdown and wait for the work loop to wind down.

        NOTE(review): Sleep(0.1) appears to block on construction — confirm;
        this busy-waits until runevent clears, then joins with `timeout`.
        """
        self.stopevent.set()
        # cut any in-progress interruptible wait short
        self.time.wake()

        while self.running():
            Sleep(0.1)

        if self.alive():
            self.join(timeout)
Exemple #7
0
    def fetch(self, url, file_type):
        """Fetch one pushshift page and advance the per-type run cursor.

        Returns the parsed rows, None when the endpoint reports no more
        data, or [] after a request error (logged, with a short pause).
        """
        try:
            # request data
            response = requests.get(url,
                                    headers={
                                        'User-Agent': Env.USER_AGENT()
                                    })
            result = response.json()

            # nothing left to page through
            if 'data' not in result or not len(result['data']):
                self.log(
                    f'fetched 0 {file_type}s after {datetime.fromtimestamp(self.last_run[file_type]).strftime("%Y-%m-%d %H:%M:%S")}'
                )
                return None

            # build rows, moving the cursor just before each item as we go
            rows = []
            for item in result['data']:
                self.last_run[file_type] = item['created_utc'] - 1

                if file_type == 'submission' and 'selftext' in item:
                    # submission row
                    rows.append([
                        item['id'], self.subreddit, item['author'],
                        item['created_utc'], item['retrieved_on']
                    ])
                elif file_type == 'comment' and 'body' in item:
                    # comment row
                    rows.append([
                        item['parent_id'].partition('_')[2], item['id'],
                        self.subreddit, item['author'], item['created_utc'],
                        item['retrieved_on']
                    ])

            # report what this page yielded
            self.log(
                f'fetched {len(rows)} {file_type}s after {datetime.fromtimestamp(self.last_run[file_type]).strftime("%Y-%m-%d %H:%M:%S")}'
            )
            return rows

        except Exception as e:
            self.log(f'...request error {repr(e)}, retry')
            Sleep(1)

        return []
Exemple #8
0
    def download(self, file_type):
        """Page through the endpoint until exhausted, persisting each page.

        Maintains two cursors in instance state: self.last_run[file_type]
        (walks backwards as pages are fetched) and self.end_run[file_type]
        (the newest timestamp already persisted). Writes data via
        self.write_data and final cursor state via self.write_meta.
        """
        count = 0
        now = int(datetime.now(timezone.utc).timestamp())

        # set last run from now
        # (cursors equal means the previous sweep completed — start fresh)
        if self.last_run[file_type] == self.end_run[file_type]:
            self.last_run[file_type] = now

        self.log(
            f'download {file_type}s before {datetime.fromtimestamp(self.last_run[file_type]).strftime("%Y-%m-%d %H:%M:%S")}'
        )

        # define columns per file type; first column doubles as the index
        columns = {
            'submission':
            ['submission', 'subreddit', 'author', 'created', 'retrieved'],
            'comment': [
                'submission', 'comment', 'subreddit', 'author', 'created',
                'retrieved'
            ]  # TODO fetch comments
        }[file_type]

        while True:

            # fetch data for the current cursor window
            url = self.endpoint.format(file_type, self.subreddit,
                                       str(self.end_run[file_type]),
                                       str(self.last_run[file_type]))
            data = self.fetch(url, file_type)

            # validate data: None signals the endpoint is exhausted
            if data is None:
                if count == 0:
                    self.log(f'exported 0 {file_type}s')
                break

            # build dataframe and sort
            df = pd.DataFrame(data, columns=columns).set_index(file_type)
            df = df.sort_values(by=['created', 'retrieved'])

            # check result
            if not df.empty:
                count += df.shape[0]

                # append data along with cursor state for crash recovery
                self.write_data(file_type,
                                df,
                                overwrite=False,
                                last_run=self.last_run[file_type],
                                end_run=self.end_run[file_type])
                self.log(f'exported {df.shape[0]} {file_type}s')

            # wait for next request (client-side rate limiting)
            Sleep(0.35)

        # set last run and end run from now
        # (end_run only advances when the sweep actually exported rows)
        self.last_run[file_type] = now
        if count > 0:
            self.end_run[file_type] = now

        # update state
        self.write_meta(file_type,
                        last_run=self.last_run[file_type],
                        end_run=self.end_run[file_type])
Exemple #9
0
            config = json.load(f)

        # kaggle client
        kaggle = Kaggle(config=os.path.join('config', 'kaggle.json'))

        # start background tasks
        while not terminated:

            for subreddit in args.subreddits:
                # fetch data
                fetch(config, subreddit)

                # pause requests
                if args.pause:
                    logger.log(f'\n{"-"*45}{"PAUSING":^15}{"-"*45}\n')
                    Sleep(args.pause)
                else:
                    logger.log(f'\n{"-"*105}\n')

                # check termination
                if terminated:
                    break
            else:
                # publish data
                publish(args.publish, kaggle)

    except KeyboardInterrupt as e:
        logger.log(f'...aborted')
    except Exception as e:
        logger.log(f'...error {repr(e)}')
    finally: