Code example #1
import luigi
from datetime import date

# get_current_hour is a project-local helper (a sketch follows this example).
class SocialHarvestTask(luigi.WrapperTask):
    # NOTE: these parameter defaults are evaluated once, at import time,
    # so a long-lived process will keep a stale date/hour.
    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    debug = luigi.BoolParameter(default=False)

    def requires(self):
        yield TelegramMembersToDatabaseTask(date=self.date, hour=self.hour, debug=self.debug)
        yield TwitterMembersToDatabaseTask(date=self.date, hour=self.hour, debug=self.debug)
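
get_current_hour() itself is not shown in the listing. A minimal sketch, assuming it only has to hand DateHourParameter a datetime truncated to the hour:

from datetime import datetime

def get_current_hour():
    # Hypothetical helper: DateHourParameter works at hour granularity,
    # so drop minutes, seconds and microseconds from "now".
    return datetime.now().replace(minute=0, second=0, microsecond=0)
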
Code example #2
def job():
    # DISCORD_WEBTOKEN is the path of a local secrets file holding the webhook URL.
    with open(DISCORD_WEBTOKEN, 'r') as f:
        url = f.read().strip()

    discord_bogdabot = DiscordBot(url=url, events=['SUCCESS', 'FAILURE'])

    date_ = date.today()
    hour_ = get_current_hour()
    debug_ = False
    # notify() posts the listed task events to Discord while the build runs.
    with notify(discord_bogdabot):
        luigi.build([SocialHarvestTask(date=date_, hour=hour_, debug=debug_)], workers=2)
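
DiscordBot and notify are not shown; their interface resembles the luigi_slack package's SlackBot/notify pair. A rough sketch of what notify could look like on top of Luigi's event-handler API, assuming the bot exposes a send() method and an events list:

from contextlib import contextmanager

import luigi

@contextmanager
def notify(bot):
    # Hypothetical sketch: wire the bot into Luigi's event system for the
    # duration of the build. bot.send() and bot.events are assumptions.
    @luigi.Task.event_handler(luigi.Event.SUCCESS)
    def on_success(task):
        if 'SUCCESS' in bot.events:
            bot.send(f'SUCCESS: {task}')

    @luigi.Task.event_handler(luigi.Event.FAILURE)
    def on_failure(task, exception):
        if 'FAILURE' in bot.events:
            bot.send(f'FAILURE: {task}: {exception}')

    yield
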
Code example #3
import csv
from datetime import date
from multiprocessing import Pool, cpu_count
from pathlib import Path

import luigi
import pandas as pd

# base_pipe and parse_member_count are project-local; a sketch of
# parse_member_count follows this example.
class ParseTelegramMemberCountTask(luigi.Task):
    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    # IntParameter, not Parameter: the value is compared against a row index below.
    limit = luigi.IntParameter(default=None)  # DEBUG: REMOVE THIS!!!!

    def requires(self):
        return [base_pipe.CreateDateFolder(date=self.date),
                base_pipe.ParseTelegramJSONtoCSVTask(date=self.date)]

    def output(self):
        path = Path(str(self.input()[0].path)) / f'Telegram_Data_{self.hour}.csv'
        return luigi.LocalTarget(str(path))

    def run(self):
        telegram_links = []

        with self.input()[1].open('r') as f:
            reader = csv.reader(f)
            next(reader)  # skip the header row

            for i, row in enumerate(reader):
                name, link = row[0], row[1]
                telegram_links.append({'name': name, 'link': link})  # IDEA: tuple
                if self.limit is not None and i > self.limit:
                    break

        # The scraping is network-bound, so oversubscribe the process pool.
        max_processes = cpu_count() * 2

        with Pool(max_processes) as p:
            member_records = p.map(parse_member_count, telegram_links)

        if member_records:
            df = pd.DataFrame(member_records)
            df.set_index('name', inplace=True)

            # Aggregate member counts per channel name across all of its links.
            grouped = df.groupby(df.index)['members']
            data = pd.concat([grouped.sum(),
                              grouped.mean(),
                              grouped.median(),
                              grouped.count()], axis=1)
            data.columns = ['sum', 'mean', 'median', 'link_count']
            data.dropna(inplace=True)
            data.to_csv(self.output().path)
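
parse_member_count runs in the worker processes and must return one record per link with the name and members keys that the aggregation above relies on. A hypothetical sketch that scrapes the count from a public t.me preview page; the regex is an assumption about the page markup:

import re

import requests

def parse_member_count(entry):
    # Hypothetical scraper: pull the "N members" figure out of the public
    # preview page. Failures yield NaN, which the task's dropna() discards
    # after aggregation.
    members = float('nan')
    try:
        html = requests.get(entry['link'], timeout=10).text
        match = re.search(r'(\d[\d\s]*) members', html)
        if match:
            members = int(re.sub(r'\s+', '', match.group(1)))
    except requests.RequestException:
        pass
    return {'name': entry['name'], 'members': members}
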
Code example #4
class TwitterMembersToDatabaseTask(luigi.Task):
    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    debug = luigi.BoolParameter(default=False)  # NOTE: SUPER DANGEROUS, WILL SCRUB THE DATABASE

    def requires(self):
        return ParseTwitterMemberCountTask(date=self.date, hour=self.hour)

    def run(self):
        if not Twitter.table_exists():
            create_twitter_table()

        if not self.complete():
            df = pd.read_csv(self.input().path)
            df.set_index('name', inplace=True)
            for name, row in df.iterrows():
                data = {'name': name,
                        'followers': row['followers'],
                        'following': row['following'],
                        'likes': row['likes'],
                        'tweets': row['tweets'],
                        'date': self.hour}
                Twitter.add_member_data(**data)

        # TODO: Twitter LINKS, RAW DATA. Rename the CSV files.

    def complete(self):
        # TODO: Add a task to create the DB/table, or
        # IDEA: catch "no table", create the table, then check the database for completeness.
        if self.debug:
            clean_twitter_table()  # DEBUG: REMOVE
            print('DELETING TABLE FOR DEBUGGING!!!!!!!!!!!!!!!!!')
        try:
            # Complete when the local CSV and the database hold the same number of rows.
            local_ = pd.read_csv(self.input().path)
            dbase_ = Twitter.data_by_date(self.hour)
            print('#' * 25)
            print(len(local_))  # TODO: Logging
            print(len(dbase_))  # TODO: Logging
            # TODO: raise if the data was not written to the DB.
            print(len(local_.index) == len(dbase_.index))
            print('#' * 25)
            return len(local_.index) == len(dbase_.index)

        except (FileNotFoundError, KeyError) as e:
            print(e)  # TODO: Logging
            return False
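
Twitter here is a project-local model: table_exists() suggests a peewee-style class, and complete() takes len(dbase_.index), so data_by_date() must hand back a DataFrame. A sketch under those assumptions (the SQLite path is made up):

import pandas as pd
import peewee as pw

db = pw.SqliteDatabase('social_harvest.db')  # hypothetical location

class Twitter(pw.Model):
    # Hypothetical peewee model matching the columns written in run().
    name = pw.CharField()
    followers = pw.IntegerField()
    following = pw.IntegerField()
    likes = pw.IntegerField()
    tweets = pw.IntegerField()
    date = pw.DateTimeField()

    class Meta:
        database = db

    @classmethod
    def add_member_data(cls, **data):
        cls.create(**data)

    @classmethod
    def data_by_date(cls, hour):
        # Return the hour's rows as a DataFrame so complete() can compare
        # row counts against the local CSV.
        rows = cls.select().where(cls.date == hour).dicts()
        return pd.DataFrame(list(rows))
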
Code example #5
class TelegramMembersToDatabaseTask(luigi.Task):
    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    debug = luigi.BoolParameter(default=False)  # NOTE: SUPER DANGEROUS, WILL SCRUB THE DATABASE

    def requires(self):
        return ParseTelegramMemberCountTask(date=self.date, hour=self.hour)

    def run(self):
        if not Telegram.table_exists():
            create_telegram_table()

        if not self.complete():
            df = pd.read_csv(self.input().path)
            df.set_index('name', inplace=True)
            # Aggregates are rounded to 2 decimals, then truncated to int for storage.
            for name, row in df.round(2).iterrows():
                data = {'name': name,
                        'mean': int(row['mean']),
                        'median': int(row['median']),
                        'sum': int(row['sum']),
                        'count': int(row['link_count']),
                        'date': self.hour}
                Telegram.add_member_data(**data)

            # TODO: TELEGRAM LINKS, RAW DATA. Rename the CSV files.

    def complete(self):
        # TODO: Add a task to create the DB/table, or
        # IDEA: catch "no table", create the table, then check the database for completeness.
        if self.debug:
            clean_telegram_table()  # DEBUG: REMOVE
            print('DELETING TABLE FOR DEBUGGING!!!!!!!!!!!!!!!!!')
        try:
            # Complete when the local CSV and the database hold the same number of rows.
            local_ = pd.read_csv(self.input().path)
            dbase_ = Telegram.data_by_date(self.hour)
            print('#' * 25)
            print(len(local_))  # TODO: Logging
            print(len(dbase_))  # TODO: Logging
            # TODO: raise if the data was not written to the DB.
            print(len(local_.index) == len(dbase_.index))
            print('#' * 25)
            return len(local_.index) == len(dbase_.index)

        except (FileNotFoundError, KeyError) as e:
            print(e)  # TODO: Logging
            return False
Code example #6
# bad_links and twitter are project-local modules; twitter.parse_twitter_count
# returns one record per link (name, followers, following, likes, tweets).
class ParseTwitterMemberCountTask(luigi.Task):
    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    # IntParameter, not Parameter: the value is compared against a row index below.
    limit = luigi.IntParameter(default=None)

    def requires(self):
        return [base_pipe.CreateDateFolder(date=self.date),
                base_pipe.ParseTwitterJSONtoCSVTask(date=self.date)]

    def output(self):
        path = Path(str(self.input()[0].path)) / f'Twitter_Data_{self.hour}.csv'
        return luigi.LocalTarget(str(path))

    def run(self):
        twitter_links = []

        with self.input()[1].open('r') as f:
            reader = csv.reader(f)
            next(reader)  # skip the header row

            for i, row in enumerate(reader):
                name, link = row[0], row[1]
                # Skip links flagged as bad.
                if not bad_links.Twitter.is_link_bad(link):
                    twitter_links.append({'name': name, 'link': link})  # IDEA: tuple
                if self.limit is not None and i > self.limit:
                    break

        # The scraping is network-bound, so oversubscribe the process pool.
        max_processes = cpu_count() * 2
        print(f'parsing {len(twitter_links)} twitter links')

        with Pool(max_processes) as p:
            member_records = p.map(twitter.parse_twitter_count, twitter_links)

        if member_records:
            df = pd.DataFrame(member_records)
            df.set_index('name', inplace=True)
            df.to_csv(self.output().path)
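
During development the limit parameter keeps a run cheap. A local smoke test of just this task might look like:

import luigi

# Parse only the first few links, without the central scheduler.
luigi.build([ParseTwitterMemberCountTask(limit=5)], local_scheduler=True)
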