class SocialHarvestTask(luigi.WrapperTask):
    """Wrapper task that fans out to the Telegram and Twitter database loads
    for one (date, hour) slot."""

    # NOTE(review): date.today()/get_current_hour() are evaluated once at import
    # time, not per run — fine for cron-style invocation, stale in a long-lived
    # scheduler process. TODO confirm intended usage.
    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    debug = luigi.BoolParameter(default=False)

    def requires(self):
        # Both children take the identical parameter set.
        common = dict(date=self.date, hour=self.hour, debug=self.debug)
        yield TelegramMembersToDatabaseTask(**common)
        yield TwitterMembersToDatabaseTask(**common)
def job():
    """Entry point: run the hourly social-harvest pipeline for the current
    date/hour, reporting SUCCESS/FAILURE to a Discord webhook."""
    with open(DISCORD_WEBTOKEN, 'r') as f:
        url = f.read().strip()
    discord_bogdabot = DiscordBot(url=url, events=['SUCCESS', 'FAILURE'])
    with notify(discord_bogdabot):
        luigi.build(
            [SocialHarvestTask(date=date.today(),
                               hour=get_current_hour(),
                               debug=False)],
            workers=2,
        )
class ParseTelegramMemberCountTask(luigi.Task):
    """Scrape the member count of every Telegram link in the day's CSV and
    write per-channel sum/mean/median/link_count aggregates to an hourly CSV.
    """

    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    # Debug-only cap on how many links to process; None in production.
    limit = luigi.Parameter(default=None)  # DEBUG: REMOVE THIS!!!!

    def requires(self):
        # [0] date folder target, [1] the parsed name/link CSV.
        return [base_pipe.CreateDateFolder(date=self.date),
                base_pipe.ParseTelegramJSONtoCSVTask(date=self.date)]

    def output(self):
        path = Path(str(self.input()[0].path)) / f'Telegram_Data_{self.hour}.csv'
        return luigi.LocalTarget(str(path))

    def run(self):
        # FIX: luigi.Parameter values arrive as strings from the CLI; cast once
        # so the comparison below cannot raise TypeError (int vs str).
        limit = int(self.limit) if self.limit else None

        telegram_links = []
        with self.input()[1].open('r') as f:
            reader = csv.reader(f)
            next(reader)  # skip header row
            for i, row in enumerate(reader):
                telegram_links.append({'name': row[0], 'link': row[1]})  # IDEA: tuple
                if limit and i > limit:
                    break

        max_processes = cpu_count() * 2
        with Pool(max_processes) as p:
            member_records = p.map(parse_member_count, telegram_links)

        if member_records:
            df = pd.DataFrame(member_records)
            df.set_index('name', inplace=True)
            # FIX: one groupby on the 'members' column, reused for every
            # aggregate (was four passes; the bare .count() also counted every
            # column, which breaks the 4-name column assignment if the records
            # carry any extra column).
            grouped = df.groupby(df.index)['members']
            data = pd.concat(
                [grouped.sum(), grouped.mean(), grouped.median(), grouped.count()],
                axis=1,
            )
            data.columns = ['sum', 'mean', 'median', 'link_count']
            data.dropna(inplace=True)
            data.to_csv(self.output().path)
class TwitterMembersToDatabaseTask(luigi.Task):
    """Load the hourly Twitter member-count CSV into the Twitter table."""

    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    debug = luigi.BoolParameter(default=False)  # NOTE: SUPER DANGEROUS WILL SCRUB DATABASE

    def requires(self):
        return ParseTwitterMemberCountTask(date=self.date, hour=self.hour)

    def run(self):
        if not Twitter.table_exists():
            create_twitter_table()
        if self.complete():
            return
        df = pd.read_csv(self.input().path)
        df.set_index('name', inplace=True)
        for name, row in df.iterrows():
            Twitter.add_member_data(
                name=name,
                followers=row['followers'],
                following=row['following'],
                likes=row['likes'],
                tweets=row['tweets'],
                date=self.hour,  # keyed by the hour slot, not the calendar date
            )
        # TODO: Twitter LINKS, RAW DATA. rename csv files

    def complete(self):
        # TODO: Add task to create a DB/Table or
        # IDEA: Add an except for no table - create table then check databsse for complete
        if self.debug:
            clean_twitter_table()  # DEBUG: REMOVE
            print('DELETING TABLE FOR DEBUGGING!!!!!!!!!!!!!!!!!')
        try:
            local_ = pd.read_csv(self.input().path)
            dbase_ = Twitter.data_by_date(self.hour)
        except (FileNotFoundError, KeyError):
            print()
            return False
        matches = len(local_.index) == len(dbase_.index)
        print('#' * 25)
        print(len(local_))  # TODO: Logging
        print(len(dbase_))  # TODO: Logging
        # TODO: If else raise data not written to db
        print(matches)  # TODO: If else raise data not written to db
        print('#' * 25)
        return matches
class TelegramMembersToDatabaseTask(luigi.Task):
    """Load the hourly Telegram aggregate CSV into the Telegram table."""

    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    debug = luigi.BoolParameter(default=False)  # NOTE: SUPER DANGEROUS WILL SCRUB DATABASE

    def requires(self):
        return ParseTelegramMemberCountTask(date=self.date, hour=self.hour)

    def run(self):
        if not Telegram.table_exists():
            create_telegram_table()
        if not self.complete():
            df = pd.read_csv(self.input().path)
            df.set_index('name', inplace=True)
            # round(2) before the int() casts mirrors the original load path.
            for name, row in df.round(2).iterrows():
                data = {'name': name,
                        'mean': int(row['mean']),
                        'median': int(row['median']),
                        'sum': int(row['sum']),
                        'count': int(row['link_count']),
                        'date': self.hour}  # keyed by the hour slot
                Telegram.add_member_data(**data)
            # TODO: TELEGRAM LINKS, RAW DATA. rename csv files

    def complete(self):
        # TODO: Add task to create a DB/Table or
        # IDEA: Add an except for no table - create table then check databsse for complete
        if self.debug:
            clean_telegram_table()  # DEBUG: REMOVE
            print('DELETING TABLE FOR DEBUGGING!!!!!!!!!!!!!!!!!')
        try:
            local_ = pd.read_csv(self.input().path)
            dbase_ = Telegram.data_by_date(self.hour)
            print('#' * 25)
            print(len(local_))  # TODO: Logging
            print(len(dbase_))  # TODO: Logging
            # TODO: If else raise data not written to db
            print(len(local_.index) == len(dbase_.index))
            print('#' * 25)
            return len(local_.index) == len(dbase_.index)
        # FIX: dropped the unused 'as e' binding — matches the Twitter twin task.
        except (FileNotFoundError, KeyError):
            print()
            return False
class ParseTwitterMemberCountTask(luigi.Task):
    """Scrape follower/following/likes/tweets counts for every non-blacklisted
    Twitter link in the day's CSV and write them to an hourly CSV."""

    date = luigi.DateParameter(default=date.today())
    hour = luigi.DateHourParameter(default=get_current_hour())
    # Debug-only cap on how many links to process; None in production.
    limit = luigi.Parameter(default=None)

    def requires(self):
        # [0] date folder target, [1] the parsed name/link CSV.
        return [base_pipe.CreateDateFolder(date=self.date),
                base_pipe.ParseTwitterJSONtoCSVTask(date=self.date)]

    def output(self):
        path = Path(str(self.input()[0].path)) / f'Twitter_Data_{self.hour}.csv'
        return luigi.LocalTarget(str(path))

    def run(self):
        # FIX: luigi.Parameter values arrive as strings from the CLI; cast once
        # so the comparison below cannot raise TypeError (int vs str).
        limit = int(self.limit) if self.limit else None

        twitter_links = []
        with self.input()[1].open('r') as f:
            reader = csv.reader(f)
            next(reader)  # skip header row
            for i, row in enumerate(reader):
                name, link = row[0], row[1]
                # Skip links already known to be dead/bad.
                if not bad_links.Twitter.is_link_bad(link):
                    twitter_links.append({'name': name, 'link': link})  # IDEA: tuple
                if limit and i > limit:
                    break

        max_processes = cpu_count() * 2
        print(f'parsing {len(twitter_links)} twitter links')
        with Pool(max_processes) as p:
            member_records = p.map(twitter.parse_twitter_count, twitter_links)

        if member_records:
            df = pd.DataFrame(member_records)
            df.set_index('name', inplace=True)
            df.to_csv(self.output().path)