def main():
    """Join the latest feature, lyrics and recently-played CSVs on track_id
    and save the merged result to S3.

    Reads three CSVs via getLatestCSVFile, dedupes each on track_id,
    left-joins lyrics and play metadata onto the feature rows, validates,
    and writes the combined CSV (without the pandas index) to FOLDER.
    """
    load_dotenv(find_dotenv())
    store = Store(getenv("S3_BUCKET"))

    # Features: an upstream job wrote the pandas index as "Unnamed: 0";
    # drop it idiomatically (errors="ignore" keeps this a no-op if absent).
    csv_features = getLatestCSVFile(store, FEATURE_PREFIX)
    csv_features = csv_features.drop(columns=["Unnamed: 0"], errors="ignore")
    csv_features = csv_features.drop_duplicates(subset=["track_id"])

    csv_lyrics = getLatestCSVFile(store, LYRICS_PREFIX)[["track_id", "lyrics"]]
    csv_lyrics = csv_lyrics.drop_duplicates(subset=["track_id"])

    csv_recently = getLatestCSVFile(
        store, RECENTLY_PREFIX)[["track_id", "popularity", "explicit"]]
    csv_recently = csv_recently.drop_duplicates(subset=["track_id"])

    # Left-join lyrics then play metadata onto the feature rows.
    joined_data = csv_features.join(csv_lyrics.set_index("track_id"),
                                    on="track_id")
    joined_data = joined_data.join(csv_recently.set_index("track_id"),
                                   on="track_id")
    joined_data = joined_data.drop_duplicates(subset=["track_id"])

    if validateData(joined_data, "track_id"):
        # load to buffer
        csv_buffer = StringIO()
        joined_data.to_csv(csv_buffer, index=False)
        body = csv_buffer.getvalue()
        # save file
        store.saveFile(datetime.now(), FOLDER, body, "", "csv")
def main():
    """Load the newest merged CSV from S3 and bulk-insert the songs into MySQL."""
    load_dotenv(find_dotenv())
    db = DB(
        getenv("MYSQL_HOST"),
        getenv("MYSQL_PORT"),
        getenv("MYSQL_USER"),
        getenv("MYSQL_PASS"),
        getenv("MYSQL_DB"),
    )
    store = Store(getenv("S3_BUCKET"))

    # Locate the most recent .csv object under PREFIX by its embedded date.
    keys = [obj.key for obj in store.getFiles(PREFIX)]
    csv_keys = [k for k in keys if ".csv" in k]
    latest_date = max(extractDate(k, PREFIX, ".csv") for k in csv_keys)
    latest_file = "{}/{}.{}".format(PREFIX, latest_date, "csv")

    # Get File
    body = store.getFile(latest_file)
    frame = pd.read_csv(StringIO(body), low_memory=False)
    frame = frame[[
        "artist", "album", "track", "track_id", "danceability", "energy",
        "key", "loudness", "mode", "speechiness", "acousticness",
        "instrumentalness", "liveness", "valence", "tempo", "duration_ms",
        "lyrics", "popularity", "explicit"
    ]]
    frame = frame.dropna()
    rows = [tuple(row) for row in frame.to_numpy()]

    # Load data to sql
    db.insertSongs(rows)
def run(self):
    """Thread body: repeatedly download reddit data until stopped.

    Sets `runevent` while the loop is active and clears it on every exit
    path (normal stop, KeyboardInterrupt, or unexpected error).
    KeyboardInterrupt is re-raised so the caller can shut down.
    """
    self.runevent.set()
    try:
        # download reddit data
        while not self.stopped():
            stores = [
                Store('crawler', self.root, self.config, self.subreddit),
                Store('pushshift', self.root, self.config, self.subreddit)
            ]
            for file_type in self.types:
                self.download(file_type, stores)
            # periodic run
            if self.alive():
                self.log(f'sleep for {self.periode} seconds')
                self.time.sleep(self.periode)
            else:
                break
    except KeyboardInterrupt:
        self.runevent.clear()
        # Bare raise preserves the original traceback; the previous
        # `raise KeyboardInterrupt()` discarded it.
        raise
    except Exception as e:
        self.log(f'...run error {repr(e)}')
        self.runevent.clear()
def main():
    """Load the newest recently-played CSV from S3 and insert the plays into MySQL."""
    load_dotenv(find_dotenv())
    db = DB(
        getenv("MYSQL_HOST"),
        getenv("MYSQL_PORT"),
        getenv("MYSQL_USER"),
        getenv("MYSQL_PASS"),
        getenv("MYSQL_DB"),
    )
    store = Store(getenv("S3_BUCKET"))

    # Find the most recent .csv object under PREFIX by its embedded date.
    keys = [obj.key for obj in store.getFiles(PREFIX)]
    csv_keys = [k for k in keys if ".csv" in k]
    latest_date = max(extractDate(k, PREFIX, ".csv") for k in csv_keys)
    latest_file = "{}/{}.{}".format(PREFIX, latest_date, "csv")

    # Get File
    body = store.getFile(latest_file)
    frame = pd.read_csv(StringIO(body), low_memory=False)
    frame = frame[["track_id", "played_at"]]
    rows = [tuple(row) for row in frame.to_numpy()]

    # Load data to sql
    db.insertRecentPlays(rows)
def __init__(self, name, root, config, subreddit):
    # Each base class (Thread, Logger, Store) is initialised explicitly in
    # turn rather than via super() — the order here is deliberate.
    Thread.__init__(self, name=name)
    Logger.__init__(self, name=name, context=f'r/{subreddit}', plain=False)
    Store.__init__(self, name=name, root=root, config=config, subreddit=subreddit)
    # thread events
    self.runevent = Event()   # set while the worker loop is running
    self.stopevent = Event()  # presumably signalled to request shutdown — confirm against stopped()
    # time helpers
    self.time = Sleep(10, immediate=False)  # NOTE(review): assumes Sleep is an interruptible sleeper — confirm
def __init__(self, base_dir):
    # Root directory under which configs and the persistent store live.
    self.base_dir = base_dir
    self.sys_config_key = "configs/system.json"
    # Populates self.sys_config (and related attributes) from disk.
    self.get_sys_config()
    self.store = Store(base_dir, self.sys_config)
    self.comm = Comm(self.sys_config)
    self.children = {}   # presumably child-process handles keyed by name — confirm
    self.name = ""       # task name; expected to be set by the subclass
    self.app = None      # NOTE(review): likely a web-app handle when run as a service — confirm
    self.threads = []    # threads started via start_thread-style helpers
    self.host = ""
    self.port = 0
def main():
    """Fetch Spotify audio features for every known track; store the raw JSON
    and a merged per-track CSV in S3."""
    load_dotenv(find_dotenv())
    store = Store(getenv("S3_BUCKET"))

    # Most recent .csv object under PREFIX, selected by its embedded date.
    keys = [obj.key for obj in store.getFiles(PREFIX)]
    csv_keys = [k for k in keys if ".csv" in k]
    latest_date = max(extractDate(k, PREFIX, ".csv") for k in csv_keys)
    latest_file = "{}/{}.{}".format(PREFIX, latest_date, "csv")

    # Get File
    body = store.getFile(latest_file)
    tracks = pd.read_csv(StringIO(body), low_memory=False)
    tracks = tracks[['artist', 'album', 'track', 'track_id']].drop_duplicates()
    track_ids = tracks.track_id.tolist()

    sp = spotify_authenticate("user-read-recently-played")

    # Make a payload of 100 since api has max 100
    features = []
    for batch in list(chunks(track_ids, 100)):
        features.extend(sp.audio_features(batch))

    # Store json as raw format in s3
    raw_body = store.encodeJson(features)
    store.saveFile(datetime.now(), RAW_PREFIX, raw_body, "", RAW_EXTENSION)

    # Make panda dataframe of various features
    wanted_cols = [
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'id'
    ]
    features_csv = pd.DataFrame(features)[wanted_cols]
    features_csv = features_csv.rename(columns={"id": "track_id"})

    merged_pd = tracks.join(features_csv.set_index("track_id"), on="track_id")
    if validateData(merged_pd, "track_id"):
        # load to buffer
        csv_buffer = StringIO()
        merged_pd.to_csv(csv_buffer)
        body = csv_buffer.getvalue()
        # save file
        store.saveFile(datetime.now(), FOLDER, body, "", "csv")
def main():
    """Pull the last 50 tracks played since yesterday from Spotify and save
    the raw JSON response to S3."""
    # Load env vars
    load_dotenv(find_dotenv())

    # Yesterday's timestamp
    after_ts = yesterday()

    # Authenticate
    sp = spotify_authenticate("user-read-recently-played")

    # Get last 50 songs played yesterday
    results = sp.current_user_recently_played(limit=50,
                                              after=after_ts,
                                              before=None)

    store = Store(getenv("S3_BUCKET"))
    store.saveFile(after_ts, FOLDER, store.encodeJson(results), PREFIX, FILETYPE)
def main():
    """Flatten every raw play JSON under PREFIX into one CSV deduped on
    played_at, then save it to S3."""
    # Load env vars
    load_dotenv(find_dotenv())
    store = Store(getenv("S3_BUCKET"))

    result = []
    for key in (obj.key for obj in store.getFiles(PREFIX)):
        payload = json.loads(store.getFile(key))
        result += transformTrack(payload["items"])

    result_pd = pd.DataFrame(result, columns=COLS)
    result_pd = result_pd.drop_duplicates(subset=["played_at"])

    if validateData(result_pd, "played_at"):
        # load to buffer
        csv_buffer = StringIO()
        result_pd.to_csv(csv_buffer)
        body = csv_buffer.getvalue()
        # save file
        store.saveFile(datetime.now(), FOLDER, body, "", "csv")
def main():
    """Attach Genius lyrics to the latest track CSV and save the result to S3."""
    # Load env vars
    load_dotenv(find_dotenv())
    store = Store(getenv("S3_BUCKET"))
    genius = genius_authenticate(getenv("GENIUS_ACCESS_TOKEN"))

    # Most recent .csv object under PREFIX, selected by its embedded date.
    keys = [obj.key for obj in store.getFiles(PREFIX)]
    csv_keys = [k for k in keys if ".csv" in k]
    latest_date = max(extractDate(k, PREFIX, ".csv") for k in csv_keys)
    latest_file = "{}/{}.{}".format(PREFIX, latest_date, "csv")

    # Get File
    body = store.getFile(latest_file)
    tracks = pd.read_csv(StringIO(body), low_memory=False)
    tracks = tracks[["artist", "track", "track_id"]].drop_duplicates()

    # Grab lyrics from genius
    tracks["lyrics"] = tracks.apply(lambda row: get_artist_song(genius, row),
                                    axis=1)

    csv_buffer = StringIO()
    tracks.to_csv(csv_buffer)
    store.saveFile(datetime.now(), FOLDER, csv_buffer.getvalue(), "", "csv")
class Task:
    """Base task class. All Tasks derive from this class

    Abstracts
       - deployment methods (service/thread/process/library/deployer)
       - request processing method (threading/?)
       - microservice library (Flask/?),
       - persistent key-value store (file system/?)
       - message sending/receive methods (REST client/?)
    """
    # NOTE(review): `__metaclass__` is a Python 2 construct and has NO effect
    # in Python 3 — Task is therefore instantiable despite @abstractmethod.
    # The Python 3 spelling is `class Task(metaclass=ABCMeta)`; left as-is
    # because enforcing abstractness could break callers that instantiate
    # Task directly — confirm before changing.
    __metaclass__ = ABCMeta

    def __init__(self, base_dir):
        # Root directory under which configs and the persistent store live.
        self.base_dir = base_dir
        self.sys_config_key = "configs/system.json"
        # Populates self.sys_config, self.services and self.exes from disk.
        self.get_sys_config()
        self.store = Store(base_dir, self.sys_config)
        self.comm = Comm(self.sys_config)
        self.children = {}   # async child processes: name -> Popen handle
        self.name = ""       # task name; expected to be set by the subclass
        self.app = None
        self.threads = []    # threads started via start_thread
        self.host = ""
        self.port = 0

    def __del__(self):
        pass

    def get_sys_config(self):
        """Load the system configuration JSON and cache the services/exes maps."""
        key = self.base_dir + "/" + self.sys_config_key
        with open(key, 'r') as fread:
            self.sys_config = json.load(fread)
        self.services = self.sys_config['services']
        self.exes = self.sys_config['exes']

    @abstractmethod
    def main(self):
        """Main function

        Defined in the child class
        """
        pass

    def run(self):
        """Run this task.

        Every supported execution mode ("service", "library", "process")
        simply invokes the subclass main(); any other configured mode is a
        configuration error and terminates the process.
        """
        mode = self.sys_config['exec'][self.name]
        if mode in ("service", "library", "process"):
            self.main()
        else:
            print("Error: Invalid execution mode specified in configuration")
            # Fixed: the message previously listed modes that do not exist
            # ("module, runtocompletion"); these are the accepted values.
            print("Should be either service, library or process")
            exit(-1)

    def start_thread(self, func, data=None):
        """Start a thread within this task.

        `data`, if given, is JSON-serialised and passed to `func` as its
        single argument; otherwise `func` receives None.
        """
        sdata = None
        if data:
            sdata = json.dumps(data)
        thread = threading.Thread(target=func, args=[sdata])
        self.threads.append(thread)
        thread.start()

    def stop_thread(self):
        """Stop a running thread within this task
        """
        #TODO
        return True

    def exec_process(self, process, args=None):
        """Fork a new process to run a new task, blocking.

        `process` is resolved through the configured exes map when present.
        """
        if process in self.sys_config['exes']:
            name = self.sys_config['exes'][process]
        else:
            name = process
        sargs = " ".join(args) if args is not None else ""
        # SECURITY NOTE: command is built by string concatenation and run
        # through the shell; do not pass untrusted `process`/`args` here.
        cmd = "python " + self.base_dir + "/" + name + " " + sargs
        os.system(cmd)

    def exec_process_async(self, process, args=None):
        """Fork a new process to run a new task, non blocking.

        The Popen handle is retained in self.children for later termination.
        """
        if process in self.sys_config['exes']:
            name = self.sys_config['exes'][process]
        else:
            name = process
        sargs = [] if args is None else args
        cmd = ["python", self.base_dir + "/" + name] + sargs
        self.children[process] = Popen(cmd)

    def terminate_service(self, process):
        """Terminate a running task.

        Kills the tracked child (if any), then SIGKILLs every process whose
        `ps` line matches the configured executable name.
        """
        #Todo: Send SIGTERM and wait. Send SIGKILL if child does not terminate
        if process in self.children:
            self.children[process].kill()
            #TODO: Poll
        #Will kill all running process on server
        #TODO: get PIDs from persistent store
        name = self.sys_config['exes'][process]
        for line in os.popen("ps ax | grep " + name + " | grep -v grep"):
            fields = line.split()
            pid = fields[0]
            os.kill(int(pid), signal.SIGKILL)

    def send_request(self, service, operation, parameters):
        """Send a request to a task running as a service
        """
        return self.comm.send_request(service, operation, parameters)

    def read(self, key):
        """Read a file from the persistent store
        """
        return self.store.read(key)

    def write(self, key, data):
        """Write a file to the persistent store
        """
        return self.store.write(key, data)