def main():
    """Merge the latest feature, lyrics and recently-played exports into one CSV.

    Reads the newest CSV under each prefix from S3, de-duplicates each frame
    on ``track_id``, left-joins them together and, when the merged frame
    passes validation, writes the result back to the bucket.
    """
    load_dotenv(find_dotenv())

    store = Store(getenv("S3_BUCKET"))

    # Audio features: drop the leftover pandas index column, then de-dup.
    features = getLatestCSVFile(store, FEATURE_PREFIX)
    features = features.loc[:, features.columns != "Unnamed: 0"]
    features = features.drop_duplicates(subset=["track_id"])

    # Lyrics: only the id/lyrics pair is needed for the join.
    lyrics = getLatestCSVFile(store, LYRICS_PREFIX)[["track_id", "lyrics"]]
    lyrics = lyrics.drop_duplicates(subset=["track_id"])

    # Recently played: popularity and explicit flag per track.
    recent = getLatestCSVFile(
        store, RECENTLY_PREFIX)[["track_id", "popularity", "explicit"]]
    recent = recent.drop_duplicates(subset=["track_id"])

    merged = features.join(lyrics.set_index("track_id"), on="track_id")
    merged = merged.join(recent.set_index("track_id"), on="track_id")
    merged = merged.drop_duplicates(subset=["track_id"])

    if validateData(merged, "track_id"):
        # Serialize to an in-memory buffer, then persist to S3.
        buffer = StringIO()
        merged.to_csv(buffer, index=False)
        store.saveFile(datetime.now(), FOLDER, buffer.getvalue(), "", "csv")
# --- Example 2 ---
def main():
    """Load the newest feature CSV from S3 into the MySQL songs table."""
    load_dotenv(find_dotenv())

    # Database connection settings come from the environment.
    db = DB(getenv("MYSQL_HOST"), getenv("MYSQL_PORT"), getenv("MYSQL_USER"),
            getenv("MYSQL_PASS"), getenv("MYSQL_DB"))
    store = Store(getenv("S3_BUCKET"))

    # Newest CSV under PREFIX, picked by the date encoded in the object key.
    file_names = [obj.key for obj in store.getFiles(PREFIX)
                  if ".csv" in obj.key]
    latest_date = max(extractDate(name, PREFIX, ".csv")
                      for name in file_names)
    latest_file = "{}/{}.{}".format(PREFIX, latest_date, "csv")

    # Get File
    frame = pd.read_csv(StringIO(store.getFile(latest_file)),
                        low_memory=False)
    frame = frame[[
        "artist", "album", "track", "track_id", "danceability", "energy",
        "key", "loudness", "mode", "speechiness", "acousticness",
        "instrumentalness", "liveness", "valence", "tempo", "duration_ms",
        "lyrics", "popularity", "explicit"
    ]]
    frame = frame.dropna()

    # insertSongs expects plain tuples, one per row.
    db.insertSongs([tuple(row) for row in frame.to_numpy()])
# --- Example 3 ---
    def run(self):
        """Thread main loop: repeatedly download reddit data until stopped.

        Sets ``runevent`` for the lifetime of the loop so other threads can
        observe that the worker is active, and clears it again on exit.
        """
        self.runevent.set()

        try:
            # download reddit data
            while not self.stopped():
                # Fresh Store pair each cycle: one per data source.
                stores = [
                    Store('crawler', self.root, self.config, self.subreddit),
                    Store('pushshift', self.root, self.config, self.subreddit)
                ]
                for file_type in self.types:
                    self.download(file_type, stores)

                # periodic run
                if self.alive():
                    self.log(f'sleep for {self.periode} seconds')
                    self.time.sleep(self.periode)
                else:
                    break

        except KeyboardInterrupt:
            # Clear the run flag before propagating Ctrl-C to the caller.
            self.runevent.clear()
            raise KeyboardInterrupt()
        except Exception as e:
            # NOTE(review): any other error ends the loop after being logged
            # (best-effort crawl) -- confirm swallowing here is intended.
            self.log(f'...run error {repr(e)}')

        self.runevent.clear()
# --- Example 4 ---
def main():
    """Load the newest recently-played CSV from S3 into MySQL."""
    load_dotenv(find_dotenv())

    # Database connection settings come from the environment.
    db = DB(getenv("MYSQL_HOST"), getenv("MYSQL_PORT"), getenv("MYSQL_USER"),
            getenv("MYSQL_PASS"), getenv("MYSQL_DB"))
    store = Store(getenv("S3_BUCKET"))

    # Newest CSV under PREFIX, picked by the date encoded in the object key.
    file_names = [obj.key for obj in store.getFiles(PREFIX)
                  if ".csv" in obj.key]
    latest_date = max(extractDate(name, PREFIX, ".csv")
                      for name in file_names)
    latest_file = "{}/{}.{}".format(PREFIX, latest_date, "csv")

    # Get File
    frame = pd.read_csv(StringIO(store.getFile(latest_file)),
                        low_memory=False)
    frame = frame[["track_id", "played_at"]]

    # insertRecentPlays expects plain tuples, one per row.
    db.insertRecentPlays([tuple(row) for row in frame.to_numpy()])
# --- Example 5 ---
    def __init__(self, name, root, config, subreddit):
        """Initialize the worker.

        All three bases are initialized explicitly; each receives its own
        slice of the arguments.
        """
        Thread.__init__(self, name=name)
        Logger.__init__(self, name=name, context=f'r/{subreddit}', plain=False)
        Store.__init__(self, name=name, root=root, config=config, subreddit=subreddit)

        # thread events
        self.runevent = Event()   # set while run() is active
        self.stopevent = Event()  # set to request shutdown

        # time helpers
        self.time = Sleep(10, immediate=False)
# --- Example 6 ---
 def __init__(self, base_dir):
     """Set up task state: load the system config, attach store and comm.

     base_dir -- root directory; config and exe paths resolve against it.
     """
     self.base_dir = base_dir
     # Location of the system configuration file, relative to base_dir.
     self.sys_config_key = "configs/system.json"
     self.get_sys_config()
     self.store = Store(base_dir, self.sys_config)
     self.comm = Comm(self.sys_config)
     self.children = {}  # spawned child processes, keyed by logical name
     self.name = ""
     self.app = None
     self.threads = []   # threads started by this task
     self.host = ""
     self.port = 0
# --- Example 7 ---
def main():
    """Fetch Spotify audio features for every known track and store them.

    Pulls the newest track CSV from S3, queries the Spotify API for audio
    features in batches of 100 ids, archives the raw JSON response, joins
    the features onto the track metadata and saves the merged CSV.
    """
    load_dotenv(find_dotenv())

    store = Store(getenv("S3_BUCKET"))

    # Newest CSV under PREFIX, picked by the date encoded in the object key.
    file_names = [obj.key for obj in store.getFiles(PREFIX)
                  if ".csv" in obj.key]
    latest_date = max(extractDate(name, PREFIX, ".csv")
                      for name in file_names)
    latest_file = "{}/{}.{}".format(PREFIX, latest_date, "csv")

    # Get File
    body = store.getFile(latest_file)
    tracks = pd.read_csv(StringIO(body), low_memory=False)
    tracks = tracks[['artist', 'album', 'track', 'track_id']].drop_duplicates()

    sp = spotify_authenticate("user-read-recently-played")

    # The audio-features endpoint accepts at most 100 ids per request.
    features = []
    for batch in list(chunks(tracks.track_id.tolist(), 100)):
        features = features + sp.audio_features(batch)

    # Archive the raw API response in S3 before any transformation.
    store.saveFile(datetime.now(), RAW_PREFIX, store.encodeJson(features),
                   "", RAW_EXTENSION)

    feature_cols = [
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'id'
    ]
    features_frame = pd.DataFrame(features)[feature_cols]
    features_frame = features_frame.rename(columns={"id": "track_id"})

    merged = tracks.join(features_frame.set_index("track_id"), on="track_id")

    if validateData(merged, "track_id"):
        # Serialize to an in-memory buffer, then persist to S3.
        buffer = StringIO()
        merged.to_csv(buffer)
        store.saveFile(datetime.now(), FOLDER, buffer.getvalue(), "", "csv")
# --- Example 8 ---
def main():
    """Archive yesterday's recently-played Spotify tracks to S3 as JSON."""
    # Load env vars
    load_dotenv(find_dotenv())

    # Lower bound of the query window.
    yesterday_timestamp = yesterday()

    # Authenticate with the recently-played scope.
    sp = spotify_authenticate("user-read-recently-played")

    # The API returns at most 50 items per request.
    results = sp.current_user_recently_played(limit=50,
                                              after=yesterday_timestamp,
                                              before=None)

    store = Store(getenv("S3_BUCKET"))
    store.saveFile(yesterday_timestamp, FOLDER, store.encodeJson(results),
                   PREFIX, FILETYPE)
def main():
    """Flatten every archived recently-played JSON file into one CSV."""
    # Load env vars
    load_dotenv(find_dotenv())

    store = Store(getenv("S3_BUCKET"))

    # Collect the transformed rows from every file under PREFIX.
    rows = []
    for obj in store.getFiles(PREFIX):
        payload = json.loads(store.getFile(obj.key))
        rows.extend(transformTrack(payload["items"]))

    frame = pd.DataFrame(
        rows, columns=COLS).drop_duplicates(subset=["played_at"])

    if validateData(frame, "played_at"):
        # Serialize to an in-memory buffer, then persist to S3.
        buffer = StringIO()
        frame.to_csv(buffer)
        store.saveFile(datetime.now(), FOLDER, buffer.getvalue(), "", "csv")
def main():
    """Fetch lyrics from Genius for every track in the newest CSV export."""
    # Load env vars
    load_dotenv(find_dotenv())

    store = Store(getenv("S3_BUCKET"))
    genius = genius_authenticate(getenv("GENIUS_ACCESS_TOKEN"))

    # Newest CSV under PREFIX, picked by the date encoded in the object key.
    file_names = [obj.key for obj in store.getFiles(PREFIX)
                  if ".csv" in obj.key]
    latest_date = max(extractDate(name, PREFIX, ".csv")
                      for name in file_names)
    latest_file = "{}/{}.{}".format(PREFIX, latest_date, "csv")

    # Get File
    tracks = pd.read_csv(StringIO(store.getFile(latest_file)),
                         low_memory=False)
    tracks = tracks[["artist", "track", "track_id"]].drop_duplicates()

    # One Genius lookup per row; the result lands in a new "lyrics" column.
    tracks["lyrics"] = tracks.apply(lambda row: get_artist_song(genius, row),
                                    axis=1)

    buffer = StringIO()
    tracks.to_csv(buffer)
    store.saveFile(datetime.now(), FOLDER, buffer.getvalue(), "", "csv")
# --- Example 11 ---
class Task:
    """Base task class. All Tasks derive from this class
    Abstracts
    - deployment methods (service/thread/process/library/deployer)
    - request processing method (threading/?)
    - microservice library (Flask/?),
    - persistent key-value store (file system/?)
    - message sending/receive methods (REST client/?)
    """
    # NOTE(review): __metaclass__ is the Python-2 spelling and has no effect
    # on Python 3 -- @abstractmethod is NOT enforced at instantiation time.
    # Use ``class Task(metaclass=ABCMeta)`` if enforcement is wanted.
    __metaclass__ = ABCMeta

    def __init__(self, base_dir):
        """Load the system configuration and wire up store/comm helpers.

        base_dir -- root directory of the deployment; config and exe paths
        are resolved relative to it.
        """
        self.base_dir = base_dir
        # Location of the system configuration file, relative to base_dir.
        self.sys_config_key = "configs/system.json"
        self.get_sys_config()
        self.store = Store(base_dir, self.sys_config)
        self.comm = Comm(self.sys_config)
        self.children = {}  # process name -> Popen handle (exec_process_async)
        self.name = ""
        self.app = None
        self.threads = []   # threads started via start_thread
        self.host = ""
        self.port = 0

    def __del__(self):
        # No-op finalizer; kept as an explicit placeholder.
        pass

    def get_sys_config(self):
        key = self.base_dir + "/" + self.sys_config_key
        with open(key, 'r') as fread:
            self.sys_config = json.load(fread)
        self.services = self.sys_config['services']
        self.exes = self.sys_config['exes']

    @abstractmethod
    def main(self):
        """Main entry point of the task; defined in the child class.

        NOTE(review): since __metaclass__ is ineffective on Python 3, this
        abstract method is not actually enforced at instantiation time.
        """
        pass

    def run(self):
        """Run this task
        The task can be run as a service or as a library
        In either case the class' main function gets called
        """
        if self.sys_config['exec'][self.name] == "service":
            self.main()
        elif self.sys_config['exec'][self.name] == "library":
            self.main()
        elif self.sys_config['exec'][self.name] == "process":
            self.main()
        else:
            print("Error: Invalid execution mode specified in configuration")
            print("Should be either module, runtocompletion or service")
            exit(-1)

    def start_thread(self, func, data=None):
        """Start a thread within this task
        """
        sdata = None
        if data:
            sdata = json.dumps(data)
        thread = threading.Thread(target=func, args=[sdata])
        self.threads.append(thread)
        thread.start()

    def stop_thread(self):
        """Stop a running thread within this task
        """
        #TODO
        return True

    def exec_process(self, process, args=None):
        """Fork a new process to run a new task, blocking
        """
        # Resolve the logical name through the configured exe table; unknown
        # names are treated as literal script paths.
        if process in self.sys_config['exes']:
            name = self.sys_config['exes'][process]
        else:
            name = process
        sargs = " ".join(args) if args is not None else ""
        # NOTE(review): the command line is built by string concatenation and
        # executed through the shell (os.system) -- unsafe if process/args can
        # carry untrusted input; consider subprocess.run with an argv list.
        cmd = "python " + self.base_dir + "/" + name + " " + sargs
        os.system(cmd)

    def exec_process_async(self, process, args=None):
        """Fork a new process to run a new task, non blocking.

        The Popen handle is stored in self.children under the logical process
        name so terminate_service can find it later.
        """
        # Resolve the logical name through the exe table when present.
        if process in self.sys_config['exes']:
            script = self.sys_config['exes'][process]
        else:
            script = process

        argv = ["python", self.base_dir + "/" + script]
        if args is not None:
            argv = argv + args
        self.children[process] = Popen(argv)

    def terminate_service(self, process):
        """Terminate a running task
        """
        #Todo: Send SIGTERM and wait. Send SIGKILL if child does not terminate
        # First kill the child we spawned ourselves, if any.
        if process in self.children:
            self.children[process].kill()

        #TODO: Poll
        #Will kill all running process on server
        #TODO: get PIDs from persistent store
        # Fall back to SIGKILL-ing every process whose ps entry matches the
        # exe name.  NOTE(review): the grep matches the full command line, so
        # unrelated processes containing the name are killed as well.
        name = self.sys_config['exes'][process]
        for line in os.popen("ps ax | grep " + name + " | grep -v grep"):
            fields = line.split()
            pid = fields[0]
            os.kill(int(pid), signal.SIGKILL)

    def send_request(self, service, operation, parameters):
        """Send a request to a task running as a service
        """
        return self.comm.send_request(service, operation, parameters)

    def read(self, key):
        """Read a file from the persistent store
        """
        return self.store.read(key)

    def write(self, key, data):
        """Write a file to the persistent store
        """
        return self.store.write(key, data)