Example #1
# Standard-library imports this example relies on.
# GmailMboxMessage is a project-local helper class assumed to be in scope.
import mailbox
import os
import re
from warnings import warn

def parse_mail_data(user, data_path='.'):
    """
    Parses a user's email data in mbox format. If cache exists, returns None.

    :param {str} user - The user directory.
    :param {str} data_path - Path to the data/ directory, without the trailing /.
    :return {list} A list of messages
    """
    # First, check for cache
    if os.path.exists(f'{data_path}/saved/embeddings/mail.pickle'):
        return None

    path = f'{data_path}/data/{user}/Takeout/Mail/All mail Including Spam and Trash.mbox'

    # Check if path exists
    if not os.path.exists(path):
        warn('Mail path does not exist.')
        return []

    box = mailbox.mbox(path)
    messages = []
    for message in box:
        message_obj = GmailMboxMessage(message)
        parsed_mail = message_obj.parse_email()
        # Skip empty or whitespace-only parsed messages
        if parsed_mail != [] and not re.fullmatch(r'\s+', parsed_mail):
            messages.append(parsed_mail)

    return messages
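
The function has a three-way contract: None when the embeddings cache exists, an empty list when the mbox path is missing, and a list of messages otherwise. A minimal caller sketch, using a hypothetical user name:

messages = parse_mail_data('alice', data_path='.')  # 'alice' is hypothetical
if messages is None:
    print('Embeddings cache found; mbox parse skipped.')
elif not messages:
    print('No mail data for this user.')
else:
    print(f'Parsed {len(messages)} messages.')
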
Example #2
    # Assumes module-level imports (matplotlib.pyplot as plt, ceil/sqrt from
    # math, warn from warnings); this method belongs to a class not shown here.
    def visualize(self,
                  *args,
                  alpha=0.7,
                  titles=None,
                  reference=None,
                  **fig_kwargs):
        """
        Visualizes the data given.

        :param *args - Arguments passed to the visualization object
        :param alpha - alpha value for scatterplot
        :param reference - Reference point to plot
        :param titles - Plot titles
        :param *fig_kwargs - Arguments passed to matplotlib.figure
        """
        # Sanity check
        if titles is not None:
            if len(titles) != len(args):
                warn('Length of titles does not match args, skipping titles.')
                titles = None

        # How many to embed?
        n_vars = len(args)

        if n_vars == 0:
            warn('No args passed to embed.')
            return

        # Size the grid so every arg gets a subplot (ceil, not floor)
        rows = int(ceil(sqrt(n_vars)))
        cols = int(ceil(n_vars / rows))

        # Get plots; squeeze=False keeps ax a 2-D array even for a single cell
        fig, ax = plt.subplots(rows, cols, squeeze=False, **fig_kwargs)
        fig.tight_layout()

        for i, arg in enumerate(args):
            # Reduce dimensionality to 2.
            x = self._reduce_dims(arg)

            # Reduce reference point to 2 dims
            if reference is not None:
                x_ref = self._reduce_dims(reference)

            row = i // cols
            col = i % cols

            ax[row][col].scatter(x.T[0], x.T[1], c='b', alpha=alpha)
            if titles is not None:
                ax[row][col].set_title(titles[i])

            if reference is not None:
                ax[row][col].scatter(x_ref.T[0],
                                     x_ref.T[1],
                                     c='r',
                                     alpha=alpha)

        plt.show()
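
A hedged usage sketch for the method above, assuming viz is an instance of the (unshown) class whose _reduce_dims projects each array to 2-D; the object name and data below are hypothetical:

import numpy as np

x1 = np.random.rand(100, 50)
x2 = np.random.rand(100, 50)
# figsize is forwarded to plt.subplots via **fig_kwargs
viz.visualize(x1, x2, titles=['first', 'second'], figsize=(8, 4))
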
Example #3
# Standard-library imports this example relies on.
import os
import xml.etree.ElementTree
from warnings import warn

def parse_fit_data(user, data_path='.'):
    """
    Parses Google Fit data for a user. If no data exists,
    returns None.

    :param {str} user - Username in the data/ directory
    :param {str} data_path - Path to the data/ directory, without the trailing /.
    :return {list} List of dictionaries of the format
    {
        'calories': float,
        'distance': float,
        'dates': list
    }
    """
    base_path = f'{data_path}/data/{user}/Takeout/Fit/Activities'

    if not os.path.exists(base_path):
        warn('Fit data does not exist.')
        return None

    if len(os.listdir(base_path)) == 0:
        warn('Fit directory is empty.')
        return None

    activities_summary = []
    for file in os.listdir(base_path):
        filename = f'{base_path}/{file}'
        tree = xml.etree.ElementTree.parse(filename)
        root = tree.getroot()

        activities = root[0]
        cur_file_summary = {'calories': 0, 'distance': 0, 'dates': []}
        for activity in activities:
            # Get the sport and date/time
            sport = activity.attrib['Sport']
            date = activity[0].text

            # Walk down the XML tree to the lap element
            # (per-trackpoint data lives under lap[0] but is unused here)
            lap = activity[1]

            try:
                calories = float(lap.find('Calories').text)
            except AttributeError:
                calories = 0

            try:
                distance = float(lap.find('DistanceMeters').text)
            except AttributeError:
                distance = 0

            cur_file_summary['calories'] += calories
            cur_file_summary['distance'] += distance
            cur_file_summary['dates'].append(date)

        activities_summary.append(cur_file_summary)

    return activities_summary
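
A short usage sketch for the summaries this returns, totalling calories and distance across all activity files; the user name is hypothetical:

summaries = parse_fit_data('alice', data_path='.')
if summaries is not None:
    total_calories = sum(s['calories'] for s in summaries)
    total_distance = sum(s['distance'] for s in summaries)
    print(f'{total_calories:.0f} kcal over {total_distance / 1000:.1f} km')
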
Example #4
# Imports this example relies on; googlemaps is the third-party geocoding client.
# get_key() and get_nearby_places() are project-local helpers assumed in scope.
import ast
import json
import os
from warnings import warn

import googlemaps

def parse_autofill(user, data_path='.'):
    """
    Parses the user's autofill data.

    :param {str} user - The user directory.
    :return {list} A list of places nearby
    """
    # Check for a cache
    if os.path.exists('caches/.autofill.cache'):
        with open('caches/.autofill.cache', 'r') as f:
            # Safe evaluation with ast
            result = ast.literal_eval(f.readline())

            # Additional sanitary check
            assert isinstance(result, list)
            return result

    path = f'{data_path}/data/{user}/Takeout/Chrome/Autofill.json'
    if not os.path.exists(path):
        warn('Path to the Chrome autofill data does not exist.')
        return []

    with open(path, 'r') as f:
        profile = json.load(f)

    profile = profile['Autofill Profile']

    # Is the list of profiles empty?
    if len(profile) == 0:
        return []

    addresses = [p['address_home_street_address'] for p in profile]

    # Geocode the addresses
    client = googlemaps.Client(key=get_key())
    places = []
    for address in addresses:
        coords = client.geocode(address=address)
        coords = list(coords[0]['geometry']['location'].values())
        places.extend(get_nearby_places(coords))

    # Save to cache
    with open('caches/.autofill.cache', 'w') as f:
        f.write(str(places))

    return places
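
The cache is a plain-text round trip: the list is serialized with str() and read back with ast.literal_eval(), which evaluates Python literals only and so cannot execute arbitrary code. A self-contained sketch of the pattern, with made-up data and a throwaway file name:

import ast

places = [('cafe', 1.23, 4.56), ('gym', 7.89, 0.12)]  # hypothetical cached data
with open('.demo.cache', 'w') as f:
    f.write(str(places))
with open('.demo.cache', 'r') as f:
    restored = ast.literal_eval(f.readline())
assert isinstance(restored, list) and restored == places
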
Example #5
# Standard-library imports this example relies on.
import json
import os
from warnings import warn

def parse_subscribed_channels(user, data_path='.'):
    """
    Mines a user's YouTube subscribed channels from the given data.
    :param {str} user - The user directory.
    :param {str} data_path - Path to the data/ directory, NOT ending in a /.
    :return {list} List of subscribed channel titles.
    """
    path = f'{data_path}/data/{user}/Takeout/YouTube and YouTube Music/subscriptions/subscriptions.json'

    # Check if the directory exists.
    if not os.path.exists(path):
        warn('YouTube subscriptions path does not exist.')
        return []

    with open(path, encoding="utf8") as f:
        data = json.load(f)

    return [s.get('snippet').get('title') for s in data]
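
To illustrate the access pattern, here is the same comprehension run on a made-up record in the shape the code implies for subscriptions.json entries:

data = [{'snippet': {'title': 'Some Channel'}}]  # hypothetical entry shape
titles = [s.get('snippet').get('title') for s in data]
assert titles == ['Some Channel']

Note that s.get('snippet') returns None when the key is absent, so a defensive variant would be s.get('snippet', {}).get('title').
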
Example #6
# Imports this example relies on; BeautifulSoup comes from the bs4 package.
import os
from warnings import warn

from bs4 import BeautifulSoup

def parse_yt_comments(user, data_path='.'):
    """
    Mines a user's YouTube comment history across all videos.
    :param {str} user - The user directory.
    :param {str} data_path - Path to the data/ directory, NOT ending in a /.
    :return {list} List of comments
    """
    path = f'{data_path}/data/{user}/Takeout/YouTube and YouTube Music/my-comments/my-comments.html'

    # Check if the directory exists.
    if not os.path.exists(path):
        warn('YouTube comments path does not exist.')
        return []

    with open(path, encoding="utf8") as f:
        s = BeautifulSoup(f, 'html.parser')

    return [x.find('br').next_element.strip() for x in s.find_all('li')]
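
To see what that one-liner extracts, here is a self-contained run on a fragment shaped like the my-comments.html markup the code assumes (the real Takeout markup may differ):

from bs4 import BeautifulSoup

html = '<ul><li>Sent at 2020-01-01<br>Great video!</li></ul>'  # hypothetical markup
soup = BeautifulSoup(html, 'html.parser')
comments = [x.find('br').next_element.strip() for x in soup.find_all('li')]
assert comments == ['Great video!']
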
Example #7
# Imports this example relies on; build comes from google-api-python-client.
# get_key() is a project-local helper assumed to be in scope.
import ast
import csv
import os
from warnings import warn

from googleapiclient.discovery import build

def parse_liked_videos(user, data_path='.'):
    """
    Mines a given user's YouTube liked videos.
    :param {str} user - The user directory.
    :param {str} data_path - Path to the data/ directory, NOT ending in a /.
    :return {list} List of titles of liked videos
    """
    cache_dir = f'{data_path}/caches/.yt.cache'

    # Check for a cache
    if os.path.exists(cache_dir):
        with open(cache_dir, 'r') as f:
            return ast.literal_eval(f.readline())

    path = f'{data_path}/data/{user}/Takeout/YouTube and YouTube Music/playlists/Liked videos.csv'

    if not os.path.exists(path):
        warn('YouTube liked videos file does not exist.')
        return []

    liked_video_ids = []
    with open(path, newline='') as f:
        read = csv.reader(f)
        for r in read:
            if r is not None and len(r) > 0:
                liked_video_ids.append(r[0])

    youtube = build("youtube", "v3", developerKey=get_key())

    liked_video_titles = []
    for i in liked_video_ids:
        res = youtube.videos().list(part='snippet', id=i).execute()
        if res['items']:
            title = res['items'][0]['snippet']['title']
            liked_video_titles.append(title)

    # Cache results
    with open(cache_dir, 'w') as f:
        f.write(str(liked_video_titles))

    return liked_video_titles
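
Issuing one API call per video ID is slow and quota-hungry; videos.list accepts a comma-separated id parameter of up to 50 IDs per request. A batched variant along these lines should return the same titles (a sketch; fetch_titles_batched is a hypothetical helper reusing the project's get_key()):

def fetch_titles_batched(video_ids, batch_size=50):
    youtube = build('youtube', 'v3', developerKey=get_key())
    titles = []
    for i in range(0, len(video_ids), batch_size):
        batch = video_ids[i:i + batch_size]
        res = youtube.videos().list(part='snippet', id=','.join(batch)).execute()
        titles.extend(item['snippet']['title'] for item in res['items'])
    return titles
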
Example #8
# Imports this example relies on; BeautifulSoup comes from the bs4 package.
import os
from warnings import warn

from bs4 import BeautifulSoup

def parse_yt_watch_history(user, data_path='.'):
    """
    Mines a user's YouTube watch history.
    :param {str} user - The user directory.
    :param {str} data_path - Path to the data/ directory, NOT ending in a /.
    :return {list} List of titles of videos watched.
    """
    path = f'{data_path}/data/{user}/Takeout/YouTube and YouTube Music/history/watch-history.html'

    # Check if the directory exists.
    if not os.path.exists(path):
        warn('YouTube watch history file does not exist.')
        return []

    # Parse the HTML and pull the video titles out of the anchor tags
    with open(path, encoding="utf8") as f:
        s = BeautifulSoup(f, 'html.parser')

    return [x.text for x in s.find_all('a')]
Example #9
# Standard-library imports this example relies on.
import json
import os
from warnings import warn

def parse_browser_history(user, data_path='.'):
    """
    Parses the user's browser history.

    :param {str} user - The user name.
    :param {str} data_path - Path to the data/ directory, without the trailing /.
    :return {list} A list of page titles.
    """
    path = f'{data_path}/data/{user}/Takeout/Chrome/BrowserHistory.json'
    if not os.path.exists(path):
        warn('BrowserHistory.json does not exist.')
        return []

    with open(path, 'r') as f:
        profile = json.load(f)

    profile = profile['Browser History']

    # Is the history empty?
    if len(profile) == 0:
        return []

    return [p['title'] for p in profile]