Example No. 1 (score: 0)
 def parse_voice_audio_log_body(dic_my_activity_voice_audio,
                                voice_audio_logs):
     list_voice_audio_event_logs = TakeoutHtmlParser.find_log_body(
         voice_audio_logs)
     if list_voice_audio_event_logs != []:
         idx = 0
         for content in list_voice_audio_event_logs:
             content = str(content).strip()
             content = content.replace(u'\xa0', ' ')
             if idx == 0:
                 if content.startswith('Said'):
                     dic_my_activity_voice_audio['type'] = 'Search'
                     if content != 'Said':
                         dic_my_activity_voice_audio['keyword'] = content[
                             4:].lstrip()
                 else:
                     dic_my_activity_voice_audio['type'] = content
             else:
                 if idx == 1 and dic_my_activity_voice_audio[
                         'type'] == 'Search':
                     if content.startswith('<a href="'):
                         idx2 = content.find('">')
                         keyword = content[idx2 + 2:content.find('</a>')]
                         dic_my_activity_voice_audio[
                             'keyword'] = TakeoutHtmlParser.remove_special_char(
                                 keyword)
                         url = content[9:idx2]
                         url = unquote(url)
                         dic_my_activity_voice_audio['keyword_url'] = url
                 elif content.endswith('UTC'):
                     dic_my_activity_voice_audio[
                         'timestamp'] = TakeoutHtmlParser.convert_datetime_to_unixtime(
                             content)
             idx += 1
 def parse_gmail_log_body(dic_my_activity_gmail, gmail_logs):
     list_gmail_search_logs = TakeoutHtmlParser.find_log_body(gmail_logs)
     if list_gmail_search_logs != []:
         idx = 0
         for content in list_gmail_search_logs:
             content = str(content).strip()
             content = content.replace(u'\xa0', ' ')
             if idx == 0:
                 if content == 'Searched for':
                     dic_my_activity_gmail['type'] = 'Search'
                 else:
                     dic_my_activity_gmail['type'] = content
             else:
                 if idx == 1:
                     if content.startswith('<a href="'):
                         idx2 = content.find('">')
                         url = content[9:idx2]
                         url = unquote(url)
                         dic_my_activity_gmail['keyword_url'] = url
                         keyword = content[idx2 + 2:content.find('</a>')]
                         dic_my_activity_gmail[
                             'keyword'] = TakeoutHtmlParser.remove_special_char(
                                 keyword)
                 elif content.endswith('UTC'):
                     dic_my_activity_gmail[
                         'timestamp'] = TakeoutHtmlParser.convert_datetime_to_unixtime(
                             content)
             idx += 1
    def parse_analytics_log_body(dic_my_activity_google_analytics,
                                 analytics_logs):
        list_analytics_event_logs = TakeoutHtmlParser.find_log_body(
            analytics_logs)
        if list_analytics_event_logs != []:
            idx = 0
            for content in list_analytics_event_logs:
                content = str(content).strip()
                content = content.replace(u'\xa0', ' ')
                if idx == 0:
                    if content == 'Used':
                        dic_my_activity_google_analytics['type'] = 'Use'
                    elif content == 'Visited':
                        dic_my_activity_google_analytics['type'] = 'Visit'
                    else:
                        dic_my_activity_google_analytics['type'] = content
                else:
                    if idx == 1:
                        if content.startswith('<a href="'):
                            idx2 = content.find('">')
                            keyword = content[idx2 + 2:content.find('</a>')]
                            dic_my_activity_google_analytics[
                                'keyword'] = TakeoutHtmlParser.remove_special_char(
                                    keyword)
                            url = content[9:idx2]
                            url = unquote(url)
                            dic_my_activity_google_analytics[
                                'keyword_url'] = url
                            o = urlparse(url)
                            if o.query.startswith('q=') and o.query.find(
                                    '&amp;'):
                                real_url = o.query[2:o.query.find('&amp;')]
                                real_url = unquote(real_url)
                                dic_my_activity_google_analytics[
                                    'keyword_url'] = real_url
                                o = urlparse(real_url)
                                if o.netloc.startswith('m.'):
                                    dic_my_activity_google_analytics[
                                        'used_device'] = 'mobile'

                            if o.netloc.startswith('m.'):
                                dic_my_activity_google_analytics[
                                    'used_device'] = 'mobile'
                    elif content.endswith('UTC'):
                        dic_my_activity_google_analytics[
                            'timestamp'] = TakeoutHtmlParser.convert_datetime_to_unixtime(
                                content)
                idx += 1
Example No. 4 (score: 0)
 def parse_maps(case):
     file_path = case.takeout_my_activity_maps_path
     if os.path.exists(file_path) == False:
         return False
     with open(file_path, 'r', encoding='utf-8') as f:
         file_contents = f.read()
         soup = BeautifulSoup(file_contents, 'lxml')
         list_maps_logs = TakeoutHtmlParser.find_log(soup)
         if list_maps_logs != []:
             for i in trange(
                     len(list_maps_logs),
                     desc=
                     "[Parsing the My Activity -> Maps data...............]",
                     unit="epoch"):
                 # print("..........................................................................")
                 dic_my_activity_maps = {'timestamp':"", 'service':"", 'type':"", 'keyword':"", 'keyword_url':"", \
                 'keyword_latitude':"", 'keyword_longitude':"", 'latitude':"", 'longitude':"", 'geodata_description':"", \
                 'used_device':""}
                 MyActivityMaps.parse_maps_log_title(
                     dic_my_activity_maps, list_maps_logs[i])
                 MyActivityMaps.parse_maps_log_body(dic_my_activity_maps,
                                                    list_maps_logs[i])
                 MyActivityMaps.parse_maps_log_caption(
                     dic_my_activity_maps, list_maps_logs[i])
                 MyActivityMaps.insert_log_info_to_preprocess_db(
                     dic_my_activity_maps, case.preprocess_db_path)
Example No. 5 (score: 0)
 def parse_maps_log_caption(dic_my_activity_maps, maps_logs):
     list_maps_logs = TakeoutHtmlParser.find_log_caption(maps_logs)
     if list_maps_logs != []:
         for content in list_maps_logs:
             content = str(content).strip()
             if content == '<br/>': continue
             elif content.startswith(
                     '<a href="https://www.google.com/maps/'):
                 idx2 = content.find('">')
                 url = content[9:idx2]
                 o = urlparse(url)
                 list_query_value = o.query.split(';')
                 if list_query_value != []:
                     for query_value in list_query_value:
                         if query_value.startswith('center='):
                             geodata = query_value.lstrip('center=').rstrip(
                                 '&amp')
                             dic_my_activity_maps[
                                 'latitude'] = geodata.split(',')[0]
                             dic_my_activity_maps[
                                 'longitude'] = geodata.split(',')[1]
                         elif query_value.startswith('query='):
                             geodata = query_value.lstrip('query=')
                             dic_my_activity_maps[
                                 'latitude'] = geodata.split(',')[0]
                             dic_my_activity_maps[
                                 'longitude'] = geodata.split(',')[1]
                 dic_my_activity_maps['geodata_description'] = content[
                     idx2 + 2:content.find('</a>')]
             elif content == '- From your device':
                 dic_my_activity_maps['used_device'] = 'mobile'
 def parse_gmail(case):
     file_path = case.takeout_my_activity_gmail_path
     if os.path.exists(file_path) == False:
         return False
     with open(file_path, 'r', encoding='utf-8') as f:
         file_contents = f.read()
         soup = BeautifulSoup(file_contents, 'lxml')
         list_gmail_logs = TakeoutHtmlParser.find_log(soup)
         if list_gmail_logs != []:
             for i in trange(
                     len(list_gmail_logs),
                     desc=
                     "[Parsing the My Activity -> Gmail data..............]",
                     unit="epoch"):
                 # print("..........................................................................")
                 dic_my_activity_gmail = {
                     'service': "",
                     'type': "",
                     'keyword_url': "",
                     'keyword': "",
                     'timestamp': ""
                 }
                 MyActivityGmail.parse_gmail_log_title(
                     dic_my_activity_gmail, list_gmail_logs[i])
                 MyActivityGmail.parse_gmail_log_body(
                     dic_my_activity_gmail, list_gmail_logs[i])
                 MyActivityGmail.insert_log_info_to_preprocess_db(
                     dic_my_activity_gmail, case.preprocess_db_path)
 def parse_assistant_log_caption(dic_my_activity_assistant, assistant_logs):
     list_assistant_geodata_logs = TakeoutHtmlParser.find_log_caption(
         assistant_logs)
     if list_assistant_geodata_logs != []:
         for content in list_assistant_geodata_logs:
             content = str(content).strip()
             if content == '<br/>': continue
             if content.startswith('<a href="https://www.google.com/maps/'):
                 idx = content.find('">')
                 url = content[9:idx]
                 o = urlparse(url)
                 list_query_value = o.query.split(';')
                 if list_query_value != []:
                     for query_value in list_query_value:
                         if query_value.startswith('center='):
                             geodata = query_value.lstrip('center=').rstrip(
                                 '&amp')
                             dic_my_activity_assistant[
                                 'latitude'] = geodata.split(',', 1)[0]
                             dic_my_activity_assistant[
                                 'longitude'] = geodata.split(',', 1)[1]
                         elif query_value.startswith('query='):
                             geodata = query_value.lstrip('query=')
                             dic_my_activity_assistant[
                                 'latitude'] = geodata.split(',', 1)[0]
                             dic_my_activity_assistant[
                                 'longitude'] = geodata.split(',', 1)[1]
                 if dic_my_activity_assistant['geodata_description'] == "":
                     dic_my_activity_assistant[
                         'geodata_description'] = content[idx + 2:content.
                                                          find('</a>')]
 def parse_gmail_log_title(dic_my_activity_gmail, gmail_logs):
     list_gmail_title_logs = TakeoutHtmlParser.find_log_title(gmail_logs)
     if list_gmail_title_logs != []:
         for content in list_gmail_title_logs:
             content = str(content).strip()
             dic_my_activity_gmail['service'] = content.split('>')[1].split(
                 '<br')[0]
Example No. 9 (score: 0)
def parse_youtube_log_body(dic_my_activity_youtube, youtube_logs):
    """Populate YouTube activity fields from the entries of one log body.

    Writes 'type', 'keyword', 'keyword_url', 'channel_name',
    'channel_url' and 'timestamp' in place into dic_my_activity_youtube.
    Body entries are extracted via TakeoutHtmlParser.find_log_body().
    """
    list_youtube_event_logs = TakeoutHtmlParser.find_log_body(youtube_logs)
    if list_youtube_event_logs != []:
        idx = 0
        for content in list_youtube_event_logs:
            content = str(content).strip()
            # Non-breaking spaces come through as U+00A0; normalize them.
            content = content.replace(u'\xa0', ' ')
            if idx == 0:
                # First entry names the action.
                if content == 'Searched for':
                    dic_my_activity_youtube['type'] = 'Search'
                elif content.startswith('Watched'):
                    dic_my_activity_youtube['type'] = 'Watch'
                    # Line longer than the bare verb and containing a
                    # space: keep it as the keyword.  NOTE(review): the
                    # whole line, verb included, is stored.
                    if len(content) >= 8 and content.find(' ') >= 1:
                        dic_my_activity_youtube[
                            'keyword'] = TakeoutHtmlParser.remove_special_char(
                                content)
                elif content.startswith('Visited'):
                    dic_my_activity_youtube['type'] = 'Visit'
                    if len(content) >= 8 and content.find(' ') >= 1:
                        dic_my_activity_youtube[
                            'keyword'] = TakeoutHtmlParser.remove_special_char(
                                content)
                else:
                    # Unknown action: keep the raw text as the type.
                    dic_my_activity_youtube['type'] = content
            else:
                if idx == 1:
                    # Second entry: anchor whose text is the keyword and
                    # whose href (percent-decoded) is the keyword URL.
                    if content.startswith('<a href="'):
                        idx2 = content.find('">')
                        keyword = content[idx2 + 2:content.find('</a>')]
                        dic_my_activity_youtube[
                            'keyword'] = TakeoutHtmlParser.remove_special_char(
                                keyword)
                        url = content[9:idx2]
                        url = unquote(url)
                        dic_my_activity_youtube[
                            'keyword_url'] = TakeoutHtmlParser.remove_special_char(
                                url)
                else:
                    # Later entries: for watches, an anchor naming the
                    # channel; any entry ending in 'UTC' carries the
                    # timestamp.
                    if dic_my_activity_youtube['type'] == 'Watch':
                        if content.startswith('<a href="'):
                            idx2 = content.find('">')
                            channel_name = content[idx2 +
                                                   2:content.find('</a>')]
                            dic_my_activity_youtube[
                                'channel_name'] = TakeoutHtmlParser.remove_special_char(
                                    channel_name)
                            url = content[9:idx2]
                            url = unquote(url)
                            dic_my_activity_youtube[
                                'channel_url'] = TakeoutHtmlParser.remove_special_char(
                                    url)
                    if content.endswith('UTC'):
                        dic_my_activity_youtube[
                            'timestamp'] = TakeoutHtmlParser.convert_datetime_to_unixtime(
                                content)
            idx += 1
 def parse_assistant_log_title(dic_my_activity_assistant, assistant_logs):
     list_assistant_title_logs = TakeoutHtmlParser.find_log_title(
         assistant_logs)
     if list_assistant_title_logs != []:
         for content in list_assistant_title_logs:
             content = str(content).strip()
             dic_my_activity_assistant['service'] = content.split(
                 '>')[1].split('<br')[0]
             dic_my_activity_assistant['used_device'] = 'mobile'
 def parse_ganalytics_log_title(dic_my_activity_google_analytics,
                                analytics_logs):
     list_analytics_title_logs = TakeoutHtmlParser.find_log_title(
         analytics_logs)
     if list_analytics_title_logs != []:
         for content in list_analytics_title_logs:
             content = str(content).strip()
             dic_my_activity_google_analytics['service'] = content.split(
                 '>')[1].split('<br')[0]
 def parse_assistant_log_body_text(dic_my_activity_assistant,
                                   assistant_logs, file_path):
     list_assistant_trained_logs = TakeoutHtmlParser.find_log_body_text(
         assistant_logs)
     if list_assistant_trained_logs != []:
         for content in list_assistant_trained_logs:
             content = str(content).strip()
             if content.startswith('<audio controls'):
                 attachment = content.split('>')[2].split('<')[0].lstrip(
                     'Audio file: ').split(' ')[0]
                 attachment_path = os.path.dirname(
                     file_path) + os.sep + attachment
                 if os.path.exists(attachment_path):
                     dic_my_activity_assistant['filepath'] = attachment_path
Example No. 13 (score: 0)
 def parse_history_logs(dic_browser_history, history_logs):
     for k, v in history_logs.items():
         if k == 'time_usec':
             dic_browser_history['timestamp'] = int(v) // 1000000
         elif k == 'page_transition':
             dic_browser_history['page_transition'] = v
         elif k == 'url':
             dic_browser_history[
                 'url'] = TakeoutHtmlParser.remove_special_char(unquote(v))
             o = urlparse(v)
             if o.netloc.startswith('m.'):
                 dic_browser_history['used_device'] = 'mobile'
         elif k == 'title':
             dic_browser_history['title'] = v.replace("\"", "\'")
         elif k == 'client_id':
             dic_browser_history['client_id'] = v
         elif k == 'favicon_url':
             dic_browser_history['favicon_url'] = v
Example No. 14 (score: 0)
    def parse_device_info(case):
        # print("input dir: ", case.takeout_android_device_configuration_service_path)
        list_target_files = os.listdir(
            case.takeout_android_device_configuration_service_path)
        if list_target_files == []:
            logger.error('Takeout data not exist.')
            return False
        for file_name in list_target_files:
            if file_name.startswith('Device-') == False:
                continue

            file_path = case.takeout_android_device_configuration_service_path + os.sep + file_name

            with open(file_path, 'r', encoding='utf-8') as f:
                file_contents = f.read()
                soup = BeautifulSoup(file_contents, 'lxml')
                list_device_conf_logs = TakeoutHtmlParser.find_category_title(
                    soup)
                print(list_device_conf_logs)
 def parse_assistant_log_body(dic_my_activity_assistant, assistant_logs):
     list_assistant_search_logs = TakeoutHtmlParser.find_log_body(
         assistant_logs)
     if list_assistant_search_logs != []:
         idx = 0
         for content in list_assistant_search_logs:
             content = str(content).strip()
             content = content.replace(u'\xa0', ' ')
             if idx == 0:
                 if content.startswith('Said'):
                     dic_my_activity_assistant['type'] = 'Search'
                     if len(content) >= 5 and content.find(' ') >= 1:
                         keyword = content.split(' ', 1)[1]
                         dic_my_activity_assistant[
                             'keyword'] = TakeoutHtmlParser.remove_special_char(
                                 keyword)
                 elif content.startswith('Used'):
                     dic_my_activity_assistant['type'] = 'Use'
                     if len(content) >= 5 and content.find(' ') >= 1:
                         keyword = content.split(' ', 1)[1]
                         dic_my_activity_assistant[
                             'keyword'] = TakeoutHtmlParser.remove_special_char(
                                 keyword)
                 elif content.startswith('Trained'):
                     dic_my_activity_assistant['type'] = 'Train'
                     if len(content) >= 8 and content.find(' ') >= 1:
                         keyword = content.split(' ', 1)[1]
                         dic_my_activity_assistant[
                             'keyword'] = TakeoutHtmlParser.remove_special_char(
                                 keyword)
                 elif content.startswith('Selected') or content.startswith(
                         'Listened'):
                     dic_my_activity_assistant['type'] = 'Use'
                     if len(content) >= 9 and content.find(' ') >= 1:
                         dic_my_activity_assistant[
                             'keyword'] = TakeoutHtmlParser.remove_special_char(
                                 content)
             else:
                 if idx == 1:
                     if content.startswith('<a href="'):
                         idx2 = content.find('">')
                         keyword = content[idx2 + 2:content.find('</a>')]
                         dic_my_activity_assistant[
                             'keyword'] = TakeoutHtmlParser.remove_special_char(
                                 keyword)
                         url = content[9:idx2]
                         url = unquote(url)
                         dic_my_activity_assistant['keyword_url'] = url
                 elif content.endswith('UTC'):
                     dic_my_activity_assistant[
                         'timestamp'] = TakeoutHtmlParser.convert_datetime_to_unixtime(
                             content)
                 elif idx != 1 and content != '<br/>':
                     if content.startswith('<a href="'):
                         idx2 = content.find('">')
                         keyword = content[idx2 + 2:content.find('</a>')]
                         dic_my_activity_assistant[
                             'result'] = TakeoutHtmlParser.remove_special_char(
                                 keyword)
                         url = content[9:idx2]
                         url = unquote(url)
                         dic_my_activity_assistant['result_url'] = url
                         o = urlparse(url)
                     else:
                         dic_my_activity_assistant[
                             'result'] += TakeoutHtmlParser.remove_special_char(
                                 content)
             idx += 1
 def parse_video_search_log_title(dic_my_activity_video_search, video_search_logs):
     list_video_search_title_logs = TakeoutHtmlParser.find_log_title(video_search_logs)
     if list_video_search_title_logs != []:
         for content in list_video_search_title_logs:
             content = str(content).strip()
             dic_my_activity_video_search['service'] = content.split('>')[1].split('<br')[0]
Example No. 17 (score: 0)
 def parse_chrome_log_title(dic_my_activity_chrome, chrome_logs):
     list_chrome_title_logs = TakeoutHtmlParser.find_log_title(chrome_logs)
     if list_chrome_title_logs != []:
         for content in list_chrome_title_logs:
             content = str(content).strip()
             dic_my_activity_chrome['service'] = content.split('>', 1)[1].split('<br')[0]
Example No. 18 (score: 0)
    def parse_maps_log_body(dic_my_activity_maps, maps_logs):
        list_maps_event_logs = TakeoutHtmlParser.find_log_body(maps_logs)
        if list_maps_event_logs != []:
            idx = 0
            for content in list_maps_event_logs:
                content = str(content).strip()
                content = content.replace(u'\xa0', ' ')
                if idx == 0:
                    if content.startswith('<a href="'):
                        url = content[9:content.find('">')]
                        keyword = content.split('>')[1].split('</a')[0]
                        dic_my_activity_maps['keyword'] = keyword.replace(
                            "\"", "\'")

                        if keyword.startswith('View'):
                            dic_my_activity_maps['type'] = 'View'
                        else:
                            dic_my_activity_maps['type'] = 'Search'
                        url = unquote(url)
                        dic_my_activity_maps[
                            'keyword_url'] = TakeoutHtmlParser.remove_special_char(
                                url)
                        o = urlparse(url)
                        if o.path.startswith('/maps/@'):
                            list_value = o.path.lstrip('/maps/@').split(',')
                            if list_value != []:
                                latitude = list_value[0]
                                longitude = list_value[1]
                                dic_my_activity_maps[
                                    'keyword_latitude'] = latitude
                                dic_my_activity_maps[
                                    'keyword_longitude'] = longitude
                        elif o.path.find('@') >= 1:
                            list_value = o.path.split('@')[1].split(',')
                            if list_value != []:
                                latitude = list_value[0]
                                longitude = list_value[1]
                                dic_my_activity_maps[
                                    'keyword_latitude'] = latitude
                                dic_my_activity_maps[
                                    'keyword_longitude'] = longitude
                        elif o.query.find('sll=') >= 1:
                            list_value = o.query.split('sll=', 1)[1].split(',')
                            if list_value != []:
                                latitude = list_value[0]
                                longitude = list_value[1].split('&')[0]
                                dic_my_activity_maps[
                                    'keyword_latitude'] = latitude
                                dic_my_activity_maps[
                                    'keyword_longitude'] = longitude
                    else:
                        if content == 'Searched for':
                            dic_my_activity_maps['type'] = 'Search'
                        elif content.startswith('Shared'):
                            dic_my_activity_maps['type'] = 'Share'
                        elif content.startswith('Viewed'):
                            dic_my_activity_maps['type'] = 'View'
                            if content == 'Viewed For you':
                                dic_my_activity_maps[
                                    'keyword'] = TakeoutHtmlParser.remove_special_char(
                                        content)
                        elif content == 'Used Maps':
                            dic_my_activity_maps['type'] = 'Use'
                            dic_my_activity_maps[
                                'keyword'] = TakeoutHtmlParser.remove_special_char(
                                    content)
                        elif content.startswith('Answered'):
                            dic_my_activity_maps['type'] = 'Answer'
                            dic_my_activity_maps[
                                'keyword'] = TakeoutHtmlParser.remove_special_char(
                                    content)
                        else:
                            dic_my_activity_maps['type'] = content
                else:
                    if idx == 1:
                        if content.startswith('<a href="'):
                            idx2 = content.find('">')
                            keyword = content[idx2 + 2:content.find('</a>')]
                            dic_my_activity_maps[
                                'keyword'] = TakeoutHtmlParser.remove_special_char(
                                    keyword)
                            url = content[9:idx2]
                            url = unquote(url)
                            dic_my_activity_maps[
                                'keyword_url'] = TakeoutHtmlParser.remove_special_char(
                                    url)
                            o = urlparse(url)
                            if o.path.startswith(
                                    '/maps/') and o.path.find('@') >= 1:
                                list_value = o.path.split('@')[1].split(',')
                                if list_value != []:
                                    latitude = list_value[0]
                                    longitude = list_value[1]
                                    dic_my_activity_maps[
                                        'keyword_latitude'] = latitude
                                    dic_my_activity_maps[
                                        'keyword_longitude'] = longitude
                            elif o.query.find('sll=') >= 1:
                                list_value = o.query.split('sll=',
                                                           1)[1].split(',')
                                if list_value != []:
                                    latitude = list_value[0]
                                    longitude = list_value[1].split('&')[0]
                                    dic_my_activity_maps[
                                        'keyword_latitude'] = latitude
                                    dic_my_activity_maps[
                                        'keyword_longitude'] = longitude
                    else:
                        if content.endswith('UTC'):
                            dic_my_activity_maps[
                                'timestamp'] = TakeoutHtmlParser.convert_datetime_to_unixtime(
                                    content)
                        elif idx == 4 and dic_my_activity_maps[
                                'type'] == '1 notification':
                            dic_my_activity_maps[
                                'keyword'] = TakeoutHtmlParser.remove_special_char(
                                    content)
                idx += 1