def proc_fb_others_post_standard(self, path):
     '''
     Build the dataframe of "others post" entries that have both a post and a title.

     The frames returned by ``others_post_post`` and ``others_post_title`` are
     merged on their shared ``timestamp`` column (default inner join), the
     timestamp is copied into ``date`` and the result is handed to
     ``main_transfo_timestamp_10`` together with 'Facebook' and 'others post'.
     Depending on the ALL_INDEX / ALL_GENERAL flags the post and title are
     attached under different column names.

     Returns the resulting dataframe sorted by ``date`` (ascending).
     '''
     posts = self.others_post_post(path)
     titles = self.others_post_title(path)

     merged = pd.merge(posts, titles, on='timestamp')
     merged['date'] = merged['timestamp']
     main_df = main_transfo_timestamp_10(merged, 'Facebook', 'others post')

     if ALL_INDEX:
         # Detailed layout: keep the original column names.
         main_df['post'] = merged['post']
         main_df['title'] = merged['title']
         index_columns = ['date', 'type', 'label', 'title', 'post',
                          'Year', 'Month', 'Day', 'Hour']
         main_df = main_df[index_columns]

     if ALL_GENERAL:
         # Generic layout: title -> name, post -> content.
         main_df['name'] = merged['title']
         main_df['content'] = merged['post']
         general_columns = ['date', 'type', 'label', 'name', 'content',
                            'Year', 'Month', 'Day', 'Hour']
         main_df = main_df[general_columns]

     main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

     return main_df
 def proc_fb_others_post_without_title(self, path):
     '''
     Return the posts that have no title (links, for example).

     The frame from ``others_post_post`` is left-joined on ``date`` with the
     frame from ``proc_fb_others_post_standard`` (posts having both a title and
     a post). The merge indicator tells whether a date exists in both frames or
     only in the left one; rows found in both are discarded, so only posts
     without a title remain.

     Returns the resulting dataframe sorted by ``date`` (ascending).
     '''
     posts = self.others_post_post(path)
     posts['date'] = posts['timestamp']
     posts['date'] = posts['date'].apply(lambda ts: pd.to_datetime(ts, unit='s'))

     standard = self.proc_fb_others_post_standard(path)

     df_all = posts.merge(standard.drop_duplicates(), on='date',
                          how='left', indicator=True)
     # Keep only rows that exist in the posts frame alone.
     df_all = df_all[df_all['_merge'] == 'left_only']
     # 'date' is reset to the raw timestamp (as a string) before the common transformation.
     df_all['date'] = df_all['timestamp'].astype(str)
     df_all['name'] = df_all['date'].apply(lambda _: 'NaN')
     main_df = main_transfo_timestamp_10(df_all, 'Facebook', 'others post')

     if ALL_INDEX:
         # 'post_x' is the post column coming from the left side of the merge.
         main_df['post'] = df_all['post_x']
         index_columns = ['date', 'type', 'label', 'post',
                          'Year', 'Month', 'Day', 'Hour']
         main_df = main_df[index_columns]

     if ALL_GENERAL:
         main_df['name'] = df_all['name']
         main_df['content'] = df_all['post_x']
         general_columns = ['date', 'type', 'label', 'name', 'content',
                            'Year', 'Month', 'Day', 'Hour']
         main_df = main_df[general_columns]

     main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

     return main_df
Exemple #3
0
    def read(self):
        '''
        Load the photo file at ``self.path`` and store the result in ``self.df``.

        The file is opened with ``open_photo_file`` and converted with
        ``give_a_photo_dict``; a fixed set of keys is dropped before the
        dictionary becomes a dataframe. ``creation_timestamp`` is used as the
        date and the photo ``uri`` is exposed according to the
        ALL_INDEX / ALL_GENERAL flags. The stored frame is sorted by ``date``.
        '''
        raw = self.open_photo_file(self.path)
        photo_fields = self.give_a_photo_dict(raw)

        # Keys that are not wanted as dataframe columns.
        for unwanted in ('name', 'author', 'timestamp', 'comment', 'title',
                         'description', 'last_modified_timestamp',
                         'media_metadata'):
            photo_fields.pop(unwanted, None)

        df = pd.DataFrame.from_dict(photo_fields)
        df['date'] = df['creation_timestamp']
        # Placeholder name column: the literal string 'NaN' on every row.
        df['name'] = df['date'].apply(lambda _: 'NaN')
        main_df = main_transfo_timestamp_10(df, 'Facebook', 'photo')

        if ALL_INDEX:
            main_df['uri'] = df['uri']
            index_columns = ['date', 'type', 'label', 'uri',
                             'Year', 'Month', 'Day', 'Hour']
            main_df = main_df[index_columns]

        if ALL_GENERAL:
            main_df['name'] = df['name']
            main_df['content'] = df['uri']
            general_columns = ['date', 'type', 'label', 'name', 'content',
                               'Year', 'Month', 'Day', 'Hour']
            main_df = main_df[general_columns]

        main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

        self.df = main_df
 def read(self):
     '''
     Load the ``groups_joined`` entries of the JSON file at ``self.path`` into ``self.df``.

     Each entry is labelled 'group leaved' when its title starts with the
     French "you stopped being a member of" sentence, and 'group joined'
     otherwise. The title is exposed according to the ALL_INDEX / ALL_GENERAL
     flags and the stored frame is sorted by ``date``.
     '''
     data = pd.read_json(self.path, encoding='utf-8')
     df = pd.DataFrame(list(data['groups_joined']))
     df['date'] = df['timestamp']
     # Placeholder name column: the literal string 'NaN' on every row.
     df['name'] = df['date'].apply(lambda _: 'NaN')
     main_df = main_transfo_timestamp_10(df, 'Facebook', 'group')

     # Title prefix Facebook uses (French locale) when a group was left.
     left_prefix = 'Vous avez arrêté d’être membre de '
     main_df['label'] = [
         'group leaved' if title.startswith(left_prefix) else 'group joined'
         for title in df['title']
     ]

     if ALL_INDEX:
         main_df['title'] = df['title']
         index_columns = ['date', 'type', 'label', 'title',
                          'Year', 'Month', 'Day', 'Hour']
         main_df = main_df[index_columns]

     if ALL_GENERAL:
         main_df['name'] = df['name']
         main_df['content'] = df['title']
         general_columns = ['date', 'type', 'label', 'name', 'content',
                            'Year', 'Month', 'Day', 'Hour']
         main_df = main_df[general_columns]

     main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

     self.df = main_df
 def read(self):
     '''
     Load the ``location_history`` entries of the JSON file at ``self.path`` into ``self.df``.

     The per-entry ``coordinate`` dictionaries are unpacked into ``longitude``
     and ``latitude`` columns, ``creation_timestamp`` is used as the date, and
     the stored frame (name, latitude, longitude plus the date columns) is
     sorted by ``date``.
     '''
     data = pd.read_json(self.path, encoding='utf-8')
     df = pd.DataFrame(list(data['location_history']))

     # Collect every coordinate key into a list of its values, one per row.
     coordinates = {}
     for coordinate in df['coordinate']:
         for axis, value in coordinate.items():
             coordinates.setdefault(axis, []).append(value)

     df['longitude'] = coordinates['longitude']
     df['latitude'] = coordinates['latitude']
     df['date'] = df['creation_timestamp']
     # Placeholder content column: the literal string 'NaN' on every row.
     df['content'] = df['date'].apply(lambda _: 'NaN')

     main_df = main_transfo_timestamp_10(df, 'Facebook', 'location')

     for column in ('name', 'latitude', 'longitude'):
         main_df[column] = df[column]

     wanted = ['date', 'type', 'label', 'name', 'latitude', 'longitude',
               'Year', 'Month', 'Day', 'Hour']
     main_df = main_df[wanted]

     main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

     self.df = main_df
    def read(self):
        '''
        Load the ``searches`` entries of the JSON file at ``self.path`` into ``self.df``.

        The nested ``attachments`` structure (list of dicts whose values are
        lists of dicts) is flattened, and the collected ``text`` values become
        the ``search`` column. The search text is exposed according to the
        ALL_INDEX / ALL_GENERAL flags and the stored frame is sorted by ``date``.
        '''
        data = pd.read_json(self.path)
        df = pd.DataFrame(list(data['searches']))

        # Flatten attachments -> dict -> list -> dict into {key: [values...]}.
        flattened = {}
        for attachments in df['attachments']:
            for attachment in attachments:
                for entries in attachment.values():
                    for entry in entries:
                        for field, value in entry.items():
                            flattened.setdefault(field, []).append(value)

        df['search'] = flattened['text']
        df['date'] = df['timestamp']
        # Placeholder name column: the literal string 'NaN' on every row.
        df['name'] = df['date'].apply(lambda _: 'NaN')
        main_df = main_transfo_timestamp_10(df, 'Facebook', 'search_history')

        if ALL_INDEX:
            main_df['search'] = df['search']
            index_columns = ['date', 'type', 'label', 'search',
                             'Year', 'Month', 'Day', 'Hour']
            main_df = main_df[index_columns]

        if ALL_GENERAL:
            main_df['name'] = df['name']
            main_df['content'] = df['search']
            general_columns = ['date', 'type', 'label', 'name', 'content',
                               'Year', 'Month', 'Day', 'Hour']
            main_df = main_df[general_columns]

        main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

        self.df = main_df
 def proc_fb_others_post_without_posts(self, path):
     '''
     Return the entries that have a title but no post text (a picture or a
     link, for example: "xxx posted a photo on your timeline").

     The frame from ``others_post_title`` is left-joined on ``date`` with the
     frame from ``proc_fb_others_post_standard``; only rows present in the
     left frame alone are kept.

     Returns the resulting dataframe sorted by ``date`` (ascending).
     '''
     titles = self.others_post_title(path)
     titles['date'] = titles['timestamp']
     titles['date'] = titles['date'].apply(lambda ts: pd.to_datetime(ts, unit='s'))

     standard = self.proc_fb_others_post_standard(path)

     df_all = titles.merge(standard.drop_duplicates(), on='date',
                           how='left', indicator=True)
     # Keep only rows that exist in the titles frame alone.
     df_all = df_all[df_all['_merge'] == 'left_only']
     # 'date' is reset to the raw timestamp (as a string) before the common transformation.
     df_all['date'] = df_all['timestamp'].astype(str)
     # Placeholder content column: the literal string 'NaN' on every row.
     df_all['content'] = df_all['date'].apply(lambda _: 'NaN')
     main_df = main_transfo_timestamp_10(df_all, 'Facebook', 'others post')

     if ALL_INDEX:
         # 'title_x' is the title column coming from the left side of the merge.
         main_df['title'] = df_all['title_x']
         index_columns = ['date', 'type', 'label', 'title',
                          'Year', 'Month', 'Day', 'Hour']
         main_df = main_df[index_columns]

     if ALL_GENERAL:
         main_df['name'] = df_all['title_x']
         main_df['content'] = df_all['content']
         general_columns = ['date', 'type', 'label', 'name', 'content',
                            'Year', 'Month', 'Day', 'Hour']
         main_df = main_df[general_columns]

     main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

     return main_df
    def read(self):
        '''
        Load the Facebook posts JSON file at ``self.path`` into ``self.df``.

        Facebook does not differentiate between posts on your own wall, posts
        on someone else's wall, status updates or "moods"; that distinction
        was not needed for the original analysis but may matter for other uses.

        Only rows without ``attachments`` are kept. The integer ``timestamp``
        is used as the date, the title is exposed according to the
        ALL_INDEX / ALL_GENERAL flags, and the stored frame is sorted by ``date``.
        '''
        df = pd.read_json(self.path, convert_dates=False, encoding='utf-8')
        # Rows with attachments are excluded here.
        df = df[df['attachments'].isnull()]
        df['date'] = df['timestamp'].astype(int)
        # Placeholder name column: the literal string 'NaN' on every row.
        df['name'] = df['date'].apply(lambda _: 'NaN')
        main_df = main_transfo_timestamp_10(df, 'Facebook', 'post')

        if ALL_INDEX:
            main_df['title'] = df['title']
            index_columns = ['date', 'type', 'label', 'title',
                             'Year', 'Month', 'Day', 'Hour']
            main_df = main_df[index_columns]

        if ALL_GENERAL:
            main_df['name'] = df['name']
            main_df['content'] = df['title']
            general_columns = ['date', 'type', 'label', 'name', 'content',
                               'Year', 'Month', 'Day', 'Hour']
            main_df = main_df[general_columns]

        main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

        self.df = main_df
Exemple #9
0
 def read(self):
     '''
     Load the photo file at ``self.path`` (opened with ``open_photo_file``) into ``self.df``.

     Top-level values that are dictionaries are flattened one level (their
     keys become columns); other values are kept under their own key. The
     'name', 'photos', 'comments' and 'description' keys are dropped,
     ``last_modified_timestamp`` is used as the date, and the ``uri`` is
     exposed according to the ALL_INDEX / ALL_GENERAL flags. The stored
     frame is sorted by ``date``.
     '''
     data = self.open_photo_file(self.path)

     flattened = {}
     for key, value in data.items():
         # A dict contributes its own items; anything else is stored as-is.
         if isinstance(value, dict):
             pairs = value.items()
         else:
             pairs = [(key, value)]
         for field, item in pairs:
             flattened.setdefault(field, []).append(item)

     for unwanted in ('name', 'photos', 'comments', 'description'):
         flattened.pop(unwanted, None)

     df = pd.DataFrame.from_dict(flattened)
     df['date'] = df['last_modified_timestamp']
     # Placeholder name column: the literal string 'NaN' on every row.
     df['name'] = df['date'].apply(lambda _: 'NaN')
     main_df = main_transfo_timestamp_10(df, 'Facebook', 'photo')

     if ALL_INDEX:
         main_df['uri'] = df['uri']
         index_columns = ['date', 'type', 'label', 'uri',
                          'Year', 'Month', 'Day', 'Hour']
         main_df = main_df[index_columns]

     if ALL_GENERAL:
         main_df['name'] = df['name']
         main_df['content'] = df['uri']
         general_columns = ['date', 'type', 'label', 'name', 'content',
                            'Year', 'Month', 'Day', 'Hour']
         main_df = main_df[general_columns]

     main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

     self.df = main_df
Exemple #10
0
    def read(self):
        '''
        Load reactions that carry an attachment (sticker) into ``self.df``.

        'attachments' contains the path to the sticker used, which can be
        useful for some usages. Only rows with an attachment are kept. For
        each row the value at column position 0 is collected as the date and
        the first element of the value at column position 3 as the nested
        attachment dictionary. The nested dictionaries are then flattened so
        that only the innermost key/value pairs remain, a dataframe is built
        from them and the collected dates are added to it.
        If ALL_INDEX is true, the path (``uri``) to the sticker is added.

        NOTE(review): positions 0 and 3 are presumably the 'timestamp' and
        'attachments' columns -- confirm against the JSON key order.
        '''

        data = self.read_json_likes(self.path)
        df = pd.DataFrame(list(data['reactions']))
        df = df[df['attachments'].notna()]

        # Single pass over the rows; .iloc makes the positional access
        # explicit (integer keys on a label-indexed Series are deprecated).
        list_date = []
        list_link = []
        for _, row in df.iterrows():
            list_date.append(row.iloc[0])
            list_link.append(row.iloc[3][0])

        # Flatten: attachment -> list value -> first dict -> dict values.
        outputdict = {}
        for attachment in list_link:
            for value in attachment.values():
                if not isinstance(value, list):
                    continue
                for nested in value[0].values():
                    if isinstance(nested, dict):
                        for field, item in nested.items():
                            outputdict.setdefault(field, []).append(item)

        df = pd.DataFrame.from_dict(outputdict)
        df['date'] = list_date
        # Placeholder name column: the literal string 'NaN' on every row.
        df['name'] = df['date'].apply(lambda _: 'NaN')
        df['content'] = df['uri']
        main_df = main_transfo_timestamp_10(df, 'Facebook', 'sticker comment')

        if ALL_INDEX:
            main_df['uri'] = df['uri']
            main_df = main_df[[
                'date', 'type', 'label', 'uri', 'Year', 'Month', 'Day', 'Hour'
            ]]

        if ALL_GENERAL:
            main_df['name'] = df['name']
            main_df['content'] = df['content']
            main_df = main_df[[
                'date', 'type', 'label', 'name', 'content', 'Year', 'Month',
                'Day', 'Hour'
            ]]

        main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)

        self.df = main_df
Exemple #11
0
    def read(self):
        '''
        Build ``self.df`` with the relevant information about Facebook comments.

        1) The 'comments' entries are loaded into a dataframe; only rows
           without attachment (i.e. without a path to a picture) and with
           'data' are kept.
        2) The value at column position 1 of each row is collected into a
           list (NOTE(review): presumably the 'data' column, a dictionary of
           dictionaries -- confirm against the JSON key order).
        3) ``drop_nesteddictionary_fb_comment`` turns that list into a flat
           dictionary; the code below reads its 'timestamp', 'comment' and
           'author' entries.
        4) The integer timestamp is used as the date and the frame is passed
           to ``main_transfo_timestamp_10``.
        5) If ALL_INDEX is true, the comment and author are added; if
           ALL_GENERAL is true they are added as 'content' and 'name'.
        '''

        data = self.read_json_comments(self.path)
        df = pd.DataFrame(list(data['comments']))
        df = df[df['attachments'].isnull()]
        df = df[df['data'].notna()]

        # .iloc makes the positional access explicit (integer keys on a
        # label-indexed Series are deprecated in recent pandas).
        list_data = [row.iloc[1] for _, row in df.iterrows()]

        outputdict = self.drop_nesteddictionary_fb_comment(list_data)

        df1 = pd.DataFrame.from_dict(outputdict)
        df1['date'] = df1['timestamp'].astype(int)
        main_df = main_transfo_timestamp_10(df1, 'Facebook', 'comment')

        if ALL_INDEX:
            main_df['comment'] = df1['comment']
            main_df['author'] = df1['author']
            main_df = main_df[[
                'date', 'type', 'label', 'author', 'comment', 'Year', 'Month',
                'Day', 'Hour'
            ]]

        if ALL_GENERAL:
            main_df['name'] = df1['author']
            main_df['content'] = df1['comment']
            main_df = main_df[[
                'date', 'type', 'label', 'name', 'content', 'Year', 'Month',
                'Day', 'Hour'
            ]]

        main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)

        self.df = main_df
    def read(self):
        '''
        Load posts that carry external content (a URL) into ``self.df``.

        Only rows with ``attachments`` are kept. ``lists_creator`` is called
        with 'external_context'; element 0 of its result is used as the list
        of dates and element 1 as the list of attachments. The nested
        attachment structure is flattened, 'name' and 'source' are dropped,
        and the ``url`` is exposed according to the ALL_INDEX / ALL_GENERAL
        flags. The stored frame is sorted by ``date``.

        Idea: a single generic helper could un-nest all these dictionary layouts.
        '''
        df = pd.read_json(self.path, convert_dates=False, encoding='utf-8')
        df = df[df['attachments'].notna()]

        returned = lists_creator(df, 'external_context')
        list_date, list_att = returned[0], returned[1]

        # Flatten list -> dict -> list -> dict -> dict into {key: [values...]}.
        flattened = {}
        for attachments in list_att:
            for attachment in attachments:
                for entries in attachment.values():
                    for entry in entries:
                        for payload in entry.values():
                            for field, value in payload.items():
                                flattened.setdefault(field, []).append(value)
        for unwanted in ('name', 'source'):
            flattened.pop(unwanted, None)

        df1 = pd.DataFrame.from_dict(flattened)
        df1['date'] = list_date
        # Placeholder name column: the literal string 'NaN' on every row.
        df1['name'] = df1['date'].apply(lambda _: 'NaN')
        main_df = main_transfo_timestamp_10(
            df1, 'Facebook', 'post external content')

        if ALL_INDEX:
            main_df['url'] = df1['url']
            index_columns = ['date', 'type', 'label', 'url',
                             'Year', 'Month', 'Day', 'Hour']
            main_df = main_df[index_columns]

        if ALL_GENERAL:
            main_df['name'] = df1['name']
            main_df['content'] = df1['url']
            general_columns = ['date', 'type', 'label', 'name', 'content',
                               'Year', 'Month', 'Day', 'Hour']
            main_df = main_df[general_columns]

        main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

        self.df = main_df
Exemple #13
0
    def read(self):
        '''
        Load comments that carry an attachment into ``self.df``.

        Only rows with ``attachments`` are kept. For each row the value at
        column position 0 is collected as the date and the value at column
        position 3 as the attachment list. The nested attachment structure is
        flattened, 'creation_timestamp', 'media_metadata' and 'title' are
        dropped, and the ``uri`` is exposed according to the
        ALL_INDEX / ALL_GENERAL flags. The stored frame is sorted by ``date``.

        NOTE(review): positions 0 and 3 are presumably the 'timestamp' and
        'attachments' columns -- confirm against the JSON key order.
        '''
        data = self.read_json_comments(self.path)
        df = pd.DataFrame(list(data['comments']))
        df = df[df['attachments'].notna()]

        # Single pass over the rows; .iloc makes the positional access
        # explicit (integer keys on a label-indexed Series are deprecated).
        list_date = []
        list_attachments = []
        for _, row in df.iterrows():
            list_date.append(row.iloc[0])
            list_attachments.append(row.iloc[3])

        # Flatten list -> dict -> list value -> first dict -> dict values.
        outputdict = {}
        for attachments in list_attachments:
            for attachment in attachments:
                for value in attachment.values():
                    if not isinstance(value, list):
                        continue
                    for nested in value[0].values():
                        for field, item in nested.items():
                            outputdict.setdefault(field, []).append(item)

        for unwanted in ('creation_timestamp', 'media_metadata', 'title'):
            outputdict.pop(unwanted, None)

        df = pd.DataFrame.from_dict(outputdict)
        df['date'] = list_date
        df['content'] = df['uri']
        # Placeholder name column: the literal string 'NaN' on every row.
        df['name'] = df['content'].apply(lambda _: 'NaN')
        main_df = main_transfo_timestamp_10(df, 'Facebook', 'sticker comment')

        if ALL_INDEX:
            main_df['uri'] = df['uri']
            main_df = main_df[['date', 'type', 'label', 'uri',
                               'Year', 'Month', 'Day', 'Hour']]

        if ALL_GENERAL:
            main_df['name'] = df['name']
            main_df['content'] = df['content']
            main_df = main_df[['date', 'type', 'label', 'name', 'content',
                               'Year', 'Month', 'Day', 'Hour']]

        main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)

        self.df = main_df
    def read(self):
        '''
        Load posts that carry a 'place' attachment into ``self.df``.

        Only rows with ``attachments`` are kept. ``lists_creator`` is called
        with 'place'; element 0 of its result is used as the list of dates
        and element 1 as the list of attachments. The nested attachment
        structure is flattened, 'url' is dropped and each collected list is
        de-duplicated before the dataframe is built. The place ``name`` is
        exposed according to the ALL_INDEX / ALL_GENERAL flags and the stored
        frame is sorted by ``date``.

        NOTE(review): de-duplication can make the collected lists shorter
        than ``list_date`` (or of different lengths from each other), in
        which case building the frame fails -- confirm this is intended.
        '''
        df = pd.read_json(self.path, convert_dates=False, encoding='utf-8')
        df = df[df['attachments'].notna()]

        returned = lists_creator(df, 'place')
        list_date = returned[0]
        list_att = returned[1]

        # Flatten list -> dict -> list -> dict -> dict into {key: [values...]}.
        outputdict = {}
        for attachments in list_att:
            for attachment in attachments:
                for entries in attachment.values():
                    for entry in entries:
                        for payload in entry.values():
                            for field, value in payload.items():
                                outputdict.setdefault(field, []).append(value)
        outputdict.pop('url', None)

        # De-duplicate while keeping first-seen order. A plain set() returns
        # the values in arbitrary order, which shuffles rows independently in
        # every column and can differ from one run to the next.
        new_outputdict = {
            field: list(dict.fromkeys(values))
            for field, values in outputdict.items()
        }

        df1 = pd.DataFrame.from_dict(new_outputdict)
        df1['date'] = list_date
        # Placeholder content column: the literal string 'NaN' on every row.
        df1['content'] = df1['date'].apply(lambda _: 'NaN')
        main_df = main_transfo_timestamp_10(df1, 'Facebook', 'place')

        if ALL_INDEX:
            main_df['name'] = df1['name']
            main_df = main_df[[
                'date', 'type', 'label', 'name', 'Year', 'Month', 'Day', 'Hour'
            ]]

        if ALL_GENERAL:
            main_df['name'] = df1['name']
            main_df['content'] = df1['content']
            main_df = main_df[[
                'date', 'type', 'label', 'name', 'content', 'Year', 'Month',
                'Day', 'Hour'
            ]]

        main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)

        self.df = main_df
Exemple #15
0
    def read(self):
        '''
        Load the ``page_likes`` entries (read with ``read_json_likes``) into ``self.df``.

        ``timestamp`` is used as the date. The page name is exposed as 'page'
        when ALL_INDEX is set, and as 'name' (with a placeholder 'content')
        when ALL_GENERAL is set. The stored frame is sorted by ``date``.
        '''
        likes = self.read_json_likes(self.path)
        df = pd.DataFrame(list(likes['page_likes']))
        df['date'] = df['timestamp']
        # Placeholder content column: the literal string 'NaN' on every row.
        df['content'] = df['date'].apply(lambda _: 'NaN')
        main_df = main_transfo_timestamp_10(df, 'Facebook', 'like page')

        if ALL_INDEX:
            main_df['page'] = df['name']
            index_columns = ['date', 'type', 'label', 'page',
                             'Year', 'Month', 'Day', 'Hour']
            main_df = main_df[index_columns]

        if ALL_GENERAL:
            main_df['name'] = df['name']
            main_df['content'] = df['content']
            general_columns = ['date', 'type', 'label', 'name', 'content',
                               'Year', 'Month', 'Day', 'Hour']
            main_df = main_df[general_columns]

        main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

        self.df = main_df
    def read(self):
        '''
        Load pictures posted on Facebook into ``self.df``, with the path to
        the picture inside the Facebook export and a value taken from the
        picture metadata (stored as ``ip``).

        Only rows with ``attachments`` are kept. Element 1 of the
        ``lists_creator(df, 'media')`` result is flattened: only entries
        stored under a 'media' key contribute columns. The per-picture
        ``media_metadata`` dictionaries are flattened one level, 'orientation'
        is dropped, and the first remaining list of values becomes ``ip``.

        NOTE(review): ``ip`` is simply the first list left in the flattened
        metadata -- presumably the upload IP; confirm against real data.
        '''
        df = pd.read_json(self.path, convert_dates=False, encoding='utf-8')
        df = df[df['attachments'].notna()]

        list_att = lists_creator(df, 'media')[1]

        media_fields = {}
        for attachments in list_att:
            for attachment in attachments:
                for entries in attachment.values():
                    for entry in entries:
                        for kind, payload in entry.items():
                            # Only the 'media' sub-dictionary is of interest.
                            if kind != 'media':
                                continue
                            for field, value in payload.items():
                                media_fields.setdefault(field, []).append(value)

        media_fields.pop('description', None)

        # Flatten each metadata dictionary one level deep.
        metadata_fields = {}
        for metadata in media_fields['media_metadata']:
            for meta_key, meta_value in metadata.items():
                if isinstance(meta_value, dict):
                    for sub_key, sub_value in meta_value.items():
                        metadata_fields.setdefault(sub_key, []).append(sub_value)
                else:
                    metadata_fields.setdefault(meta_key, []).append(meta_value)
        metadata_fields.pop('orientation', None)

        metadata_lists = list(metadata_fields.values())

        df1 = pd.DataFrame.from_dict(media_fields)
        df1['ip'] = metadata_lists[0]
        df1['date'] = df1['creation_timestamp'].astype(int)
        main_df = main_transfo_timestamp_10(df1, 'Facebook', 'post picture')

        if ALL_INDEX:
            main_df['uri'] = df1['uri']
            main_df['ip'] = df1['ip']
            index_columns = ['date', 'type', 'label', 'uri', 'ip',
                             'Year', 'Month', 'Day', 'Hour']
            main_df = main_df[index_columns]

        if ALL_GENERAL:
            main_df['name'] = df1['ip']
            main_df['content'] = df1['uri']
            general_columns = ['date', 'type', 'label', 'name', 'content',
                               'Year', 'Month', 'Day', 'Hour']
            main_df = main_df[general_columns]

        main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

        self.df = main_df
 def proc_fb_others_post_where_everything_is_missing(self, path):
     '''
     Return a dataframe of the rows still untreated (i.e. without title and post).

     1) ``list_timestamp_rebels`` collects the timestamps of the entries
        accepted by ``self.gen_3`` (rows with 'attachments'; some posts that
        do have a post text also carry attachments).
     2) ``concat_all_for_a_list`` returns the timestamps already treated;
        ``main_list`` keeps only the timestamps not treated yet.
     3) For entries accepted by both ``gen_3`` and ``gen_4``, the nested
        dictionaries are flattened into one dictionary of lists.
        NB: this layout was derived from a single observation, so it may
        need changes for other users' data.
     4) A dataframe is built from that dictionary and processed as usual.

     Returns the resulting dataframe sorted by ``date`` (ascending).
     '''

     data = pd.read_json(path, encoding='utf-8')
     df = pd.DataFrame(list(data["wall_posts_sent_to_you"])).T

     # After the transpose the columns are 0..n-1; column 0 of each row is kept.
     list_data = [row[0] for _, row in df.iterrows()]

     list_timestamp_rebels = []
     for entry in list_data:
         if self.gen_3(entry) and 'timestamp' in entry:
             list_timestamp_rebels.append(entry['timestamp'])

     list_timestamp = self.concat_all_for_a_list(path)

     main_list = list(set(list_timestamp_rebels) - set(list_timestamp))

     outputdict = {}
     for entry in list_data:
         if not self.gen_3(entry):
             continue
         if not self.gen_4(entry, main_list):
             continue
         for key, value in entry.items():
             if not isinstance(value, list):
                 # Scalar field: stored under its own key.
                 outputdict.setdefault(key, []).append(value)
                 continue
             for sub_entry in value:
                 for k2, v2 in sub_entry.items():
                     if isinstance(v2, list):
                         # list -> dict -> dict: keep the innermost pairs.
                         for item in v2:
                             for nested in item.values():
                                 for k4, v4 in nested.items():
                                     outputdict.setdefault(k4, []).append(v4)
                     else:
                         outputdict.setdefault(k2, []).append(v2)

     df1 = pd.DataFrame.from_dict(outputdict)
     df1['date'] = df1['timestamp']
     # Placeholder name column: the literal string 'NaN' on every row.
     df1['name'] = df1['date'].apply(lambda _: 'NaN')
     main_df = main_transfo_timestamp_10(df1, 'Facebook', 'others post')

     if ALL_INDEX:
         main_df['url'] = df1['url']
         main_df = main_df[['date', 'type', 'label', 'url',
                            'Year', 'Month', 'Day', 'Hour']]

     if ALL_GENERAL:
         main_df['name'] = df1['name']
         main_df['content'] = df1['url']
         # 'Hour' was missing here, unlike every other ALL_GENERAL layout.
         main_df = main_df[['date', 'type', 'label', 'name', 'content',
                            'Year', 'Month', 'Day', 'Hour']]

     main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)

     return main_df
Exemple #18
0
    def read(self):
        '''
        Load a friends-related JSON file into ``self.df``: the date of the
        friendship event and (if wanted) the name of the friend.

        The same structure is used for every kind of friend interaction, so
        this one method handles all of them. The JSON key to read
        ('friends', 'received_requests', 'rejected_requests',
        'deleted_friends' or 'sent_requests') is chosen from the file name in
        ``self.path`` and is also passed on to ``main_transfo_timestamp_10``.

        The plain friends file also carries the e-mail address of some
        friends; it is added when FRIENDS_WITH_CONTACTS is set.
        '''
        text = self.path
        data = pd.read_json(self.path, encoding='utf-8')

        # (regex searched in the path, JSON key) -- first match wins.
        file_patterns = (
            ('received_friend_requests.json', 'received_requests'),
            ('rejected_friend_requests.json', 'rejected_requests'),
            ('removed_friends', 'deleted_friends'),
            ('sent_friend_requests.json', 'sent_requests'),
        )
        label_data = 'friends'
        for pattern, json_key in file_patterns:
            if re.search(pattern, text):
                label_data = json_key
                break

        df = pd.DataFrame(list(data[label_data]))
        df['date'] = df['timestamp']
        main_df = main_transfo_timestamp_10(df, 'Facebook', label_data)

        date_columns = ['Year', 'Month', 'Day', 'Hour']

        if ALL_INDEX:
            main_df['name'] = df['name']
            main_df = main_df[['date', 'type', 'label', 'name'] + date_columns]

        if FRIENDS_WITH_CONTACTS:
            # Restart from the rows that have contact info only.
            df1 = df[df['contact_info'].notna()]
            main_df = main_transfo_timestamp_10(df1, 'Facebook', 'friend')
            main_df['name'] = df1['name']
            main_df['email'] = df1['contact_info']
            main_df = main_df[['date', 'type', 'label', 'name', 'email']
                              + date_columns]

        if ALL_GENERAL_FRIENDS:
            main_df['name'] = df['name']
            main_df['content'] = df['contact_info']
            main_df = main_df[['date', 'type', 'label', 'name', 'content']
                              + date_columns]

        if ALL_GENERAL_ELSE:
            main_df['name'] = df['name']
            # Placeholder content column: the literal string 'NaN' on every row.
            main_df['content'] = df['name'].apply(lambda _: 'NaN')
            main_df = main_df[['date', 'type', 'label', 'name', 'content']
                              + date_columns]

        main_df.sort_values(by=["date"], axis=0, ascending=True, inplace=True)

        self.df = main_df
Exemple #19
0
    def data_fb_pictures_sent_by_msg(self, path):
        '''
        Recover the uris of pictures, files and videos exchanged by message.

        The JSON file at ``path`` is loaded and converted with
        ``self.get_adict``. For each of the 'photos', 'files' and 'videos'
        keys present in that dictionary, the nested list of dictionaries is
        flattened into a dataframe ('thumbnail' is dropped for videos). When
        several kinds of media were exchanged in the conversation, their
        dataframes are concatenated in that order.

        Returns:
            ``(main_df, 1)`` when at least one kind of media was found, where
            ``main_df`` is sorted by ``date``; ``(0, 0)`` otherwise.
        '''
        with open(path, encoding='utf-8') as json_data:
            data = json.load(json_data)

        outputdict = self.get_adict(data)

        frames = []
        for media_type in ('photos', 'files', 'videos'):
            if media_type not in outputdict:
                continue
            collected = {}
            for entries in outputdict[media_type]:
                for entry in entries:
                    for field, value in entry.items():
                        collected.setdefault(field, []).append(value)
            if media_type == 'videos':
                # Default added: not every video necessarily has a thumbnail,
                # and a bare pop() would raise KeyError in that case.
                collected.pop('thumbnail', None)
            frames.append(pd.DataFrame.from_dict(collected))

        if not frames:
            return 0, 0

        # A single frame is used as-is; several frames are stacked.
        df = frames[0] if len(frames) == 1 else pd.concat(frames)

        df['date'] = df['creation_timestamp'].astype(int)
        # Placeholder name column: the literal string 'NaN' on every row.
        df['name'] = df['date'].apply(lambda _: 'NaN')
        main_df = main_transfo_timestamp_10(df, 'Facebook', 'msg media')
        main_df = main_df[[
            'date', 'type', 'label', 'Year', 'Month', 'Day', 'Hour'
        ]]

        if ALL_INDEX:
            main_df['uri'] = df['uri']
            main_df = main_df[[
                'date', 'type', 'label', 'uri', 'Year', 'Month', 'Day',
                'Hour'
            ]]

        if ALL_GENERAL:
            main_df['name'] = df['name']
            main_df['content'] = df['uri']
            main_df = main_df[[
                'date', 'type', 'label', 'name', 'content', 'Year',
                'Month', 'Day', 'Hour'
            ]]

        main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)

        return main_df, 1
Exemple #20
0
    def read(self):
        '''
        Load the items put up for sale on Facebook into ``self.df``.

        Only rows with ``attachments`` are kept. ``lists_creator`` is called
        with 'for_sale_item'; element 0 of its result is used as the list of
        dates and element 1 as the list of attachments. The nested attachment
        structure is flattened (dictionaries up to two further levels deep
        are unpacked), 'description' and 'title' are dropped, and the item
        details are exposed according to the ALL_INDEX / ALL_GENERAL flags.
        The stored frame is sorted by ``date``.
        '''
        df = pd.read_json(self.path, convert_dates=False, encoding='utf-8')
        df = df[df['attachments'].notna()]

        returned = lists_creator(df, 'for_sale_item')
        list_date = returned[0]
        list_att = returned[1]

        outputdict = {}
        for attachments in list_att:
            for attachment in attachments:
                for entries in attachment.values():
                    for entry in entries:
                        for item in entry.values():
                            for k3, v3 in item.items():
                                if not isinstance(v3, dict):
                                    outputdict.setdefault(k3, []).append(v3)
                                    continue
                                for k4, v4 in v3.items():
                                    if isinstance(v4, dict):
                                        for k5, v5 in v4.items():
                                            outputdict.setdefault(
                                                k5, []).append(v5)
                                    else:
                                        outputdict.setdefault(
                                            k4, []).append(v4)
        outputdict.pop('description', None)
        # Default added so a file without any 'title' does not raise KeyError.
        outputdict.pop('title', None)

        df1 = pd.DataFrame.from_dict(outputdict)
        df1['date'] = list_date
        main_df = main_transfo_timestamp_10(df1, 'Facebook', 'for sale item')

        if ALL_INDEX:
            main_df['price'] = df1['price']
            # Previously filled from df1['price'], duplicating the price.
            # NOTE(review): assumes the flattened data has a 'seller' key.
            main_df['seller'] = df1['seller']
            main_df['category'] = df1['category']
            main_df['marketplace'] = df1['marketplace']
            main_df['location'] = df1['name']
            main_df['latitude'] = df1['latitude']
            main_df['longitude'] = df1['longitude']
            main_df['uri'] = df1['uri']
            main_df['ip'] = df1['upload_ip']
            main_df = main_df[[
                'date', 'type', 'label', 'price', 'seller', 'category',
                'marketplace', 'location', 'latitude', 'longitude', 'uri',
                'ip', 'Year', 'Month', 'Day', 'Hour'
            ]]
        if ALL_GENERAL:
            main_df['name'] = df1['marketplace']
            main_df['content'] = df1['uri']
            main_df = main_df[[
                'date', 'type', 'label', 'name', 'content', 'Year', 'Month',
                'Day', 'Hour'
            ]]

        main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)

        self.df = main_df