def proc_fb_others_post_standard(self, path):
    """Merge posts and titles whose timestamps coincide into one dataframe.

    Combines the output of ``others_post_post`` and ``others_post_title``
    (inner join on ``timestamp``), then normalizes date/type/label columns.
    """
    posts = self.others_post_post(path)
    titles = self.others_post_title(path)
    merged = pd.merge(posts, titles, on='timestamp')
    merged['date'] = merged['timestamp']
    main_df = main_transfo_timestamp_10(merged, 'Facebook', 'others post')
    if ALL_INDEX:
        main_df['post'] = merged['post']
        main_df['title'] = merged['title']
        main_df = main_df[['date', 'type', 'label', 'title', 'post',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = merged.title
        main_df['content'] = merged.post
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    return main_df
def proc_fb_others_post_without_title(self, path):
    '''
    Some posts have no title (links for example): this function compares the
    dataframe returned by others_post_post with the main dataframe containing
    all infos (i.e. title and posts).
    --> done via a left join on the date; the '_merge' indicator column says
    whether a date is in both dataframes or only in the left one. Rows present
    in both are dropped, so only posts without a title are returned.
    '''
    df1 = self.others_post_post(path)
    df1['date'] = df1['timestamp']
    # Convert the raw 10-digit epoch timestamp to datetime so it is
    # comparable with the 'date' column of the standard dataframe.
    df1['date'] = df1.date.apply(lambda x : pd.to_datetime(x, unit='s'))
    df2 = self.proc_fb_others_post_standard(path)
    # indicator=True adds a '_merge' column telling where each row came from.
    df_all = df1.merge(df2.drop_duplicates(), on='date', how='left', indicator=True)
    df_all = df_all[df_all['_merge'] == 'left_only']
    # NOTE(review): 'date' is rebuilt as a *string* timestamp here —
    # presumably main_transfo_timestamp_10 re-parses it; confirm.
    df_all['date'] = df_all['timestamp'].astype(str)
    df_all['name'] = df_all.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df_all, 'Facebook', 'others post')
    if ALL_INDEX:
        # 'post' got the _x suffix in the merge (column existed in both frames).
        main_df['post'] = df_all['post_x']
        main_df = main_df[['date', 'type', 'label', 'post',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df_all.name
        main_df['content'] = df_all.post_x
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    return main_df
def read(self):
    """Build the photo dataframe: one row per picture, dated by creation time."""
    data = self.open_photo_file(self.path)
    outputdict = self.give_a_photo_dict(data)
    # Discard every column not used downstream; only 'uri' and
    # 'creation_timestamp' are needed.
    for unused in ('name', 'author', 'timestamp', 'comment', 'title',
                   'description', 'last_modified_timestamp', 'media_metadata'):
        outputdict.pop(unused, None)
    df = pd.DataFrame.from_dict(outputdict)
    df['date'] = df['creation_timestamp']
    df['name'] = df.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df, 'Facebook', 'photo')
    if ALL_INDEX:
        main_df['uri'] = df['uri']
        main_df = main_df[['date', 'type', 'label', 'uri',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df.name
        main_df['content'] = df.uri
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def read(self):
    """Read the groups_joined JSON and label each row joined vs. leaved.

    Rows whose title starts with the French "you stopped being a member of"
    sentence are labeled 'group leaved'; everything else 'group joined'.
    """
    data = pd.read_json(self.path, encoding='utf-8')
    df = pd.DataFrame(list(data['groups_joined']))
    df['date'] = df['timestamp']
    df['name'] = df.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df, 'Facebook', 'group')
    # NOTE(review): the prefix is mojibake ("dâêtre" instead of "d'être") —
    # it appears to match the export's own encoding, so it is kept verbatim.
    title_begin_appareance = 'Vous avez arrêté dâêtre membre de '
    # Idiom fix: str.startswith replaces the equivalent slice comparison
    # i[:len(prefix)] == prefix, built with a comprehension instead of append.
    main_df['label'] = [
        'group leaved' if i.startswith(title_begin_appareance) else 'group joined'
        for i in df['title']
    ]
    if ALL_INDEX:
        main_df['title'] = df['title']
        main_df = main_df[['date', 'type', 'label', 'title',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df.name
        main_df['content'] = df.title
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def read(self):
    """Extract location history with latitude/longitude columns."""
    data = pd.read_json(self.path, encoding='utf-8')
    df = pd.DataFrame(list(data['location_history']))
    # Flatten the per-row 'coordinate' dicts into parallel axis lists.
    coords = {}
    for point in df['coordinate']:
        for axis, measure in point.items():
            coords.setdefault(axis, []).append(measure)
    df['longitude'] = coords['longitude']
    df['latitude'] = coords['latitude']
    df['date'] = df['creation_timestamp']
    df['content'] = df.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df, 'Facebook', 'location')
    main_df['name'] = df['name']
    main_df['latitude'] = df['latitude']
    main_df['longitude'] = df['longitude']
    main_df = main_df[['date', 'type', 'label', 'name', 'latitude',
                       'longitude', 'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def read(self):
    # Search history: each row's 'attachments' holds nested dicts whose
    # innermost 'text' entry is the search query.
    data = pd.read_json(self.path)
    df = pd.DataFrame(list(data['searches']))
    outputdict = {}
    # Walk list -> dict -> list -> dict and accumulate leaves into parallel
    # lists keyed by the innermost key ('text' among them).
    for lis in df['attachments']:
        for dic in lis:
            for key, value in dic.items():
                for dic_2 in value:
                    for k2, v2 in dic_2.items():
                        outputdict[k2] = outputdict.get(k2, []) + [v2]
    df['search'] = outputdict['text']
    df['date'] = df['timestamp']
    df['name'] = df.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df, 'Facebook', 'search_history')
    if ALL_INDEX:
        main_df['search'] = df['search']
        main_df = main_df[['date', 'type', 'label', 'search',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df.name
        main_df['content'] = df.search
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def proc_fb_others_post_without_posts(self, path):
    '''
    Same as proc_fb_others_post_without_title, but for posts "with no post"
    body (meaning a picture or a link, e.g. "xxx a posté une photo sur votre
    journal").
    '''
    df1 = self.others_post_title(path)
    df1['date'] = df1['timestamp']
    # Convert to datetime so it is comparable with the standard frame's date.
    df1['date'] = df1.date.apply(lambda x : pd.to_datetime(x, unit='s'))
    df2 = self.proc_fb_others_post_standard(path)
    # Left join + indicator: keep rows seen only in the titles frame.
    df_all = df1.merge(df2.drop_duplicates(), on='date', how='left', indicator=True)
    df_all = df_all[df_all['_merge'] == 'left_only']
    df_all['date'] = df_all['timestamp'].astype(str)
    df_all['content'] = df_all.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df_all, 'Facebook', 'others post')
    if ALL_INDEX:
        # 'title' got the _x suffix in the merge (column existed in both frames).
        main_df['title'] = df_all['title_x']
        main_df = main_df[['date', 'type', 'label', 'title',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df_all.title_x
        main_df['content'] = df_all.content
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    return main_df
def read(self):
    """Convert the Facebook posts JSON into the normalized dataframe.

    Facebook does not distinguish wall posts, posts on others' walls,
    status updates or 'moods' — not needed here, but worth noting for
    other usages. Rows carrying attachments are excluded (handled by the
    external-content / picture readers). The 10-digit timestamp is turned
    into datetime(ns) and type/label columns are added.
    """
    df = pd.read_json(self.path, convert_dates=False, encoding='utf-8')
    df = df[pd.isnull(df['attachments'])]
    df['date'] = df['timestamp'].astype(int)
    df['name'] = df.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df, 'Facebook', 'post')
    if ALL_INDEX:
        main_df['title'] = df['title']
        keep = ['date', 'type', 'label', 'title', 'Year', 'Month', 'Day', 'Hour']
        main_df = main_df[keep]
    if ALL_GENERAL:
        main_df['name'] = df.name
        main_df['content'] = df.title
        keep = ['date', 'type', 'label', 'name', 'content', 'Year', 'Month', 'Day', 'Hour']
        main_df = main_df[keep]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def read(self):
    """Read an album file and emit one row per album, dated by last edit."""
    data = self.open_photo_file(self.path)
    # Flatten one nesting level: sub-dict entries are promoted to top-level
    # columns, scalar entries become single-element columns.
    flat = {}
    for key, value in data.items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                flat.setdefault(sub_key, []).append(sub_value)
        else:
            flat.setdefault(key, []).append(value)
    for unused in ('name', 'photos', 'comments', 'description'):
        flat.pop(unused, None)
    df = pd.DataFrame.from_dict(flat)
    df['date'] = df['last_modified_timestamp']
    df['name'] = df.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df, 'Facebook', 'photo')
    if ALL_INDEX:
        main_df['uri'] = df['uri']
        main_df = main_df[['date', 'type', 'label', 'uri',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df.name
        main_df['content'] = df.uri
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def read(self):
    '''
    'Attachments' contains the path to the sticker used on a reaction; it can
    be useful for some usages. This function keeps only rows where there is an
    attachment, then collects the timestamps in a list. The nested
    dictionaries containing the sticker path are also put in a list, then
    'denested' to keep only the path. From that flat dictionary a dataframe is
    created and the date added. The usual date/type/label operations follow.
    nb: if ALL_INDEX is True, the path to the sticker used is added.
    '''
    data = self.read_json_likes(self.path)
    df = pd.DataFrame(list(data['reactions']))
    df = df[df['attachments'].notna()]
    list_date = []
    # NOTE(review): positional row access — i[1][0] assumes 'timestamp' is
    # the first column of the reactions frame; confirm against the export.
    for i in df.iterrows():
        list_date.append(i[1][0])
    list_link = []
    # NOTE(review): i[1][3][0] assumes 'attachments' is the fourth column and
    # keeps only its first element — confirm.
    for i in df.iterrows():
        list_link.append(i[1][3][0])
    outputdict = {}
    # Denest attachment -> data list -> media dict, accumulating the leaves
    # (the sticker uri among them) into parallel lists.
    for dic in list_link:
        for key, value in dic.items():
            if isinstance(value, list):
                value_dic = value[0]
                for k2, v2 in value_dic.items():
                    if isinstance(v2, dict):
                        for k3, v3 in v2.items():
                            outputdict[k3] = outputdict.get(k3, []) + [v3]
    df = pd.DataFrame.from_dict(outputdict)
    df['date'] = list_date
    df['name'] = df.date.apply(lambda x: 'NaN')
    df['content'] = df.uri
    main_df = main_transfo_timestamp_10(df, 'Facebook', 'sticker comment')
    if ALL_INDEX:
        main_df['uri'] = df['uri']
        main_df = main_df[['date', 'type', 'label', 'uri',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df['name']
        main_df['content'] = df['content']
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def read(self):
    '''
    Return a dataframe with all relevant information about Facebook comments:
    1) keep only rows without attachments (i.e. without a path to a picture)
       and with 'data' ('data' contains the timestamp, the comment and the
       author — and, for a few rows, the group the comment was written in)
    2) a list containing all 'data' (dictionaries of dictionaries) is built
    3) from these nested dictionaries the sub-dictionaries are extracted and
       merged under three 'super keys' (timestamp, comment, author)
       --> nb: 'group' is not relevant and cannot be turned into a dataframe
    4) timestamp is transformed into datetime(ns); type and label are added
    5) if ALL_INDEX is True, the comment itself is added to the dataframe
    '''
    data = self.read_json_comments(self.path)
    df = pd.DataFrame(list(data['comments']))
    df = df[pd.isnull(df['attachments'])]
    df = df[df['data'].notna()]
    list_data = []
    # NOTE(review): i[1][1] indexes the row positionally — assumes 'data' is
    # the second column of the comments frame; confirm against the export.
    for i in df.iterrows():
        list_data.append(i[1][1])
    outputdict = self.drop_nesteddictionary_fb_comment(list_data)
    df1 = pd.DataFrame.from_dict(outputdict)
    df1['date'] = df1['timestamp'].astype(int)
    main_df = main_transfo_timestamp_10(df1, 'Facebook', 'comment')
    if ALL_INDEX:
        main_df['comment'] = df1['comment']
        main_df['author'] = df1['author']
        main_df = main_df[['date', 'type', 'label', 'author', 'comment',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df1.author
        main_df['content'] = df1.comment
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def read(self):
    '''
    Same procedure as for posts, but for external content (urls) posted on
    Facebook. Nb: it could share code with the posts reader, but the nested
    dictionaries are encoded differently.
    Idea: write one generic function to destructure nested dictionaries.
    '''
    df = pd.read_json(self.path, convert_dates=False, encoding='utf-8')
    df = df[df['attachments'].notna()]
    returned = lists_creator(df, 'external_context')
    list_date = returned[0]
    list_att = returned[1]
    outputdict = {}
    # Walk list -> dict -> list -> dict -> dict and accumulate the leaves
    # ('url' among them) into parallel lists keyed by the innermost key.
    for lis in list_att:
        for dic in lis:
            for key, value in dic.items():
                for list_dic in value:
                    for k2, v2 in list_dic.items():
                        for k3, v3 in v2.items():
                            outputdict[k3] = outputdict.get(k3, []) + [v3]
    # 'name' and 'source' are not used downstream.
    outputdict.pop('name', None)
    outputdict.pop('source', None)
    df1 = pd.DataFrame.from_dict(outputdict)
    df1['date'] = list_date
    df1['name'] = df1.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df1, 'Facebook', 'post external content')
    if ALL_INDEX:
        main_df['url'] = df1['url']
        main_df = main_df[['date', 'type', 'label', 'url',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df1.name
        main_df['content'] = df1.url
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def read(self):
    # Comments that carry attachments: recover the media (sticker/photo) uri
    # attached to each comment.
    data = self.read_json_comments(self.path)
    df = pd.DataFrame(list(data['comments']))
    df = df[df['attachments'].notna()]
    list_date = []
    # NOTE(review): positional row access — i[1][0] assumes 'timestamp' is
    # the first column and i[1][3] that 'attachments' is the fourth; confirm.
    for i in df.iterrows():
        list_date.append(i[1][0])
    list_attachments = []
    for i in df.iterrows():
        list_attachments.append(i[1][3])
    outputdict = {}
    # Denest attachment -> data list -> media dict, keeping only the leaves.
    for lis in list_attachments:
        for dic in lis:
            for key, value in dic.items():
                if isinstance(value, list):
                    value_dic = value[0]
                    for k2, v2 in value_dic.items():
                        for k3, v3 in v2.items():
                            outputdict[k3] = outputdict.get(k3, []) + [v3]
    # Drop columns unused downstream.
    outputdict.pop('creation_timestamp', None)
    outputdict.pop('media_metadata', None)
    outputdict.pop('title', None)
    df = pd.DataFrame.from_dict(outputdict)
    df['date'] = list_date
    df['content'] = df.uri
    df['name'] = df.content.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df, 'Facebook', 'sticker comment')
    if ALL_INDEX:
        main_df['uri'] = df['uri']
        main_df = main_df[['date', 'type', 'label', 'uri',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df.name
        main_df['content'] = df.content
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def read(self):
    # Places attached to posts: extract the place name from the nested
    # attachment dictionaries.
    df = pd.read_json(self.path, convert_dates=False, encoding='utf-8')
    df = df[df['attachments'].notna()]
    returned = lists_creator(df, 'place')
    list_date = returned[0]
    list_att = returned[1]
    outputdict = {}
    # Denest list -> list -> dict -> list -> dict -> dict, accumulating the
    # leaves into parallel lists keyed by the innermost key.
    for lis in list_att:
        for lis_2 in lis:
            for key, value in lis_2.items():
                for lis_3 in value:
                    for k2, v2 in lis_3.items():
                        for k3, v3 in v2.items():
                            outputdict[k3] = outputdict.get(k3, []) + [v3]
    outputdict.pop('url', None)
    # NOTE(review): list(set(b)) deduplicates but loses ordering and may
    # change column lengths, which can desynchronize rows from list_date —
    # verify this is intended.
    new_outputdict = {a: list(set(b)) for a, b in outputdict.items()}
    df1 = pd.DataFrame.from_dict(new_outputdict)
    df1['date'] = list_date
    df1['content'] = df1.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df1, 'Facebook', 'place')
    if ALL_INDEX:
        main_df['name'] = df1['name']
        main_df = main_df[['date', 'type', 'label', 'name',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df1.name
        main_df['content'] = df1.content
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def read(self):
    """Read the page_likes JSON: one row per liked page."""
    data = self.read_json_likes(self.path)
    df = pd.DataFrame(list(data['page_likes']))
    df['date'] = df['timestamp']
    df['content'] = df.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df, 'Facebook', 'like page')
    if ALL_INDEX:
        main_df['page'] = df['name']
        keep = ['date', 'type', 'label', 'page', 'Year', 'Month', 'Day', 'Hour']
        main_df = main_df[keep]
    if ALL_GENERAL:
        main_df['name'] = df.name
        main_df['content'] = df.content
        keep = ['date', 'type', 'label', 'name', 'content', 'Year', 'Month', 'Day', 'Hour']
        main_df = main_df[keep]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def read(self):
    '''
    Return a dataframe from the pictures posted on Facebook, with the path to
    the picture in the Facebook export and the IP address used when uploading
    it. A new kind of nested dictionary with multiple list layers (see
    outputdict). NB: the IP address sits inside a sub-dictionary and is taken
    out and added as a 'main' column.
    '''
    df = pd.read_json(self.path, convert_dates=False, encoding='utf-8')
    df = df[df['attachments'].notna()]
    returned = lists_creator(df, 'media')
    list_att = returned[1]
    outputdict = {}
    # Keep only the 'media' leaves of each attachment.
    for i in list_att:
        for j in i:
            for key, value in j.items():
                for k in value:
                    for k2, v2 in k.items():
                        if k2 == 'media':
                            for k3, v3 in v2.items():
                                outputdict[k3] = outputdict.get(k3, []) + [v3]
    outputdict.pop('description', None)
    # Flatten media_metadata to recover the per-picture upload IP.
    dict_ip = {}
    for i in outputdict['media_metadata']:
        for key, value in i.items():
            if isinstance(value, dict):
                for k2, v2 in value.items():
                    dict_ip[k2] = dict_ip.get(k2, []) + [v2]
            else:
                dict_ip[key] = dict_ip.get(key, []) + [value]
    dict_ip.pop('orientation', None)
    # NOTE(review): list_ip[0] assumes the upload-ip list is the only entry
    # left after popping 'orientation' — confirm for other exports.
    list_ip = list(dict_ip.values())
    df1 = pd.DataFrame.from_dict(outputdict)
    df1['ip'] = list_ip[0]
    df1['date'] = df1['creation_timestamp'].astype(int)
    main_df = main_transfo_timestamp_10(df1, 'Facebook', 'post picture')
    if ALL_INDEX:
        main_df['uri'] = df1['uri']
        main_df['ip'] = df1['ip']
        main_df = main_df[['date', 'type', 'label', 'uri', 'ip',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df1.ip
        main_df['content'] = df1.uri
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def proc_fb_others_post_where_everything_is_missing(self, path):
    '''
    Return a dataframe of the rows still untreated (i.e. without title and
    post):
    1) list_timestamp_rebels collects the timestamps of rows with
       'attachments' (some posts with a post body also have attachments)
    2) concat_all_for_a_list returns all timestamps already treated;
       main_list keeps only the timestamps not treated yet
    3) with all conditions in place, the remaining nested dictionaries are
       'denested' into a flat dictionary
       --> nb: built from a single observed account, so the structure may
       need adjustment for other users
    4) a dataframe is produced from the dictionary and treated as usual
    '''
    data = pd.read_json(path, encoding='utf-8')
    df = pd.DataFrame(list(data["wall_posts_sent_to_you"])).T
    list_data = []
    for i in df.iterrows():
        list_data.append(i[1][0])
    # Timestamps of rows that carry attachments.
    list_timestamp_rebels = []
    for lis in list_data:
        if self.gen_3(lis):
            for key, value in lis.items():
                if key == 'timestamp':
                    list_timestamp_rebels.append(value)
    list_timestamp = self.concat_all_for_a_list(path)
    # Keep only timestamps no other proc_fb_others_post_* function handled.
    main_list = list(set(list_timestamp_rebels) - set(list_timestamp))
    outputdict = {}
    for lis in list_data:
        if self.gen_3(lis):
            if self.gen_4(lis, main_list):
                for key, value in lis.items():
                    if isinstance(value, list):
                        for lis_2 in value:
                            for k2, v2 in lis_2.items():
                                if isinstance(v2, list):
                                    for lis_3 in v2:
                                        for k3, v3 in lis_3.items():
                                            for k4, v4 in v3.items():
                                                outputdict[k4] = outputdict.get(k4, []) + [v4]
                                else:
                                    outputdict[k2] = outputdict.get(k2, []) + [v2]
                    else:
                        outputdict[key] = outputdict.get(key, []) + [value]
    df1 = pd.DataFrame.from_dict(outputdict)
    df1['date'] = df1['timestamp']
    df1['name'] = df1.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df1, 'Facebook', 'others post')
    if ALL_INDEX:
        main_df['url'] = df1['url']
        main_df = main_df[['date', 'type', 'label', 'url',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df1.name
        main_df['content'] = df1.url
        # BUG FIX: 'Hour' was missing from this column selection, unlike
        # every other reader — it broke downstream alignment on that column.
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    return main_df
def read(self):
    '''
    Return a dataframe containing the date of friendship and (if wanted) the
    name of the friend.
    --> the same structure is used for all 'types of friend interactions', so
        this function can serve all of them: the label is inferred from the
        filename ('friends', 'received_requests', 'rejected_requests',
        'deleted_friends', 'sent_requests') and also becomes the data type in
        the main dataframe
    --> the 'simple' friends.json adds the email address of some friends,
        which can be included via FRIENDS_WITH_CONTACTS
    '''
    text = self.path
    data = pd.read_json(self.path, encoding='utf-8')
    # Infer the interaction label from the export filename.
    if re.search('received_friend_requests.json', text):
        label_data = 'received_requests'
    elif re.search('rejected_friend_requests.json', text):
        label_data = 'rejected_requests'
    elif re.search('removed_friends', text):
        label_data = 'deleted_friends'
    elif re.search('sent_friend_requests.json', text):
        label_data = 'sent_requests'
    else:
        label_data = 'friends'
    df = pd.DataFrame(list(data[label_data]))
    df['date'] = df['timestamp']
    main_df = main_transfo_timestamp_10(df, 'Facebook', label_data)
    if ALL_INDEX:
        main_df['name'] = df['name']
        main_df = main_df[['date', 'type', 'label', 'name',
                           'Year', 'Month', 'Day', 'Hour']]
    if FRIENDS_WITH_CONTACTS:
        # NOTE(review): this branch rebuilds main_df from only the rows with
        # contact info, discarding the previous selection — confirm the flag
        # combinations are mutually exclusive as intended.
        df1 = df[df['contact_info'].notna()]
        main_df = main_transfo_timestamp_10(df1, 'Facebook', 'friend')
        main_df['name'] = df1['name']
        main_df['email'] = df1['contact_info']
        main_df = main_df[['date', 'type', 'label', 'name', 'email',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL_FRIENDS:
        main_df['name'] = df.name
        main_df['content'] = df.contact_info
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL_ELSE:
        main_df['name'] = df.name
        main_df['content'] = df.name.apply(lambda x: 'NaN')
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df
def data_fb_pictures_sent_by_msg(self, path):
    '''
    Recover the uris of pictures, videos and files exchanged by message.
    --> the json is checked for each media type through the dictionary keys
    --> every media type found is flattened into its own dataframe; when a
        conversation mixes several media types the dataframes are concatenated
    --> returns (dataframe, 1) when at least one media type exists,
        (0, 0) otherwise
    '''
    with open(path, encoding='utf-8') as json_data:
        data = json.load(json_data)
    outputdict = self.get_adict(data)

    def _media_frame(media_key, drop_key=None):
        # Flatten the per-message dict lists under ``media_key`` into one
        # column dict (one row per media item), dropping ``drop_key`` if set.
        flat = {}
        for messages in outputdict[media_key]:
            for item in messages:
                for key, value in item.items():
                    flat[key] = flat.get(key, []) + [value]
        if drop_key is not None:
            # pop with default: robust when the key is absent (the original
            # unguarded pop could raise KeyError).
            flat.pop(drop_key, None)
        return pd.DataFrame.from_dict(flat)

    # Membership tests replace the original linear key scans (which also
    # shadowed the loop variable 'key'); explicit frame list replaces the
    # locals() introspection.
    frames = []
    if 'photos' in outputdict:
        frames.append(_media_frame('photos'))
    if 'files' in outputdict:
        frames.append(_media_frame('files'))
    if 'videos' in outputdict:
        # videos carry a 'thumbnail' sub-dict that is not used.
        frames.append(_media_frame('videos', drop_key='thumbnail'))
    if not frames:
        return 0, 0
    df = frames[0] if len(frames) == 1 else pd.concat(frames)
    df['date'] = df['creation_timestamp'].astype(int)
    df['name'] = df.date.apply(lambda x: 'NaN')
    main_df = main_transfo_timestamp_10(df, 'Facebook', 'msg media')
    main_df = main_df[['date', 'type', 'label', 'Year', 'Month', 'Day', 'Hour']]
    if ALL_INDEX:
        main_df['uri'] = df['uri']
        main_df = main_df[['date', 'type', 'label', 'uri',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df.name
        main_df['content'] = df.uri
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    return main_df, 1
def read(self):
    '''
    Same process as most functions above but for items sold on the Facebook
    marketplace: price, seller, category, location and upload data are pulled
    out of deeply nested attachment dictionaries.
    '''
    df = pd.read_json(self.path, convert_dates=False, encoding='utf-8')
    df = df[df['attachments'].notna()]
    returned = lists_creator(df, 'for_sale_item')
    list_date = returned[0]
    list_att = returned[1]
    outputdict = {}
    # Denest up to three dictionary levels; leaves are accumulated into
    # parallel lists keyed by their innermost key.
    for lis in list_att:
        for lis_2 in lis:
            for key, value in lis_2.items():
                for lis_3 in value:
                    for k2, v2 in lis_3.items():
                        for k3, v3 in v2.items():
                            if isinstance(v3, dict):
                                for k4, v4 in v3.items():
                                    if isinstance(v4, dict):
                                        for k5, v5 in v4.items():
                                            outputdict[k5] = outputdict.get(k5, []) + [v5]
                                    else:
                                        outputdict[k4] = outputdict.get(k4, []) + [v4]
                            else:
                                outputdict[k3] = outputdict.get(k3, []) + [v3]
    outputdict.pop('description', None)
    # Robustness: pop with default (the original unguarded pop('title')
    # raised KeyError when no item had a title).
    outputdict.pop('title', None)
    df1 = pd.DataFrame.from_dict(outputdict)
    df1['date'] = list_date
    main_df = main_transfo_timestamp_10(df1, 'Facebook', 'for sale item')
    if ALL_INDEX:
        main_df['price'] = df1['price']
        # BUG FIX: 'seller' was mistakenly copied from the price column
        # (copy-paste error: df1['price'] -> df1['seller']).
        main_df['seller'] = df1['seller']
        main_df['category'] = df1['category']
        main_df['marketplace'] = df1['marketplace']
        main_df['location'] = df1['name']
        main_df['latitude'] = df1['latitude']
        main_df['longitude'] = df1['longitude']
        main_df['uri'] = df1['uri']
        main_df['ip'] = df1['upload_ip']
        main_df = main_df[['date', 'type', 'label', 'price', 'seller',
                           'category', 'marketplace', 'location', 'latitude',
                           'longitude', 'uri', 'ip',
                           'Year', 'Month', 'Day', 'Hour']]
    if ALL_GENERAL:
        main_df['name'] = df1.marketplace
        main_df['content'] = df1.uri
        main_df = main_df[['date', 'type', 'label', 'name', 'content',
                           'Year', 'Month', 'Day', 'Hour']]
    main_df.sort_values(["date"], axis=0, ascending=True, inplace=True)
    self.df = main_df