def main():
    """Export the signed-in user's Telegram dialogs to ``telegram.pkl``.

    Connects a ``TelegramClient`` with the credentials found in ``config``,
    signs in, collects the rows returned by ``list_dialogs`` and hands the
    resulting DataFrame to ``utils.export_dataframe``.
    """
    client = TelegramClient('session_name', config.TELEGRAM_API_ID, config.TELEGRAM_API_HASH)
    client.connect()
    me = sign_in(client)
    data = list_dialogs(client)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = config.ALL_COLUMNS
    df['platform'] = 'telegram'

    # Only join the name parts that are actually set: formatting a missing
    # (None) part with str.format would embed the literal text "None".
    own_name = ' '.join(part for part in (me.first_name, me.last_name) if part).strip()
    # Every exported row gets the signed-in user's name as senderName.
    df['senderName'] = own_name

    log.info('Detecting languages...')
    # No detection is performed here; the column is filled with a placeholder.
    df['language'] = 'unknown'

    utils.export_dataframe(df, 'telegram.pkl')
    log.info('Done.')
def main():
    """Parse a Google Hangouts JSON archive and export it to ``hangouts.pkl``.

    Reads the archive at ``args.file_path``, keeps the text messages of
    conversations that currently have exactly two participants, guesses a
    language per conversation and exports the resulting DataFrame through
    ``utils.export_dataframe``.

    Exits early when a sender is neither ``args.own_name`` nor the detected
    interlocutor, and when no message could be extracted at all.
    """
    args = parse_arguments()
    own_name = args.own_name

    print('Parsing JSON file...')
    # Decode explicitly as UTF-8 instead of relying on the locale's default encoding.
    with open(args.file_path, encoding='utf-8') as f:
        archive = json.load(f)

    # gaia id -> display name, filled from the participants' "fallback_name".
    names = {}

    def idToName(participantId):
        """Return the known name for ``participantId``, or None."""
        return names.get(participantId)

    def saveNameForId(name, participantId):
        """Remember the first name seen for an id; report later conflicts."""
        if participantId not in names:
            names[participantId] = name
        elif names[participantId] != name:
            print('Assuming', name, 'is', names[participantId])

    data = []
    # Deliberately kept across events and conversations, as before: it holds
    # the last participant id whose name differed from own_name.
    conversationWithId = ''

    print('Extracting messages...')
    for state in archive["conversation_state"]:
        conversationState = state["conversation_state"]

        if "conversation" in conversationState:
            for participant in conversationState["conversation"]["participant_data"]:
                if "fallback_name" in participant:
                    saveNameForId(participant["fallback_name"], participant["id"]["gaia_id"])

        for event in conversationState["event"]:
            if "chat_message" not in event:
                continue
            content = event["chat_message"]["message_content"]
            if "segment" not in content:
                continue

            # Only the first segment of a message is exported.
            text = content["segment"][0]["text"]
            conversationId = event["conversation_id"]["id"]
            senderId = event["sender_id"]["chat_id"]

            participants = conversationState["conversation"]["current_participant"]
            if len(participants) != 2:
                # Only one-to-one conversations are exported.
                continue

            for participant in participants:
                if idToName(participant["gaia_id"]) != own_name:
                    conversationWithId = participant["gaia_id"]

            senderName = idToName(senderId)
            conversationWithName = idToName(conversationWithId)

            if senderName is not None or conversationWithName is not None:
                if senderName != own_name and senderId != conversationWithId:
                    print('Parsing error, is your ownId correct?')
                    exit(0)

                # saves the message
                # NOTE(review): the divisor suggests archive timestamps are in
                # microseconds and are converted to seconds here - confirm.
                timestamp = int(event["timestamp"]) / 1000000
                data.append([timestamp, conversationId, conversationWithName, senderName, text])
            else:
                # unknown sender
                print("No senderName for either senderId", senderId, conversationWithId)

            if len(data) >= args.max_exported_messages:
                break

        # The inner break only leaves the event loop; stop walking the
        # remaining conversations too once the export limit is reached.
        if len(data) >= args.max_exported_messages:
            break

    # Logger methods take a format string followed by its arguments.
    log.debug('%d messages parsed.', len(data))

    if len(data) < 1:
        # Assigning column names to an empty DataFrame would raise.
        print('Nothing to save.')
        exit(0)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = config.DATAFRAME_COLUMNS
    df['platform'] = 'hangouts'

    log.info('Detecting languages...')
    df['language'] = 'unknown'
    for name, _group in df.groupby(df.conversationWithName):
        inConversation = df.conversationWithName == name
        candidates = df[inConversation].dropna()
        # Conversations with 10 or fewer usable rows keep the 'unknown' language.
        if len(candidates) > 10:
            # Concatenate up to 100 randomly picked messages (with replacement).
            sample = ''.join(
                candidates.iloc[randint(0, len(candidates) - 1)]['text']
                for _ in range(min(len(candidates), 100)))
            # Detect once so the printed and the stored language always agree.
            language = detect(sample)
            print('\t', name, language)
            df.loc[inConversation, 'language'] = language

    log.info('Computing dates...')
    df['datetime'] = df['timestamp'].apply(utils.timestamp_to_ordinal)

    print(df.head())
    utils.export_dataframe(df, 'hangouts.pkl')
    log.info('Done.')
def main():
    """Parse a Facebook Messenger JSON export and write ``messenger.pkl``.

    Walks ``args.file_path`` for ``.json`` files, treating the name of the
    directory that contains a file as the conversation id. Conversations with
    more than two participants are skipped. Messages are exported only when
    they carry both a ``content`` and a ``sender_name`` field.

    Exits early when no message could be extracted.
    """
    args = parse_arguments()
    data = []

    for root, _dirs, files in os.walk(args.file_path):
        # Portable replacement for root.split('/')[-1]: also handles a
        # trailing separator and platform-specific path separators.
        conversation_id = os.path.basename(os.path.normpath(root))

        for filename in files:
            if not filename.endswith('.json'):
                continue

            document = os.path.join(root, filename)
            # Decode explicitly as UTF-8 instead of relying on the locale's default encoding.
            with open(document, encoding='utf-8') as f:
                json_data = json.load(f)

            if "messages" not in json_data or "participants" not in json_data:
                print("Missing messages or participant list in conversation {}".format(conversation_id))
                continue

            participants = json_data["participants"]
            if len(participants) < 2:
                print("User with id {} left Facebook, we don't know what their name was.".format(conversation_id))
            if len(participants) > 2:
                # TODO handle group chats
                continue

            # The interlocutor is the last participant whose name differs from
            # ours; fall back to the conversation id when there is none.
            conversation_with_name = None
            for participant in participants:
                if participant['name'] != args.own_name:
                    conversation_with_name = participant['name']
            if conversation_with_name is None:
                conversation_with_name = conversation_id

            for message in json_data["messages"]:
                # Messages lacking either field are skipped. The previous
                # sender_name fallback sat behind this same check and could
                # never run, so it has been removed.
                if "content" in message and "sender_name" in message:
                    data.append([
                        message["timestamp_ms"],
                        conversation_id,
                        conversation_with_name,
                        message["sender_name"],
                        message["content"],
                    ])

    print(len(data), 'messages parsed.')

    if len(data) < 1:
        print('Nothing to save.')
        exit(0)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = config.DATAFRAME_COLUMNS
    df['platform'] = 'messenger'

    log.info('Detecting languages...')
    # No detection is performed here; the column is filled with a placeholder.
    df['language'] = 'unknown'

    log.info('Computing dates...')
    # timestamp_ms is divided by 1000 before the ordinal conversion.
    df['datetime'] = df['timestamp'].apply(lambda x: x / 1000).apply(utils.timestamp_to_ordinal)

    print(df.head())
    utils.export_dataframe(df, 'messenger.pkl')
    log.info('Done.')
def main():
    """Parse a Facebook Messenger HTML export and write ``messenger.pkl``.

    Every ``.html`` file directly inside ``args.file_path`` is treated as one
    conversation. Messages (``<p>`` elements) are attributed to the most
    recently seen ``<span class="user">`` and ``<span class="meta">`` values.
    In one-to-one conversations, a sender that is neither ``args.own_name``
    nor the conversation's interlocutor is assumed to be the interlocutor
    under an older name.

    Exits early when a file contains more than one conversation header or
    when no message could be extracted.
    """
    args = parse_arguments()

    # %I (12-hour clock) is required for %p to be honoured; with %H the AM/PM
    # marker is matched but ignored, so afternoon times were parsed as morning.
    dateFormat = '%A, %B %d, %Y at %I:%M%p'
    fallbackDateParsing = False

    data = []
    warnedNameChanges = set()
    nbInvalidSender = 0

    def toEpoch(parsed):
        """Convert a parsed datetime to a local-time epoch timestamp."""
        return time.mktime(parsed.timetuple())

    # make sure we don't crash if chat logs contain exotic characters
    etree.set_default_parser(
        etree.XMLParser(encoding='utf-8', ns_clean=True, recover=True))

    for filename in os.listdir(args.file_path):
        if not filename.endswith('.html'):
            continue

        document = os.path.join(args.file_path, filename)
        archive = etree.parse(document)

        conversationId = filename.replace('.html', '')
        groupConversation = False
        timestamp = ''
        senderName = ''
        conversationWithName = None

        for element in archive.iter():
            tag = element.tag
            className = element.get('class')
            content = element.text

            if tag == 'p':
                text = content

                if conversationWithName != '' and senderName != '':
                    # handles when the interlocutor's name changed at some point
                    isUnknownSender = (senderName != conversationWithName
                                       and senderName != args.own_name)
                    if isUnknownSender and not groupConversation:
                        # Warn once per old name, but re-attribute every
                        # message; previously only the first one was renamed.
                        if senderName not in warnedNameChanges:
                            print('\t', 'Assuming', senderName, 'is', conversationWithName)
                            warnedNameChanges.add(senderName)
                        senderName = conversationWithName

                    data.append([
                        timestamp, conversationId, conversationWithName,
                        senderName, text
                    ])
                else:
                    nbInvalidSender = nbInvalidSender + 1

            elif tag == 'span':
                if className == 'user':
                    senderName = content
                elif className == 'meta':
                    if not fallbackDateParsing:
                        try:
                            timestamp = toEpoch(
                                pd.to_datetime(content, format=dateFormat, exact=False))
                        except ValueError:
                            print(
                                'Unexpected date format. '
                                'Falling back to infer_datetime_format, parsing will be slower.'
                            )
                            fallbackDateParsing = True
                    if fallbackDateParsing:
                        # Previously the handler retried the same strict format,
                        # which could only fail again. A ValueError raised here
                        # propagates to the caller.
                        timestamp = toEpoch(
                            pd.to_datetime(content, infer_datetime_format=True))

            elif tag == 'div' and className == 'thread':
                # Count separators in the text itself rather than in the repr
                # of the node list, whose own ', ' item separators would be
                # miscounted as extra participants.
                nbParticipants = ''.join(element.xpath("text()")).count(', ') + 1
                if nbParticipants > 1:
                    groupConversation = True

            elif tag == 'h3':
                if conversationWithName is not None:
                    print(
                        'Something is wrong. File format changed? (multiple conversation hearder in a single file)'
                    )
                    exit(0)
                else:
                    content = content.replace('Conversation with ', '')
                    conversationWithName = content

        print(conversationId, conversationWithName, "(group?", groupConversation, ")")

        # The limit is checked once per file, so the last file may overshoot it.
        if len(data) >= args.max_exported_messages:
            break

    print(len(data), 'messages parsed.')

    if nbInvalidSender > 0:
        print(nbInvalidSender, 'messages discarded because of bad ID.')

    if len(data) < 1:
        print('Nothing to save.')
        exit(0)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = config.DATAFRAME_COLUMNS
    df['platform'] = 'messenger'

    log.info('Detecting languages...')
    df['language'] = 'unknown'
    for name, _group in df.groupby(df.conversationWithName):
        inConversation = df.conversationWithName == name
        candidates = df[inConversation].dropna()
        # Conversations with 10 or fewer usable rows keep the 'unknown' language.
        if len(candidates) > 10:
            # Concatenate up to 100 randomly picked messages (with replacement).
            sample = ''.join(
                candidates.iloc[random.randint(0, len(candidates) - 1)]['text']
                for _ in range(min(len(candidates), 100)))
            # Detect once so the printed and the stored language always agree.
            language = detect(sample)
            print('\t', name, language, "(", len(candidates), "msgs)")
            df.loc[inConversation, 'language'] = language

    log.info('Computing dates...')
    df['datetime'] = df['timestamp'].apply(utils.timestamp_to_ordinal)

    print(df.head())
    utils.export_dataframe(df, 'messenger.pkl')
    log.info('Done.')