Beispiel #1
0
def main():
    """Export the signed-in account's Telegram dialogs to ``telegram.pkl``.

    Connects a ``TelegramClient``, signs in through ``sign_in``, collects the
    rows returned by ``list_dialogs`` and stores them as a DataFrame whose
    columns are named after ``config.ALL_COLUMNS``. The ``platform``,
    ``senderName`` and ``language`` columns are then set to constant values
    before the frame is handed to ``utils.export_dataframe``.
    """
    client = TelegramClient('session_name', config.TELEGRAM_API_ID,
                            config.TELEGRAM_API_HASH)
    client.connect()
    me = sign_in(client)
    data = list_dialogs(client)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = config.ALL_COLUMNS
    df['platform'] = 'telegram'

    # NOTE(review): `me` is presumably a Telethon user object, whose name
    # parts are optional. Formatting them blindly would produce names such as
    # "Alice None", so missing parts are skipped instead.
    name_parts = (me.first_name, me.last_name)
    own_name = ' '.join(part for part in name_parts if part).strip()
    df['senderName'] = own_name

    log.info('Detecting languages...')
    # No detection is performed for this platform; every row is 'unknown'.
    df['language'] = 'unknown'

    utils.export_dataframe(df, 'telegram.pkl')
    log.info('Done.')
Beispiel #2
0
def main():
    """Parse a Google Hangouts JSON archive and export it to ``hangouts.pkl``.

    Reads the archive at ``args.file_path``, collects the text messages of
    two-participant conversations (at most ``args.max_exported_messages``,
    checked after each handled message), tags each conversation with a
    detected language and hands the resulting DataFrame to
    ``utils.export_dataframe``.

    Exits with status 1 when a message sender is neither ``args.own_name``
    nor the other participant, and with status 0 when no message was parsed.
    """
    args = parse_arguments()
    own_name = args.own_name

    print('Parsing JSON file...')
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(args.file_path, encoding='utf-8') as f:
        archive = json.load(f)

    names = {}

    def idToName(gaia_id):
        """Return the name recorded for ``gaia_id``, or None when unknown."""
        return names.get(gaia_id)

    def saveNameForId(name, gaia_id):
        """Record the first name seen for ``gaia_id``; report later mismatches."""
        if gaia_id not in names:
            names[gaia_id] = name
        elif names[gaia_id] != name:
            print('Assuming', name, 'is', names[gaia_id])

    data = []
    # Deliberately kept across conversations, as in the original logic: it is
    # only overwritten when a participant's name differs from own_name.
    conversationWithId = ''
    limit_reached = False

    print('Extracting messages...')
    for state in archive["conversation_state"]:
        conversation_state = state["conversation_state"]

        if "conversation" in conversation_state:
            for participant in conversation_state["conversation"][
                    "participant_data"]:
                if "fallback_name" in participant:
                    saveNameForId(participant["fallback_name"],
                                  participant["id"]["gaia_id"])

        for event in conversation_state["event"]:
            if "chat_message" not in event:
                continue
            content = event["chat_message"]["message_content"]
            if "segment" not in content:
                continue

            # Only the first segment of a message is exported.
            text = content["segment"][0]["text"]
            conversationId = event["conversation_id"]["id"]
            senderId = event["sender_id"]["chat_id"]

            participants = conversation_state["conversation"][
                "current_participant"]
            if len(participants) != 2:
                # Only one-to-one conversations are handled.
                continue

            for participant in participants:
                if idToName(participant["gaia_id"]) != own_name:
                    conversationWithId = participant["gaia_id"]

            senderName = idToName(senderId)
            conversationWithName = idToName(conversationWithId)

            if senderName is None and conversationWithName is None:
                # unknown sender
                print("No senderName for either senderId", senderId,
                      conversationWithId)
            else:
                if senderName != own_name and senderId != conversationWithId:
                    print('Parsing error, is your ownId correct?')
                    # This is a failure, so report a non-zero status.
                    exit(1)

                # Raw timestamp scaled down by 10**6
                # (presumably microseconds -> seconds; confirm against
                # utils.timestamp_to_ordinal).
                timestamp = int(event["timestamp"]) / 1000000
                data.append([
                    timestamp, conversationId, conversationWithName,
                    senderName, text
                ])

            if len(data) >= args.max_exported_messages:
                limit_reached = True
                break

        # The limit must stop the whole extraction, not just the current
        # conversation; otherwise every later conversation adds one more row.
        if limit_reached:
            break

    # Logger methods take a format string first; passing the count as the
    # message with an extra argument makes the logging call fail to format.
    log.debug('%d messages parsed.', len(data))

    if not data:
        # Assigning column names to an empty frame would raise.
        print('Nothing to save.')
        exit(0)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = config.DATAFRAME_COLUMNS
    df['platform'] = 'hangouts'

    log.info('Detecting languages...')
    df['language'] = 'unknown'
    for name, group in df.groupby(df.conversationWithName):
        df2 = group.dropna()

        # Conversations with 10 or fewer usable rows keep 'unknown'.
        if len(df2) > 10:
            # Build the sample from up to 100 randomly drawn messages
            # (drawn with replacement).
            sample = ''.join(
                df2.iloc[randint(0, len(df2) - 1)]['text']
                for _ in range(min(len(df2), 100)))

            # Detect once so the printed and the stored value always agree.
            language = detect(sample)
            print('\t', name, language)
            df.loc[df.conversationWithName == name, 'language'] = language

    log.info('Computing dates...')
    df['datetime'] = df['timestamp'].apply(utils.timestamp_to_ordinal)

    print(df.head())
    utils.export_dataframe(df, 'hangouts.pkl')
    log.info('Done.')
Beispiel #3
0
def main():
    """Parse a Facebook Messenger JSON export and save it to ``messenger.pkl``.

    Walks ``args.file_path`` and reads every ``.json`` file found. The name of
    the directory containing a file is used as the conversation id.
    Conversations with more than two participants are skipped. Every message
    that has both ``content`` and ``sender_name`` becomes one row. Exits with
    status 0 when no message was parsed.
    """
    args = parse_arguments()

    data = []

    for root, _dirs, files in os.walk(args.file_path):
        for filename in files:
            if not filename.endswith('.json'):
                continue

            # basename() honours the platform's path separator, unlike
            # splitting on a hard-coded '/'.
            conversation_id = os.path.basename(root)
            conversation_with_name = None

            document = os.path.join(root, filename)
            # JSON is UTF-8 by specification; do not depend on the platform
            # default. The file is closed as soon as it has been parsed.
            with open(document, encoding='utf-8') as f:
                json_data = json.load(f)

            if "messages" not in json_data or "participants" not in json_data:
                print("Missing messages or participant list in conversation {}".format(conversation_id))
                continue

            participants = json_data["participants"]

            if len(participants) < 2:
                print("User with id {} left Facebook, we don't know what their name was.".format(conversation_id))

            if len(participants) > 2:
                # TODO handle group chats
                continue

            for participant in participants:
                if participant['name'] != args.own_name:
                    conversation_with_name = participant['name']

            # No participant other than the owner: fall back to the id.
            if conversation_with_name is None:
                conversation_with_name = conversation_id

            for message in json_data["messages"]:
                timestamp = message["timestamp_ms"]
                # Messages lacking text or a sender are skipped. The former
                # "sender_name = conversation_id" fallback could never run
                # behind this condition and has been removed.
                if "content" in message and "sender_name" in message:
                    data.append([
                        timestamp, conversation_id, conversation_with_name,
                        message["sender_name"], message["content"]
                    ])

    print(len(data), 'messages parsed.')

    if not data:
        print('Nothing to save.')
        exit(0)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = config.DATAFRAME_COLUMNS
    df['platform'] = 'messenger'

    log.info('Detecting languages...')
    # No detection is performed for this format; every row is 'unknown'.
    df['language'] = 'unknown'

    log.info('Computing dates...')
    # "timestamp_ms" values are divided by 1000 (milliseconds -> seconds)
    # before conversion.
    df['datetime'] = (df['timestamp'] / 1000).apply(utils.timestamp_to_ordinal)

    print(df.head())
    utils.export_dataframe(df, 'messenger.pkl')
    log.info('Done.')
Beispiel #4
0
def main():
    """Parse a Facebook Messenger HTML export and save it to ``messenger.pkl``.

    Reads every ``.html`` file in ``args.file_path`` (one conversation per
    file), collects the messages (the ``args.max_exported_messages`` limit is
    checked after each parsed element), tags each conversation with a
    detected language and hands the resulting DataFrame to
    ``utils.export_dataframe``.

    Exits with status 1 when a file contains more than one conversation
    header, and with status 0 when no message was parsed.
    """
    args = parse_arguments()

    # %I (12-hour clock) is required for %p to have any effect; with %H the
    # am/pm marker is matched but ignored, so "2:30pm" would become 02:30.
    dateFormat = '%A, %B %d, %Y at %I:%M%p'

    fallbackDateParsing = False
    data = []
    # (senderName, conversationWithName) pairs already reported.
    warnedNameChanges = set()
    nbInvalidSender = 0
    limitReached = False

    # make sure we don't crash if chat logs contain exotic characters
    etree.set_default_parser(
        etree.XMLParser(encoding='utf-8', ns_clean=True, recover=True))

    for filename in os.listdir(args.file_path):

        if not filename.endswith('.html'):
            continue

        document = os.path.join(args.file_path, filename)
        archive = etree.parse(document)

        conversationId = filename.replace('.html', '')
        groupConversation = False
        timestamp = ''
        senderName = ''
        conversationWithName = None

        for element in archive.iter():
            tag = element.tag
            className = element.get('class')
            content = element.text

            if tag == 'p':
                text = content

                # Both names must be known. conversationWithName starts as
                # None (no header seen yet), which a plain "!= ''" comparison
                # would wrongly accept.
                if conversationWithName and senderName:

                    # handles when the interlocutor's name changed at some point
                    if (senderName != conversationWithName) and (senderName != args.own_name) and \
                            (not groupConversation):
                        # Warn once per pair, but rename every such message;
                        # previously only the first message was renamed.
                        nameChange = (senderName, conversationWithName)
                        if nameChange not in warnedNameChanges:
                            print('\t', 'Assuming', senderName, 'is',
                                  conversationWithName)
                            warnedNameChanges.add(nameChange)

                        senderName = conversationWithName

                    data.append([
                        timestamp, conversationId, conversationWithName,
                        senderName, text
                    ])

                else:
                    nbInvalidSender = nbInvalidSender + 1

            elif tag == 'span':
                if className == 'user':
                    senderName = content
                elif className == 'meta':
                    if not fallbackDateParsing:
                        try:
                            parsedDate = pd.to_datetime(
                                content, format=dateFormat, exact=False)
                        except ValueError:
                            print(
                                'Unexpected date format. '
                                'Falling back to infer_datetime_format, parsing will be slower.'
                            )
                            fallbackDateParsing = True

                    if fallbackDateParsing:
                        # Let pandas guess the format. The previous fallback
                        # retried the same fixed format and could only fail
                        # again. A ValueError raised here propagates.
                        parsedDate = pd.to_datetime(content)

                    timestamp = time.mktime(parsedDate.timetuple())

            elif tag == 'div' and className == 'thread':
                # A comma-separated list in the thread's own text is taken as
                # the participant list.
                nbParticipants = str(element.xpath("text()")).count(', ') + 1
                if nbParticipants > 1:
                    groupConversation = True

            elif tag == 'h3':
                if conversationWithName is not None:
                    print(
                        'Something is wrong. File format changed? (multiple conversation hearder in a single file)'
                    )
                    # This is a failure, so report a non-zero status.
                    exit(1)

                # An empty header has no text (None); treat it as ''.
                conversationWithName = (content or '').replace(
                    'Conversation with ', '')

                print(conversationId, conversationWithName, "(group?",
                      groupConversation, ")")

            if len(data) >= args.max_exported_messages:
                limitReached = True
                break

        # The limit must stop the whole run, not just the current file;
        # otherwise every remaining file still contributes a message.
        if limitReached:
            break

    print(len(data), 'messages parsed.')

    if nbInvalidSender > 0:
        print(nbInvalidSender, 'messages discarded because of bad ID.')

    if not data:
        print('Nothing to save.')
        exit(0)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = config.DATAFRAME_COLUMNS
    df['platform'] = 'messenger'

    log.info('Detecting languages...')
    df['language'] = 'unknown'
    for name, group in df.groupby(df.conversationWithName):
        df2 = group.dropna()

        # Conversations with 10 or fewer usable rows keep 'unknown'.
        if len(df2) > 10:
            # Build the sample from up to 100 randomly drawn messages
            # (drawn with replacement).
            sample = ''.join(
                df2.iloc[random.randint(0, len(df2) - 1)]['text']
                for _ in range(min(len(df2), 100)))

            # Detect once so the printed and the stored value always agree.
            language = detect(sample)
            print('\t', name, language, "(", len(df2), "msgs)")
            df.loc[df.conversationWithName == name, 'language'] = language

    log.info('Computing dates...')
    df['datetime'] = df['timestamp'].apply(utils.timestamp_to_ordinal)

    print(df.head())
    utils.export_dataframe(df, 'messenger.pkl')
    log.info('Done.')