Example #1
0
def deserialize_entry(entry, get_raw_meta=True):
    """Split a flat column mapping into fields, sources, meters and metadata.

    Column names in ``entry`` are dispatched on their prefix:

    * ``f:s_<name>``          -> ``<name>`` is decoded and added to the sources
    * ``f:r_metadata.<name>`` -> loaded value stored under the decoded name
    * ``f:m_<a>:<b>...``      -> ``([unquoted parts], loaded value)`` meter pair
    * anything else           -> loaded value stored under the unquoted name
      with its first two characters dropped; names containing ``:`` become a
      tuple of their unquoted parts

    :param entry: mapping of column name to stored value
    :param get_raw_meta: when true, the returned metadata is the
        ``'resource_metadata'`` field (``{}`` if absent); otherwise it is the
        dict assembled from the ``f:r_metadata.`` columns
    :returns: ``(fields, sources, meters, metadata)``
    """
    metadata_prefix = 'f:r_metadata.'
    fields = {}
    source_names = []
    meter_pairs = []
    flat_metadata = {}

    for column, raw in entry.items():
        if column.startswith('f:s_'):
            source_names.append(decode_unicode(column[4:]))
            continue

        if column.startswith(metadata_prefix):
            name = decode_unicode(column[len(metadata_prefix):])
            flat_metadata[name] = load(raw)
            continue

        if column.startswith("f:m_"):
            parts = [unquote(piece) for piece in column[4:].split(':')]
            meter_pairs.append((parts, load(raw)))
            continue

        # Every remaining column is an ordinary field.
        LOG.info("v=%s type(%s)" % (str(raw), type(raw)))
        tail = column[2:]
        if ':' in tail:
            field = tuple(unquote(piece) for piece in tail.split(':'))
        else:
            field = unquote(tail)
        fields[field] = load(raw)

    metadata = (fields.get('resource_metadata', {})
                if get_raw_meta else flat_metadata)
    return fields, source_names, meter_pairs, metadata
Example #2
0
def get_sample(sample_filter, limit=None):
    """Scan the ``meter`` table for rows matching a filter and print them.

    The scan arguments (filter string, start/stop row and column list) are
    produced by ``hbase_util.make_sample_query_from_filter``.

    :param sample_filter: filter object passed through to
        ``hbase_util.make_sample_query_from_filter``
    :param limit: maximum number of rows to scan; ``0`` returns immediately
        without touching the table, ``None`` is passed on as "no limit"
    :returns: None; each row key and its cells are written to stdout
    """
    if limit == 0:
        return
    meter_table = conn.table('meter')
    q, start, stop, columns = hbase_util.make_sample_query_from_filter(
        sample_filter, require_meter=False)
    LOG.info("Query Meter Table: %s %s %s %s", q, start, stop, columns)
    gen = meter_table.scan(filter=q, row_start=start, row_stop=stop,
                           limit=limit, columns=columns,
                           include_timestamp=True)
    for row_key, meter in gen:
        # A single pre-formatted string prints identically under the Python 2
        # print statement and the Python 3 print function.
        print("%s %s" % (row_key, meter))
Example #3
0
def main():
    """Export the signed-in account's Telegram dialogs to ``telegram.pkl``.

    Connects a ``TelegramClient``, signs in, collects the rows returned by
    ``list_dialogs`` into a DataFrame with ``ALL_COLUMNS`` as header, tags
    every row with the platform, the account owner's display name and an
    ``'unknown'`` language, then hands the frame to ``utils.export_dataframe``.
    """
    client = TelegramClient('session_name', TELEGRAM_API_ID, TELEGRAM_API_HASH)
    client.connect()
    me = sign_in(client)
    data = list_dialogs(client)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = ALL_COLUMNS
    df['platform'] = 'telegram'

    # Either name part may be unset (presumably `me` is a Telegram user object
    # whose last name is optional -- confirm against sign_in). Skipping empty
    # parts avoids names such as "Alice None".
    name_parts = (me.first_name, me.last_name)
    own_name = ' '.join(part for part in name_parts if part).strip()
    df['senderName'] = own_name

    log.info('Detecting languages...')
    # No detection is performed for this platform; every row is 'unknown'.
    df['language'] = 'unknown'

    utils.export_dataframe(df, 'telegram.pkl')
    log.info('Done.')
Example #4
0
def main():
    """Convert a Google Hangouts JSON archive into ``hangouts.pkl``.

    Reads the archive at ``args.file_path``, collects the first text segment
    of every chat message in two-participant conversations (stopping once
    ``args.max_exported_messages`` rows were collected), tags each
    conversation with a detected language and passes the resulting DataFrame
    to ``utils.export_dataframe``.

    The process exits with status 0 when a message can be attributed neither
    to ``args.own_name`` nor to the conversation partner, or when no message
    was extracted at all.
    """
    args = parse_arguments()
    own_name = args.own_name

    print('Parsing JSON file...')
    with open(args.file_path, encoding='utf-8') as f:
        archive = json.load(f)

    names = {}

    def id_to_name(gaia_id):
        # Unknown ids map to None.
        return names.get(gaia_id)

    def save_name_for_id(name, gaia_id):
        # The first name seen for an id wins; a conflicting later name is
        # only reported, never stored.
        if gaia_id not in names:
            names[gaia_id] = name
        elif names[gaia_id] != name:
            print('Assuming', name, 'is', names[gaia_id])

    data = []
    # Deliberately kept across conversations, as in the original flow.
    conversation_with_id = ''

    print('Extracting messages...')
    for conversation in archive["conversations"]:
        details = conversation["conversation"]
        if "conversation" in details:
            for participant in details["conversation"]["participant_data"]:
                if "fallback_name" in participant:
                    save_name_for_id(participant["fallback_name"],
                                     participant["id"]["gaia_id"])

        for event in conversation["events"]:
            timestamp = int(event["timestamp"])

            is_text_message = (
                "chat_message" in event
                and "segment" in event["chat_message"]["message_content"])
            if not is_text_message:
                continue

            content = event["chat_message"]["message_content"]
            text = content["segment"][0]["text"]
            conversation_id = event["conversation_id"]["id"]
            sender_id = event["sender_id"]["chat_id"]

            participants = details["conversation"]["current_participant"]
            if len(participants) != 2:
                # Only one-to-one conversations are exported.
                continue

            for participant in participants:
                if id_to_name(participant["gaia_id"]) != own_name:
                    conversation_with_id = participant["gaia_id"]

            sender_name = id_to_name(sender_id)
            partner_name = id_to_name(conversation_with_id)

            if sender_name is None and partner_name is None:
                # unknown sender
                print("No senderName for either senderId", sender_id,
                      conversation_with_id)
            else:
                if (sender_name != own_name
                        and sender_id != conversation_with_id):
                    print('Parsing error, is your ownId correct?')
                    exit(0)

                # The raw timestamp is divided by 1e6 before being stored.
                data.append([
                    timestamp / 1000000, conversation_id, partner_name,
                    sender_name, text
                ])

            if len(data) >= args.max_exported_messages:
                break
        else:
            # Event loop ran to completion: go on with the next conversation.
            continue
        # Event loop hit the message limit: stop scanning conversations too,
        # otherwise every remaining conversation would add one more row.
        break

    log.debug('{} messages parsed.'.format(len(data)))

    if not data:
        # Assigning column names to an empty frame would raise.
        print('Nothing to save.')
        exit(0)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = DATAFRAME_COLUMNS
    df['platform'] = 'hangouts'

    log.info('Detecting languages...')
    df['language'] = 'unknown'
    for name, _group in df.groupby(df.conversationWithName):
        in_conversation = df.conversationWithName == name
        texts = df[in_conversation].dropna()['text']

        # Conversations with 10 or fewer complete rows stay 'unknown'.
        if len(texts) <= 10:
            continue

        # Concatenate up to 100 randomly drawn messages (with replacement).
        sample = ''.join(
            texts.iloc[randint(0, len(texts) - 1)]
            for _ in range(min(len(texts), 100)))

        # Detect once so the printed and the stored language always agree.
        language = detect(sample)
        print('\t', name, language)
        df.loc[in_conversation, 'language'] = language

    log.info('Computing dates...')
    df['datetime'] = df['timestamp'].apply(utils.timestamp_to_ordinal)

    print(df.head())
    utils.export_dataframe(df, 'hangouts.pkl')
    log.info('Done.')
Example #5
0
#!/bin/python
import happybase
import hbase_util
from __init__ import log as LOG
from __init__ import SampleFilter

# Module-level HBase connection, created when this module is imported;
# get_sample() below reads this global.
# NOTE(review): host and port are hard-coded -- confirm this is intended
# outside of a one-off test script.
conn = happybase.Connection(host='180.97.185.116',port=9090, compat='0.94', transport='framed')

# Disabled manual check: fetch one row from table 'test2' and print it.
#table = conn.table('test2')

#row = table.row('jacky20151204',include_timestamp=True)
#print row
# Emitted at import time; presumably a smoke test of the logger -- TODO confirm.
LOG.info("test log!!!")

def get_sample(sample_filter, limit=None):
    """Scan the ``meter`` table for rows matching a filter and print them.

    The scan arguments (filter string, start/stop row and column list) are
    produced by ``hbase_util.make_sample_query_from_filter``.

    :param sample_filter: filter object passed through to
        ``hbase_util.make_sample_query_from_filter``
    :param limit: maximum number of rows to scan; ``0`` returns immediately
        without touching the table, ``None`` is passed on as "no limit"
    :returns: None; each row key and its cells are written to stdout
    """
    if limit == 0:
        return
    meter_table = conn.table('meter')
    q, start, stop, columns = hbase_util.make_sample_query_from_filter(
        sample_filter, require_meter=False)
    LOG.info("Query Meter Table: %s %s %s %s", q, start, stop, columns)
    gen = meter_table.scan(filter=q, row_start=start, row_stop=stop,
                           limit=limit, columns=columns,
                           include_timestamp=True)
    for row_key, meter in gen:
        # A single pre-formatted string prints identically under the Python 2
        # print statement and the Python 3 print function.
        print("%s %s" % (row_key, meter))
        #d_meter = hbase_util.deserialize_entry(meter)[0]
        #print d_meter
    #meter_table.delete('row-key',columns=['f:timestamp','f:rts'], timestamp=int)

if __name__=='__main__':
	Data = SampleFilter(end_timestamp="2015-12-06 06:15:00",end_timestamp_op='lt',
Example #6
0
def main():
    """Convert a Facebook Messenger JSON export into ``messenger.pkl``.

    Walks ``args.file_path`` for ``*.json`` files, treating the name of the
    directory containing each file as the conversation id. Conversations with
    more than two participants, or without a ``messages`` / ``participants``
    entry, are skipped. Every message that has both ``content`` and
    ``sender_name`` becomes one row; the rows are passed to
    ``utils.export_dataframe`` as a DataFrame.

    The process exits with status 0 when no message was found.
    """
    args = parse_arguments()

    data = []

    for root, _dirs, files in os.walk(args.file_path):
        for filename in files:
            if not filename.endswith('.json'):
                continue

            # The directory holding the file names the conversation;
            # basename() also copes with non-'/' path separators.
            conversation_id = os.path.basename(root)
            conversation_with_name = None

            document = os.path.join(root, filename)
            # make sure we don't crash if chat logs contain exotic characters:
            # decode explicitly instead of relying on the platform default
            with open(document, encoding='utf-8') as f:
                json_data = json.load(f)

            if "messages" not in json_data or "participants" not in json_data:
                print(
                    "Missing messages or participant list in conversation {}"
                    .format(conversation_id))
                continue

            participants = json_data["participants"]

            if len(participants) < 2:
                # Reported only; the conversation is still exported.
                print(
                    "User with id {} left Facebook, we don't know what their name was."
                    .format(conversation_id))

            if len(participants) > 2:
                # TODO handle group chats
                continue

            for participant in participants:
                if participant['name'] != args.own_name:
                    conversation_with_name = participant['name']

            if conversation_with_name is None:
                # No participant other than the owner: fall back to the id.
                conversation_with_name = conversation_id

            for message in json_data["messages"]:
                timestamp = message["timestamp_ms"]
                if "content" in message and "sender_name" in message:
                    data.append([
                        timestamp, conversation_id, conversation_with_name,
                        message["sender_name"], message["content"]
                    ])

    print(len(data), 'messages parsed.')

    if len(data) < 1:
        print('Nothing to save.')
        exit(0)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = DATAFRAME_COLUMNS
    df['platform'] = 'messenger'

    log.info('Detecting languages...')
    # No detection is performed for this platform; every row is 'unknown'.
    df['language'] = 'unknown'

    log.info('Computing dates...')
    # timestamp_ms is in milliseconds; scale it by 1/1000 before conversion.
    df['datetime'] = df['timestamp'].apply(lambda x: x / 1000).apply(
        utils.timestamp_to_ordinal)

    print(df.head())
    utils.export_dataframe(df, 'messenger.pkl')
    log.info('Done.')