def deserialize_entry(entry, get_raw_meta=True):
    """Decode one raw HBase row dict into usable structures.

    :param entry: dict of raw column -> value pairs for one row.  Column
        qualifiers use the prefixes ``f:s_`` (sources), ``f:r_metadata.``
        (flattened resource metadata) and ``f:m_`` (meters); every other
        column becomes a plain result field.
    :param get_raw_meta: when True, return the raw ``resource_metadata``
        value found among the result fields; when False, return the dict
        built from the individual ``f:r_metadata.*`` columns instead.
    :returns: tuple ``(flatten_result, sources, meters, metadata)``.
    """
    flatten_result = {}
    sources = []
    meters = []
    metadata_flattened = {}
    for k, v in entry.items():
        if k.startswith('f:s_'):
            sources.append(decode_unicode(k[4:]))
        elif k.startswith('f:r_metadata.'):
            qualifier = decode_unicode(k[len('f:r_metadata.'):])
            metadata_flattened[qualifier] = load(v)
        elif k.startswith("f:m_"):
            # Meter columns encode quoted parts joined by ':' after the prefix.
            meter = ([unquote(i) for i in k[4:].split(':')], load(v))
            meters.append(meter)
        else:
            # Lazy %-args: the message is only rendered if INFO is enabled
            # (was an eagerly formatted "..." % (...) string before).
            LOG.info("v=%s type(%s)", v, type(v))
            # A ':' in the qualifier (past the "f:" prefix) marks a
            # composite key; otherwise the key is a single unquoted string.
            if ':' in k[2:]:
                key = tuple(unquote(i) for i in k[2:].split(':'))
            else:
                key = unquote(k[2:])
            flatten_result[key] = load(v)
    if get_raw_meta:
        metadata = flatten_result.get('resource_metadata', {})
    else:
        metadata = metadata_flattened
    return flatten_result, sources, meters, metadata
def get_sample(sample_filter, limit=None): if limit == 0: return meter_table = conn.table('meter') q, start, stop, columns = (hbase_util.make_sample_query_from_filter (sample_filter, require_meter=False)) LOG.info("Query Meter Table: %s %s %s %s" % (q,start, stop, columns)) gen = meter_table.scan(filter=q, row_start=start, row_stop=stop, limit=limit, columns=columns,include_timestamp=True) for ignored, meter in gen: print ignored, meter
def main():
    """Export the signed-in user's Telegram dialogs to ``telegram.pkl``."""
    tg = TelegramClient('session_name', TELEGRAM_API_ID, TELEGRAM_API_HASH)
    tg.connect()
    me = sign_in(tg)
    dialogs = list_dialogs(tg)

    log.info('Converting to DataFrame...')
    frame = pd.DataFrame(dialogs)
    frame.columns = ALL_COLUMNS
    frame['platform'] = 'telegram'
    # Every exported message was sent by the account owner.
    full_name = '{} {}'.format(me.first_name, me.last_name).strip()
    frame['senderName'] = full_name

    log.info('Detecting languages...')
    frame['language'] = 'unknown'

    utils.export_dataframe(frame, 'telegram.pkl')
    log.info('Done.')
def main():
    """Parse a Google Hangouts Takeout JSON archive into ``hangouts.pkl``.

    Reads the archive at ``args.file_path``, extracts messages from 1:1
    conversations (group chats are skipped), resolves participant ids to
    their fallback names, detects each conversation's language and
    exports the resulting DataFrame.
    """
    args = parse_arguments()
    own_name = args.own_name

    print('Parsing JSON file...')
    with open(args.file_path, encoding='utf-8') as f:
        archive = json.loads(f.read())

    # gaia_id -> fallback name, filled while walking the archive.
    names = {}

    def id_to_name(gaia_id):
        # Renamed parameter: the old name shadowed the builtin `id`.
        return names.get(gaia_id)

    def save_name_for_id(name, gaia_id):
        # Keep the first name seen for an id; report later conflicts.
        if gaia_id not in names:
            names[gaia_id] = name
        elif names[gaia_id] != name:
            print('Assuming', name, 'is', names[gaia_id])

    data = []
    conversation_with_id = ''
    print('Extracting messages...')
    for conversation in archive["conversations"]:
        if "conversation" in conversation["conversation"]:
            for participant in conversation["conversation"]["conversation"][
                    "participant_data"]:
                if "fallback_name" in participant:
                    save_name_for_id(participant["fallback_name"],
                                     participant["id"]["gaia_id"])
        for event in conversation["events"]:
            timestamp = int(event["timestamp"])
            # Only plain chat messages carry a "segment" payload.
            if "chat_message" in event and "segment" in event["chat_message"][
                    "message_content"]:
                content = event["chat_message"]["message_content"]
                text = content["segment"][0]["text"]
                conversationId = event["conversation_id"]["id"]
                sender_id = event["sender_id"]["chat_id"]
                participants = conversation["conversation"]["conversation"][
                    "current_participant"]
                # Only 1:1 conversations are exported.
                if len(participants) == 2:
                    for participant in participants:
                        if id_to_name(participant["gaia_id"]) != own_name:
                            conversation_with_id = participant["gaia_id"]
                    if (id_to_name(sender_id) is not None
                            or id_to_name(conversation_with_id) is not None):
                        if (id_to_name(sender_id) != own_name
                                and sender_id != conversation_with_id):
                            # NOTE(review): error path, yet exits with
                            # status 0 — confirm that is intended.
                            print('Parsing error, is your ownId correct?')
                            exit(0)
                        # saves the message; Hangouts timestamps are in
                        # microseconds, convert to seconds.
                        timestamp = timestamp / 1000000
                        data.append([
                            timestamp, conversationId,
                            id_to_name(conversation_with_id),
                            id_to_name(sender_id), text
                        ])
                    else:
                        # unknown sender
                        print("No senderName for either senderId",
                              sender_id, conversation_with_id)
                    if len(data) >= args.max_exported_messages:
                        break

    log.debug('{} messages parsed.'.format(len(data)))

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = DATAFRAME_COLUMNS
    df['platform'] = 'hangouts'

    log.info('Detecting languages...')
    df['language'] = 'unknown'
    for name, _group in df.groupby(df.conversationWithName):
        sample = ''
        df2 = df[df.conversationWithName == name].dropna()
        # Need enough text for langdetect to be reliable.
        if len(df2) > 10:
            for _ in range(0, min(len(df2), 100)):
                sample = sample + df2.iloc[randint(0, len(df2) - 1)]['text']
            # Call detect() once: it is not guaranteed deterministic, so
            # the logged value and the stored value must come from the
            # same call (the old code called it twice).
            lang = detect(sample)
            print('\t', name, lang)
            df.loc[df.conversationWithName == name, 'language'] = lang

    log.info('Computing dates...')
    df['datetime'] = df['timestamp'].apply(utils.timestamp_to_ordinal)
    print(df.head())
    utils.export_dataframe(df, 'hangouts.pkl')
    log.info('Done.')
#!/bin/python import happybase import hbase_util from __init__ import log as LOG from __init__ import SampleFilter conn = happybase.Connection(host='180.97.185.116',port=9090, compat='0.94', transport='framed') #table = conn.table('test2') #row = table.row('jacky20151204',include_timestamp=True) #print row LOG.info("test log!!!") def get_sample(sample_filter, limit=None): if limit == 0: return meter_table = conn.table('meter') q, start, stop, columns = (hbase_util.make_sample_query_from_filter (sample_filter, require_meter=False)) LOG.info("Query Meter Table: %s %s %s %s" % (q,start, stop, columns)) gen = meter_table.scan(filter=q, row_start=start, row_stop=stop, limit=limit, columns=columns,include_timestamp=True) for ignored, meter in gen: print ignored, meter #d_meter = hbase_util.deserialize_entry(meter)[0] #print d_meter #meter_table.delete('row-key',columns=['f:timestamp','f:rts'], timestamp=int) if __name__=='__main__': Data = SampleFilter(end_timestamp="2015-12-06 06:15:00",end_timestamp_op='lt',
def main():
    """Parse a Facebook Messenger export tree into ``messenger.pkl``.

    Walks ``args.file_path`` for per-conversation ``*.json`` files,
    extracts text messages from 1:1 conversations (group chats skipped)
    and exports the resulting DataFrame.
    """
    args = parse_arguments()
    data = []
    # make sure we don't crash if chat logs contain exotic characters
    for root, _dirs, files in os.walk(args.file_path):
        for filename in files:
            if not filename.endswith('.json'):
                continue
            # The containing directory name identifies the conversation.
            conversation_id = root.split('/')[-1]
            conversation_with_name = None
            document = os.path.join(root, filename)
            with open(document) as f:
                json_data = json.load(f)
            if "messages" not in json_data or "participants" not in json_data:
                print(
                    "Missing messages or participant list in conversation {}"
                    .format(conversation_id))
                continue
            participants = json_data["participants"]
            if len(participants) < 2:
                print(
                    "User with id {} left Facebook, we don't know what their name was."
                    .format(conversation_id))
            if len(participants) > 2:
                # TODO handle group chats
                continue
            for participant in participants:
                if participant['name'] != args.own_name:
                    conversation_with_name = participant['name']
            if conversation_with_name is None:
                conversation_with_name = conversation_id
            for message in json_data["messages"]:
                timestamp = message["timestamp_ms"]
                if "content" in message and "sender_name" in message:
                    content = message["content"]
                    # BUG FIX: the old inner `if "sender_name" in message`
                    # re-checked the outer condition, so its else branch
                    # (falling back to conversation_id) was unreachable.
                    sender_name = message["sender_name"]
                    data.append([
                        timestamp, conversation_id, conversation_with_name,
                        sender_name, content
                    ])
    print(len(data), 'messages parsed.')
    if len(data) < 1:
        print('Nothing to save.')
        exit(0)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = DATAFRAME_COLUMNS
    df['platform'] = 'messenger'

    log.info('Detecting languages...')
    df['language'] = 'unknown'

    log.info('Computing dates...')
    # timestamp_ms is in milliseconds, convert to seconds first.
    df['datetime'] = df['timestamp'].apply(lambda x: x / 1000).apply(
        utils.timestamp_to_ordinal)
    print(df.head())
    utils.export_dataframe(df, 'messenger.pkl')
    log.info('Done.')