コード例 #1
0
import sys, os
import pandas as pd
from gchat_eml import Gchat, locate_eml

# Usage: <script> WORK_DIR CORPUS_FOLDER
#
# Locates the chat .eml files under WORK_DIR (via locate_eml) and, for each
# one, builds a Gchat object and calls its corpus_writer_yearly() with a
# target path of CORPUS_FOLDER + the chat's msg_from_address.
# Processing is best effort: a file that fails is reported and skipped.
if len(sys.argv) < 3:
	sys.exit('usage: %s WORK_DIR CORPUS_FOLDER' % sys.argv[0])

work_dir = sys.argv[1]
corpus_folder = sys.argv[2]

files = locate_eml(work_dir)

# Make sure the folder ends with a separator because the target path below is
# built by plain string concatenation. endswith() is safe on an empty string
# (indexing [-1] is not); an empty folder is left empty, i.e. relative paths.
if corpus_folder and not corpus_folder.endswith('/'):
	corpus_folder += '/'
print(corpus_folder)

total = len(files)
# Report progress about every fifth of the way. The step is clamped to 1 so
# that the modulo below cannot divide by zero when fewer than five files exist.
step = max(1, total // 5)
for (ii, f) in enumerate(files):
	try:
		gc = Gchat(f)
		gc.corpus_writer_yearly(corpus_folder + gc.msg_from_address)
	except Exception as e:
		# Keep going on a bad file, but say which one failed and why.
		# Catching Exception (not a bare except) lets Ctrl-C stop the run.
		print('skipping %s: %s' % (f, e))
	if ii != 0 and ii % step == 0:
		# Real completion ratio, so the figure never exceeds 100.
		print('%d percent finished..' % (100 * ii // total))
コード例 #2
0
import os, re, sys
import MySQLdb, getpass
from metadata_parser import Metadata
from gchat_eml import locate_eml

# Parent directory where all the subchats-N directories are stored;
# using gmvault, this is usually /gmvault-db/db/chats/
work_dir = sys.argv[1]
files = locate_eml(work_dir, 'meta')
print("%d metadata files located" % len(files))

# Output file location and name: optional second command-line argument,
# otherwise a fixed default. Only a missing argument (IndexError) triggers
# the fallback, so unrelated errors are no longer silently swallowed.
try:
	output_file = sys.argv[2]
except IndexError:
	output_file = '/private/tmp/gchat_metadata_store.csv'

# Start the output file afresh with the header row. The with-block closes the
# handle even if the write fails.
with open(output_file, 'w') as f:
	f.write('%s\n' % 'timestamp, gm_id, subject, msg_id, thread_ids, x_gmail_received')

# One fifth of the file count; floor division keeps this an integer under
# both Python 2 and Python 3.
l = len(files) // 5
# Counter initialised here; it is not updated anywhere in the visible code.
corrupted = 0

# For every located metadata file: read its first line, wrap it in a Metadata
# object and call write_file() with the output file path.
for (ii, metadata_file) in enumerate(files):
	# line = open(work_dir + metadata_file, 'r').next()
	# Only the first line of the file is read (Python 2 file-iterator .next()).
	# NOTE(review): the handle is never closed explicitly, and an empty file
	# would raise StopIteration here.
	line = open(metadata_file, 'r').next()
	m = Metadata(line)
	# presumably appends the parsed record to output_file -- confirm against
	# metadata_parser.Metadata.write_file
	a = m.write_file(output_file)
	# A truthy return value is reported as a corrupted file.
	if a:
		print 'file was corrupted'