def main():
    topics = importTopics('corpus_min1_stopwdsTrue_all_tfidf_lsi_topics.pkl')
    #topics = importTopics('test_corpus_lsi_topics.pkl')
    #print topics[0][0]
    con, cur = enron.connectDB("enron")
    # the highest id tells us how many emails there are
    cur.execute("select id from emails order by id desc limit 1;")
    res = cur.fetchall()
    tmp = [int(col) for row in res for col in row]
    size = tmp[0]
    #pofD = 1. / float(size)
    #pofT = 1. / 10.
    tot = 0
    for id in range(1, size + 1):  # size is the highest id, so include it
        cur.execute("select text from emails where id = {0}".format(id))
        tmp = cur.fetchall()
        text = enron.cleanString(enron.stripCharacters(tmp[0][0]))
        text_stem = stem.stemmingString(text, id, stopwords=True)
        #topicprob = pofTgivenD(text_stem, topics) * pofD / pofT
        topicprob = pofTgivenD(text_stem, topics)
        tot += topicprob
        if topicprob > 1.:
            print "ERROR: PROBABILITY LARGER THAN 1", id, topicprob
        if id % 1000 == 0:
            print "Email {0} processed, probability sum: {1}".format(id, tot)
            #print "Probability of generating email {0} from this topic set: {1}".format(id, topicprob)
    con.close()
    print "Final sum of probabilities:", tot
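# importTopics is defined elsewhere in the project; a minimal sketch of what
# it might be, assuming the .pkl file simply holds the pickled topic list
# (hypothetical reconstruction, not the project's actual helper):
import cPickle as pickle

def importTopics(filename):
    # load a previously pickled set of LSI topics
    with open(filename, 'rb') as pfile:
        return pickle.load(pfile)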
def sendersAndReceivers(sender):
    print 'Querying database'
    con, cur = enron.connectDB('enron')
    query = """select `sender`,`to`,`cc`,`id` from emails
               where `sender` like '%{0}%'
               or `to` like '%{0}%'
               or `cc` like '%{0}%' limit 1000;""".format(sender)
    print query
    cur.execute(query)
    sendandrec = cur.fetchall()
    con.close()
    return sendandrec
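# Formatting the search term straight into the SQL string works, but a
# parameterised query avoids quoting/injection problems. A sketch of the same
# lookup using MySQLdb-style placeholders (assuming enron.connectDB returns a
# MySQLdb connection/cursor pair, as the rest of the code suggests):
def sendersAndReceiversSafe(sender):
    con, cur = enron.connectDB('enron')
    pattern = '%' + sender + '%'
    cur.execute("""select `sender`,`to`,`cc`,`id` from emails
                   where `sender` like %s or `to` like %s or `cc` like %s
                   limit 1000;""", (pattern, pattern, pattern))
    sendandrec = cur.fetchall()
    con.close()
    return sendandrec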
def indexLinks():
    con, cur = enron.connectDB("enron")
    # unique sender addresses
    cur.execute("select distinct `sender` from `emails`")
    tmp = cur.fetchall()
    tmp = [x[0].strip() for x in tmp]
    tmp_unique = list(set(tmp))
    # `to` fields hold comma-separated lists, so split and flatten them
    cur.execute("select distinct `to` from `emails`")
    tmp1 = cur.fetchall()
    tmp2 = [x[0].strip().split(',') for x in tmp1]
    tmp2 = [item.strip() for sublist in tmp2 for item in sublist]
    tmp2_unique = list(set(tmp2))
    # same for the `cc` fields
    cur.execute("select distinct `cc` from `emails`")
    tmp3 = cur.fetchall()
    tmp4 = [x[0].strip().split(',') for x in tmp3]
    tmp4 = [item.strip() for sublist in tmp4 for item in sublist]
    tmp4_unique = list(set(tmp4))
    # combine, deduplicate and sort all addresses
    all_addresses = tmp_unique + tmp2_unique + tmp4_unique
    all_addresses_unique = list(set(all_addresses))
    all_addresses_unique.sort()
    con.close()
    address_file = open('addresses_all.txt', 'w')
    for idx, email in enumerate(all_addresses_unique):
        address_file.write("""{0}\t"{1}"\n""".format(idx, email))
    address_file.close()
    return list(enumerate(all_addresses_unique))
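# Example use of the returned (index, address) pairs: invert them into an
# address -> index lookup, e.g. for building sender/receiver graph edges.
# The address below is purely illustrative:
#   address_index = dict((addr, idx) for idx, addr in indexLinks())
#   node = address_index['someone@enron.com']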
def main():
    args = parser.parse_args()
    print args
    connection, cursor = enron.connectDB(args.name)
    # the highest id tells us how many rows there are
    cursor.execute("select ID from emails order by id desc limit 1;")
    #cursor.execute("select ID from {0} order by id desc limit 1;".format(args.table))
    numrows = int(cursor.fetchone()[0])
    # loop over rows one at a time: less efficient than operating on the
    # database as a whole, but it won't make your computer slow down and explode
    for id in range(1, numrows + 1):
        # fetch the raw text, clean it and escape it for the UPDATE query
        cursor.execute("select rawtext from emails where id = {0}".format(id))
        rawtext = cursor.fetchone()[0]
        cleantext = ultraClean(rawtext)
        cleantext_escape = mdb.escape_string(cleantext)
        query = """UPDATE emails set {0}='{1}' where `id` = {2};""".format(
            args.column, cleantext_escape, id)
        #query = """UPDATE {0} set {1}='{2}' where `id` = {3};""".format(args.table, args.column, cleantext_escape, id)
        cursor.execute(query)
        connection.commit()
        print 'Updated entry {0}'.format(id)
    connection.close()
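# The mdb.escape_string step above can be dropped by letting the driver bind
# the value itself. A sketch of the same update with a MySQLdb placeholder
# (the column name still has to be formatted in, because placeholders only
# bind values, not identifiers):
#   query = "UPDATE emails set {0}=%s where `id` = %s;".format(args.column)
#   cursor.execute(query, (cleantext, id))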
def main():
    args = parser.parse_args()
    # First: create the DB. If a previous attempt left one behind, delete it
    # by uncommenting the block below.
    #con = mdb.connect('localhost', 'kpmg1', 's2ds')
    #cur = con.cursor()
    #deleteDB(cur, 'enron')
    #con.close()
    createDB()
    connection, cursor = enron.connectDB('enron')
    # start directory is unique for each person
    startdir = args.startdir
    hashlist = []
    duplicate_log = open('duplicate_log.txt', 'w')
    filecount = 0
    duplicate_count = 0
    print 'Walking the directory tree (this takes a while)....'
    for dir, subdir, files in os.walk(startdir):
        for ff in files:
            filepath = os.path.join(dir, ff)
            # hash the message minus the headers that differ between copies
            with open(filepath, 'r') as efile:
                msglines = efile.readlines()
            msg2 = [x for x in msglines
                    if not x.startswith('Message-ID') and not x.startswith('X-Folder')]
            msg2 = ''.join(msg2)
            m = hashlib.md5()
            m.update(msg2)
            if m.hexdigest() not in hashlist:
                hashlist.append(m.hexdigest())
                msg = email.message_from_string(''.join(msglines))
                addDBEntry(connection, cursor, 'emails', msg, filepath)
                filecount += 1
            else:
                # the original dropped the print here, leaving a bare string
                print 'Duplicate message found {0}'.format(filepath)
                duplicate_log.write(m.hexdigest() + '\t' + filepath + '\n')
                duplicate_count += 1
    connection.close()
    duplicate_log.close()
    print '{0} entries added to the database'.format(filecount)
    print '{0} files discounted as duplicates'.format(duplicate_count)
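# Note on the duplicate check above: `m.hexdigest() not in hashlist` scans a
# list, so the walk gets quadratically slower as unique emails accumulate.
# A set gives constant-time membership tests with no other change to the
# logic; a minimal sketch:
#   hashes = set()
#   ...
#   if m.hexdigest() not in hashes:
#       hashes.add(m.hexdigest())
#       ...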
def main():
    args = parser.parse_args()
    # make a fresh output directory, removing any previous run
    rootdir = os.getcwd()
    foldername = args.directory
    folderpath = os.path.join(rootdir, foldername)
    if os.path.exists(folderpath):
        shutil.rmtree(folderpath)
    os.makedirs(folderpath)
    stop_words = enron.getCustomStopwords()
    timinglog = open(os.path.join(folderpath, args.output_timelog), 'w')
    timinglog.write('#Tokeniser Stemmer/Lemmatiser Codetime Writetime\n')
    # NB if you make changes here also do it below for the args/kwargs
    token_command = [
        ["nltk", "f = p.tokenize.WordPunctTokenizer()", "tokenize"],
        ["nltk", "f = p.tokenize.PunktWordTokenizer()", "tokenize"],
        ["gensim", "f = p.utils", "tokenize"],
    ]
    stem_command = [
        ["nltk", "g = q.stem.snowball.EnglishStemmer()", "stem"],
        ["nltk", "g = q.stem.snowball.PorterStemmer()", "stem"],
        ["nltk", "g = q.stem.lancaster.LancasterStemmer()", "stem"],
        ["nltk", "g = q.stem.WordNetLemmatizer()", "lemmatize"],
        ["gensim", "g = q.utils", "lemmatize"],
    ]
    # either draw a new random sample of emails, or reuse an existing id list
    if args.email_list is None:
        print 'Creating random sample'
        text, email_ids = enron.querySample(args.fraction, return_sample=True)
        with open(os.path.join(folderpath, 'email_sample.log'), 'w') as elog:
            for id in email_ids:
                elog.write('{0}\n'.format(id))
    else:
        print 'Using existing sample ids'
        with open(args.email_list, 'r') as einput:
            email_sample = einput.readlines()
        email_sample = [e.strip('\n') for e in email_sample]
        con, cur = enron.connectDB('enron')
        text = []
        for e_id in email_sample:
            cur.execute("select text from emails where id = {0}".format(e_id))
            tmp = cur.fetchall()
            text.append(tmp[0][0])
        con.close()
        # make the email log file anyway
        with open(os.path.join(folderpath, 'email_sample.log'), 'w') as elog:
            elog.write('Email sample duplicated from {0}\n'.format(args.email_list))
            for e_id in email_sample:
                elog.write('{0}\n'.format(e_id))
    text = [t.lower() for t in text]
    text = ' '.join(text)
    if args.abbrev:
        if os.path.exists("word_replace_dic.txt"):
            os.remove("word_replace_dic.txt")
        print "Replacing technical terms..."
        text = words.abbreviations(text, "dic_enron.csv")
    if args.ngrams:
        if os.path.exists("ngrams_found.txt"):
            os.remove("ngrams_found.txt")
        print "Joining ngrams..."
        text = words.ngramsText(text, 3, "bigrams.txt", "trigrams.txt")
    token_args = [text, text, text]
    token_kwargs = [{}, {}, {}]
    stem_kwargs = [{}, {}, {}, {}, {}]
    # loop over every tokeniser/stemmer combination and time each one
    for (tcommand, targ, tkwarg) in zip(token_command, token_args, token_kwargs):
        for (scommand, skwarg) in zip(stem_command, stem_kwargs):
            n1 = tcommand[0]
            n2 = getFunctionName(tcommand[1]) + '.' + tcommand[2]
            n3 = scommand[0]
            n4 = getFunctionName(scommand[1]) + '.' + scommand[2]
            output = os.path.join(folderpath, 'testing_{0}.{1}_{2}.{3}.csv'.format(n1, n2, n3, n4))
            print 'Currently working on {0}.{1} with {2}.{3}'.format(n1, n2, n3, n4)
            start_code = time.time()
            p = __import__(tcommand[0])
            exec tcommand[1]
            text_token = list(getattr(f, tcommand[2])(targ, **tkwarg))
            # tokenising complete; tidy tokens and drop stopwords
            text_token = cleanTokens(text_token)
            text_token = [x for x in text_token if x not in stop_words]
            q = __import__(scommand[0])
            exec scommand[1]
            if scommand[0] == 'gensim':
                text_stem = getattr(g, scommand[2])(unicode(text_token))
            else:
                text_stem = [getattr(g, scommand[2])(word) for word in text_token]
            end_code = time.time()
            codetime = end_code - start_code
            print 'Total time for set-up: {0}'.format(codetime)
            start_write = time.time()
            # `output` already includes folderpath, so open it directly;
            # use a name other than `f`, which holds the tokeniser above
            with open(output, "wb") as outfile:
                writer = csv.writer(outfile)
                writer.writerows([text_stem])
            end_write = time.time()
            writetime = end_write - start_write
            print 'Total time for write out: {0}'.format(writetime)
            timinglog.write("{0}.{1}\t{2}.{3}\t{4}\t{5}\n".format(n1, n2, n3, n4, codetime, writetime))
    timinglog.close()
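# getFunctionName is not defined in this file; a hypothetical sketch, assuming
# it recovers a printable name such as "tokenize.WordPunctTokenizer" from the
# command strings above by stripping the assignment, the "p."/"q." module
# alias and any call parentheses:
def getFunctionName(command):
    rhs = command.split('=', 1)[1].strip()   # e.g. "p.tokenize.WordPunctTokenizer()"
    rhs = rhs.split('(', 1)[0]               # drop the "()" if present
    return rhs.split('.', 1)[1]              # drop the module alias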