Ejemplo n.º 1
0
def main():

    topics=importTopics('corpus_min1_stopwdsTrue_all_tfidf_lsi_topics.pkl')
    #topics=importTopics('test_corpus_lsi_topics.pkl')
    #print topics[0][0]

    con, cur=enron.connectDB("enron")

    cur.execute("select id from emails order by id desc limit 1;")
    res = cur.fetchall()
    tmp = [int(col) for row in res for col in row]
    size=tmp[0]

    #pofD=1./float(size)
    #pofT=1./10.

    tot=0
    for id in range(1,size):
        cur.execute(" select text from emails where id = {0} ".format(id))
        tmp = cur.fetchall()
        text = enron.cleanString(enron.stripCharacters(tmp[0][0]))
        text_stem = stem.stemmingString(text, id, stopwords=True)
        #topicprob=pofTgivenD(text_stem,topics)*pofD/pofT 
        topicprob=pofTgivenD(text_stem,topics)     
        tot+=topicprob
        if topicprob>1.: print "ERROR: PROBABILITY LARGER THAN 1",id, topicprob
        if id % 1000 == 0: print "Email {0} processed, probability sum: {1}".format(id,tot)
        #print "Probability of generating email {0} from this topic set: {1}".format(id,topicprob)
    con.close()


    print "Final sum of probabilities:",tot
Ejemplo n.º 2
0
def sendersAndReceivers(sender):

    print 'Querying database'
    
    con,cur = enron.connectDB('enron')
    query = """select `sender`,`to`,`cc`,`id` from emails where `sender` like '%{0}%' or `to` like '%{0}%' 
            or `cc` like '%{0}%' limit 1000;""".format(sender)
    print query
    cur.execute(query)
    sendandrec = cur.fetchall()
    con.close()

    return sendandrec
Ejemplo n.º 3
0
def sendersAndReceivers(sender):

    print 'Querying database'

    con, cur = enron.connectDB('enron')
    query = """select `sender`,`to`,`cc`,`id` from emails where `sender` like '%{0}%' or `to` like '%{0}%' 
            or `cc` like '%{0}%' limit 1000;""".format(sender)
    print query
    cur.execute(query)
    sendandrec = cur.fetchall()
    con.close()

    return sendandrec
Ejemplo n.º 4
0
def indexLinks():

    con, cur=enron.connectDB("enron")
    cur.execute("select distinct `sender` from `emails`")

    tmp=cur.fetchall()
    tmp=[x[0].strip() for x in tmp]

    tmp_unique = list(set(tmp))
    
    cur.execute("select distinct `to` from `emails`")
    tmp1=cur.fetchall()
    
    print tmp1[0]
    #tmp1=[element.split(',') for element[0] in tmp1]
    #tmp1=(element for str(element).split(', ') in tmp1)

    tmp2 = [x[0].strip().split(',') for x in tmp1]
    tmp2 = [item.strip() for sublist in tmp2 for item in sublist]
    tmp2_unique = list(set(tmp2))

    cur.execute("select distinct `cc` from `emails`")
    tmp3=cur.fetchall()
    
    
    tmp4 = [x[0].strip().split(',') for x in tmp3]
    tmp4 = [item.strip() for sublist in tmp4 for item in sublist]
    tmp4_unique = list(set(tmp4))

    all_addresses = tmp_unique + tmp2_unique + tmp4_unique
    all_addresses_unique = list(set(all_addresses))
    all_addresses_unique.sort()
    con.close()
    address_file = open('addresses_all.txt', 'w')

    for idx,email in enumerate(all_addresses_unique):
        address_file.write("""{0}\t"{1}"\n""".format(idx,email))

    address_file.close()
        
    return list(enumerate(all_addresses_unique))
Ejemplo n.º 5
0
def indexLinks():

    con, cur = enron.connectDB("enron")
    cur.execute("select distinct `sender` from `emails`")

    tmp = cur.fetchall()
    tmp = [x[0].strip() for x in tmp]

    tmp_unique = list(set(tmp))

    cur.execute("select distinct `to` from `emails`")
    tmp1 = cur.fetchall()

    print tmp1[0]
    #tmp1=[element.split(',') for element[0] in tmp1]
    #tmp1=(element for str(element).split(', ') in tmp1)

    tmp2 = [x[0].strip().split(',') for x in tmp1]
    tmp2 = [item.strip() for sublist in tmp2 for item in sublist]
    tmp2_unique = list(set(tmp2))

    cur.execute("select distinct `cc` from `emails`")
    tmp3 = cur.fetchall()

    tmp4 = [x[0].strip().split(',') for x in tmp3]
    tmp4 = [item.strip() for sublist in tmp4 for item in sublist]
    tmp4_unique = list(set(tmp4))

    all_addresses = tmp_unique + tmp2_unique + tmp4_unique
    all_addresses_unique = list(set(all_addresses))
    all_addresses_unique.sort()
    con.close()
    address_file = open('addresses_all.txt', 'w')

    for idx, email in enumerate(all_addresses_unique):
        address_file.write("""{0}\t"{1}"\n""".format(idx, email))

    address_file.close()

    return list(enumerate(all_addresses_unique))
Ejemplo n.º 6
0
def main():


    args = parser.parse_args()
    print args

    connection, cursor = enron.connectDB(args.name)

    cursor.execute("select ID from emails order by id desc limit 1;")
    #cursor.execute("select ID from {0} order by id desc limit 1;".format(args.table))
    numrows = int(cursor.fetchone()[0])

    #loop over number of rows
    #this is less efficient than operating on the database as a whole
    #but it won't make your computer slow down and explode

    
    for id in range(1, numrows+1):
        
        #fetch the rawtext

        cursor.execute("select rawtext from emails where id = {0}".format(id))
        rawtext = cursor.fetchone()[0]

        cleantext = ultraClean(rawtext)

        cleantext_escape = mdb.escape_string(cleantext)

        query = """UPDATE emails set {0}='{1}' where `id` =  {2};""".format(args.column, cleantext_escape, id)
        #query = """UPDATE {0} set {1}='{2}' where `id` =  {3};""".format(args.table, args.column, cleantext_escape, id)
        
        cursor.execute(query)
        connection.commit()

        print 'Updated entry {0}'.format(id)


    connection.close()
Ejemplo n.º 7
0
def main():

    args = parser.parse_args()
    print args

    connection, cursor = enron.connectDB(args.name)

    cursor.execute("select ID from emails order by id desc limit 1;")
    #cursor.execute("select ID from {0} order by id desc limit 1;".format(args.table))
    numrows = int(cursor.fetchone()[0])

    #loop over number of rows
    #this is less efficient than operating on the database as a whole
    #but it won't make your computer slow down and explode

    for id in range(1, numrows + 1):

        #fetch the rawtext

        cursor.execute("select rawtext from emails where id = {0}".format(id))
        rawtext = cursor.fetchone()[0]

        cleantext = ultraClean(rawtext)

        cleantext_escape = mdb.escape_string(cleantext)

        query = """UPDATE emails set {0}='{1}' where `id` =  {2};""".format(
            args.column, cleantext_escape, id)
        #query = """UPDATE {0} set {1}='{2}' where `id` =  {3};""".format(args.table, args.column, cleantext_escape, id)

        cursor.execute(query)
        connection.commit()

        print 'Updated entry {0}'.format(id)

    connection.close()
Ejemplo n.º 8
0
def main():
 
    args = parser.parse_args()

    #First thing: create the DB

    # Check that the DB doesn't already exist, and if it does, delete it. Comment this later, I just inserted this for now in case more than one attempt to create the DB was required.
    #con = mdb.connect('localhost', 'kpmg1', 's2ds')
    #cur=con.cursor()
    #deleteDB(cur,'enron')
    #con.close()


    createDB()
    connection, cursor = enron.connectDB('enron')




    #start directory is unique for each person

    startdir = args.startdir


    hashlist=[]

    duplicate_log = open('duplicate_log.txt', 'w')

    filecount = 0
    duplicate_count = 0


    print 'Walking the directory tree (this takes a while)....'

    for dir,subdir,files in os.walk(startdir):

        for ff in files:

            filepath = os.path.join(dir,ff)

            #calculate hash

            with open(filepath, 'r') as efile:
                msglines = efile.readlines()
            msg2 = [x for x in msglines if not x.startswith('Message-ID') and not x.startswith('X-Folder')]
            msg2 = ''.join(msg2)
            m = hashlib.md5()
            m.update(msg2)

            if m.hexdigest() not in hashlist:

                hashlist.append(m.hexdigest())

                msg = email.message_from_string(''.join(msglines))

                addDBEntry(connection,cursor, 'emails', msg, filepath)
                filecount+=1


            else:

                'Duplicate message found {0}'.format(filepath)
                duplicate_log.write(m.hexdigest()+'\t'+filepath+'\n')
                duplicate_count+=1



    connection.close()
    duplicate_log.close()

    print '{0} entries added to the database'.format(filecount)
    print '{0} files discounted as duplicates'.format(duplicate_count)
Ejemplo n.º 9
0
def main():
    """Time every tokeniser/stemmer combination on a sample of email text.

    Steps, in order:
      1. Recreate the output folder ``args.directory`` under the current
         working directory (an existing folder is deleted first).
      2. Obtain the sample text: a fresh sample from ``enron.querySample``
         when ``args.email_list`` is None, otherwise the emails whose ids are
         listed one per line in the file ``args.email_list``.  The ids used
         are written to ``email_sample.log`` in the output folder.
      3. Lower-case the texts and join them into one string; optionally
         apply ``words.abbreviations`` (``args.abbrev``) and
         ``words.ngramsText`` (``args.ngrams``).
      4. For each entry of ``token_command`` combined with each entry of
         ``stem_command``: tokenise, clean the tokens, drop stop words,
         stem/lemmatise, write the result to a CSV file, and append the two
         measured durations to the timing log ``args.output_timelog``.
    """

    args = parser.parse_args()

    rootdir=os.getcwd()
    foldername=args.directory
    folderpath=os.path.join(rootdir,foldername)
    # Start from an empty output folder: remove any previous one first.
    if (os.path.exists(folderpath)==True):
        shutil.rmtree(folderpath)
        os.makedirs(folderpath)
    else:
        os.makedirs(folderpath)

    stop_words = enron.getCustomStopwords()


    

    timinglog = open(os.path.join(folderpath,args.output_timelog), 'w')

    timinglog.write('#Tokeniser Stemmer/Lemmatiser Codetime Writetime\n')
    
    
    
    # Each command is [module name, set-up statement, method name]: the module
    # is imported with __import__, the statement is run with exec, and the
    # method is looked up with getattr (see the loop at the bottom).
    # NB if you make changes here also do it below for the args/kwargs


    token_command = [
    				["nltk", "f = p.tokenize.WordPunctTokenizer()", "tokenize"],
    				["nltk", "f = p.tokenize.PunktWordTokenizer()", "tokenize"],
    		      	["gensim", "f = p.utils", "tokenize"]

    ]
    
    stem_command = [
    				["nltk", "g = q.stem.snowball.EnglishStemmer()", "stem"],
    				["nltk", "g = q.stem.snowball.PorterStemmer()", "stem"],
    				["nltk", "g = q.stem.lancaster.LancasterStemmer()", "stem"],
    	       	   	["nltk", "g = q.stem.WordNetLemmatizer()", "lemmatize"],
    				["gensim", "g = q.utils", "lemmatize"]
    ]


    # Either get text as new random sample, or use existing list

    if (args.email_list==None):

        print 'Creating random sample'

        text, email_ids = enron.querySample(args.fraction, return_sample = True)

        # Record which emails were sampled.
        with open(os.path.join(folderpath,'email_sample.log'), 'w') as elog:

            for id in email_ids:

                elog.write('{0}\n'.format(id))

    else:

        print 'Using existing sample ids'

        with open(args.email_list, 'r') as einput:

            email_sample = einput.readlines()

        # One id per line; only the trailing newline is removed.
        email_sample = [e.strip('\n') for e in email_sample]

        con,cur = enron.connectDB('enron')

        text = []

        for e_id in email_sample:

            cur.execute(" select text from emails where id = {0} ".format(e_id))
            tmp=cur.fetchall()
            text.append(tmp[0][0])

        con.close()


        # make email log file anyway

        with open(os.path.join(folderpath,'email_sample.log'), 'w') as elog:

            elog.write('Email sample duplicated from {0}\n'.format(args.email_list))

            for e_id in email_sample:

                elog.write('{0}\n'.format(e_id))

        



    # From here on the whole sample is handled as one lower-case string.
    text = [t.lower() for t in text]

    text = ' '.join(text)

    if (args.abbrev == True):

        # Remove the file left by a previous run before replacing terms.
        if os.path.exists("word_replace_dic.txt"):
            os.remove("word_replace_dic.txt")

        print "Replacing technical terms..."
        text=words.abbreviations(text,"dic_enron.csv")




    if (args.ngrams == True):

        # Remove the file left by a previous run before joining ngrams.
        if os.path.exists("ngrams_found.txt"):
            os.remove("ngrams_found.txt")
        print "Joining ngrams..."
        text=words.ngramsText(text,3,"bigrams.txt","trigrams.txt")


    # Positional argument and keyword arguments for each tokeniser; these
    # lists are zipped with token_command, so they must stay the same length.
    token_args = [
                text,
                text, 
                text
                ]
    token_kwargs = [
                {},
                {},
                {}
                ]
    
    # One entry per stem_command element.
    # NOTE(review): skwarg is unpacked in the loop below but never used.
    stem_kwargs = [
                {}, 
                {}, 
                {}, 
                {}, 
                {}
                ]

    
    # loop over each version


    for (tcommand, targ, tkwarg) in zip(token_command, token_args, token_kwargs):
    

        for (scommand, skwarg) in zip(stem_command, stem_kwargs):

            # Name parts used for the output file name and the log line.
            n1 = tcommand[0]
            n2 = getFunctionName(tcommand[1])+'.'+tcommand[2]
            n3 = scommand[0]
            n4 = getFunctionName(scommand[1])+'.'+scommand[2]
    
            output = os.path.join(folderpath,'testing_{0}.{1}_{2}.{3}.csv'.format(n1,n2,n3,n4))

            print 'Currently working on {0}.{1} with {2}.{3}'.format(n1,n2,n3,n4)
        

            # The code timer covers import, set-up, tokenising, cleaning,
            # stop-word removal and stemming.
            start_code = time.time()

            # The set-up statement refers to the module as "p" and binds the
            # tokeniser object to the local name "f".
            p = __import__(tcommand[0])
            exec tcommand[1]
            text_token = list(getattr(f, tcommand[2])(targ,**tkwarg))

            # tokenising complete

            text_token = cleanTokens(text_token)

            text_token = [x for x in text_token if x not in stop_words]
    
            # Same scheme for the stemmer: module as "q", object bound to "g".
            q = __import__(scommand[0])
            exec scommand[1]
    
            if scommand[0] == 'gensim':
    
                # unicode() of a list is the text of the list's repr.
                # NOTE(review): confirm that this is the input intended for
                # gensim's lemmatize.
            	text_stem  = getattr(g, scommand[2])(unicode(text_token))
    
            
            else:

    
                # The nltk stemmers are applied one token at a time.
            	text_stem = [getattr(g, scommand[2])(word) for word in text_token]
    
    
            
    
            end_code = time.time()

            codetime = end_code - start_code

            print 'Total time for set-up: {0}'.format(codetime)

            start_write = time.time()

            # "output" already starts with folderpath, which is absolute (it
            # is built from os.getcwd()), so this join returns it unchanged.
            # The "as f" rebinds the tokeniser name f; the exec above
            # re-creates it at the start of the next iteration.
            with open(os.path.join(folderpath,output), "wb") as f:
                writer = csv.writer(f)  
                # The whole result is written as a single CSV row.
                writer.writerows([text_stem])

            end_write = time.time()

            writetime = end_write - start_write
    
            print 'Total time for write out: {0}'.format(writetime)

            timinglog.write("{0}.{1}\t{2}.{3}\t{4}\t{5}\n".format(n1, n2, n3, n4, codetime, writetime))

    timinglog.close()
Ejemplo n.º 10
0
def main():

    args = parser.parse_args()

    #First thing: create the DB

    # Check that the DB doesn't already exist, and if it does, delete it. Comment this later, I just inserted this for now in case more than one attempt to create the DB was required.
    #con = mdb.connect('localhost', 'kpmg1', 's2ds')
    #cur=con.cursor()
    #deleteDB(cur,'enron')
    #con.close()

    createDB()
    connection, cursor = enron.connectDB('enron')

    #start directory is unique for each person

    startdir = args.startdir

    hashlist = []

    duplicate_log = open('duplicate_log.txt', 'w')

    filecount = 0
    duplicate_count = 0

    print 'Walking the directory tree (this takes a while)....'

    for dir, subdir, files in os.walk(startdir):

        for ff in files:

            filepath = os.path.join(dir, ff)

            #calculate hash

            with open(filepath, 'r') as efile:
                msglines = efile.readlines()
            msg2 = [
                x for x in msglines if not x.startswith('Message-ID')
                and not x.startswith('X-Folder')
            ]
            msg2 = ''.join(msg2)
            m = hashlib.md5()
            m.update(msg2)

            if m.hexdigest() not in hashlist:

                hashlist.append(m.hexdigest())

                msg = email.message_from_string(''.join(msglines))

                addDBEntry(connection, cursor, 'emails', msg, filepath)
                filecount += 1

            else:

                'Duplicate message found {0}'.format(filepath)
                duplicate_log.write(m.hexdigest() + '\t' + filepath + '\n')
                duplicate_count += 1

    connection.close()
    duplicate_log.close()

    print '{0} entries added to the database'.format(filecount)
    print '{0} files discounted as duplicates'.format(duplicate_count)