Esempio n. 1
0
File: tfidf.py Progetto: fsosa/lab6
def sender_term_pairs(email):
  """Return one {'sender', 'term'} dict per term found in the email text.

  The sender address is replaced by its entry in the broadcast
  ``email_to_master`` mapping when one exists; otherwise the address
  from the email is used unchanged.
  """
  original = email['sender']
  masters = email_to_master.value
  canonical = masters[original] if original in masters else original

  def as_pair(term):
    return {'sender': canonical, 'term': term}

  return map(as_pair, get_terms(email['text']))
Esempio n. 2
0
def parse_file(filename):
    """Print every term in the file together with its occurrence count.

    Each line of ``filename`` is decoded with ``JSONValueProtocol.read``;
    the value half of the decoded pair is treated as an email dict whose
    'text' entry is split into terms by ``get_terms``.

    filename: path of the file to read, one encoded record per line.
    Returns None; results are written to standard output as "term count".
    """
    words = defaultdict(int)

    # Named email_file (not `input`) so the builtin is not shadowed.
    with open(filename) as email_file:
        for line in email_file:
            email = JSONValueProtocol.read(line)[1]
            for term in get_terms(email['text']):
                words[term] += 1

    # Reporting needs only the tallies, so the file is already closed here.
    for word, count in words.items():
        # Single-argument form prints "word count" on Python 2 and 3 alike.
        print("%s %s" % (word, count))
Esempio n. 3
0
    def mapper(self, key, email):
        """Emit (term, 1) once for each distinct term in the email text.

        key: unused.
        email: dict whose 'text' entry is passed to ``get_terms``.
        """
        # Only iterate over the unique terms to avoid double-counting
        # a term that appears several times in the same email.
        terms = get_terms(email['text'])
        unique_terms = set(terms)
        for term in unique_terms:
            yield term, 1
Esempio n. 4
0
 def mapper(self, key, email):
     """Emit ((sender, term), 1) for every term occurrence in the email text.

     key: unused.
     email: dict providing the 'text' and 'sender' entries.
     """
     for word in get_terms(email['text']):
         sender_term = (email['sender'], word)
         yield sender_term, 1
Esempio n. 5
0
 def mapper(self, key, email):
     """Emit (term, 1) once for each distinct term in the email text.

     key: unused.
     email: dict whose 'text' entry is passed to ``get_terms``.
     """
     # De-duplicate so a term repeated inside one email is emitted once.
     for unique_term in set(get_terms(email['text'])):
         yield unique_term, 1
Esempio n. 6
0
 def mapper(self, key, email):
     """Emit a count of 1 keyed by (sender, term) for each term occurrence.

     key: unused.
     email: dict providing the 'text' and 'sender' entries.
     """
     body_terms = get_terms(email['text'])
     for body_term in body_terms:
         yield (email['sender'], body_term), 1
Esempio n. 7
0
	def map_counts(self, key, email):
	    """Emit ({'term', 'author'}, 1) for every term occurrence in an email.

	    key: unused.
	    email: dict providing the 'text' and 'sender' entries.
	    """
	    # First step: count (term, author) occurrences.
	    for word in get_terms(email['text']):
	        term_author = {'term': word, 'author': email['sender']}
	        yield term_author, 1
Esempio n. 8
0
 def mapper(self, key, email):
     """Emit (term, 1) once per distinct term found in the email text.

     key: unused.
     email: dict whose 'text' entry is passed to ``get_terms``.
     """
     distinct_terms = set(get_terms(email['text']))
     for distinct_term in distinct_terms:
         yield distinct_term, 1
Esempio n. 9
0
 def mapper(self, key, email):
     """Emit ({"term", "sender"}, 1) for every term occurrence in an email.

     key: unused.
     email: dict providing the "text" and "sender" entries.
     """
     for word in get_terms(email["text"]):
         composite_key = {"term": word, "sender": email["sender"]}
         yield composite_key, 1
Esempio n. 10
0
def term_sender_pairs(email):
	"""Return one {'term', 'sender'} dict per term found in the email text.

	The sender is looked up in the broadcast ``send_to_master`` mapping;
	an address with no entry there is used as its own master.
	"""
	original = email['sender']
	# Fall back to the address itself when no master is recorded for it.
	canonical = send_to_master.value.get(original, original)

	def as_pair(term):
		return {'term': term, 'sender': canonical}

	return map(as_pair, get_terms(email['text']))
Esempio n. 11
0
### Disambiguate ###
# Collect every distinct sender address that appears in the corpus.
unique_senders = json_lay.map(lambda x: x['sender']).distinct()

# Pair each address with its local part (the text before the '@') and group
# the pairs by the first character of that local part, over 500 partitions.
sorted_names = (unique_senders
                .map(lambda x: (x, x.split('@')[0]))
                .groupBy(lambda x: x[1][0], 500))

# consolidate_emails (defined elsewhere) is expected to yield
# (master, [addresses]) records; invert them into an address -> master dict.
combined = (sorted_names
            .flatMap(consolidate_emails)
            .flatMap(lambda x: map(lambda y: (y, x[0]), x[1]))
            .collectAsMap())

# Broadcast the mapping so every worker can resolve a sender to its master.
send_to_master = sc.broadcast(combined)


### Calculate per-term IDF ###
# NOTE(review): presumably the number of emails in the corpus -- confirm.
TOTAL_DOCS = 516893.0

# Count each term once per email (document frequency). Counting every
# occurrence instead would inflate the denominator and could push the
# logarithm below zero for very common terms.
term_count = (json_lay
              .flatMap(lambda email: set(get_terms(email['text'])))
              .map(lambda term: (term, 1))
              .reduceByKey(add))
per_term_idf = term_count.map(lambda term: (term[0], math.log(TOTAL_DOCS / term[1]))).cache()

### Get term/sender pairs ###
# One {'term', 'sender'} dict per term occurrence, grouped by term.
term_sender_pairing = json_lay.flatMap(term_sender_pairs).groupBy(lambda x: x['term'], 500)

### Find sender-term freq ###
# sender_term_freq is defined elsewhere; its output is cached for reuse.
sender_tf = term_sender_pairing.flatMap(sender_term_freq).cache()

def tfidf_map(sender_tf):
	"""Turn one nested (term, ((sender, count), freq)) record into a dict.

	sender_tf: tuple shaped as (term, ((sender, count), freq)).
	Returns {'sender': sender, 'term': term, 'tf-idf': count * freq}.
	"""
	term, ((sender, count), freq) = sender_tf
	# The original literal lacked the comma after `sender`, a SyntaxError.
	return {'sender': sender, 'term': term, 'tf-idf': count * freq}

### Find TF-IDF ###
Esempio n. 12
0
	def mapper1(self, key, email):
		"""Emit ({'sender', 'term'}, 1) for every term occurrence in an email.

		key: unused.
		email: dict providing the 'text' and 'sender' entries.
		"""
		# Per the original note, get_terms gives back a list of terms.
		occurrences = get_terms(email['text'])
		for word in occurrences:
			yield {'sender': email['sender'], 'term': word}, 1
Esempio n. 13
0
 def mapper1(self, key, email):
     """Emit a count of 1 keyed by {'sender', 'term'} per term occurrence.

     key: unused.
     email: dict providing the 'text' and 'sender' entries.
     """
     # Per the original note, get_terms gives back a list of terms.
     for word in get_terms(email['text']):
         sender_term = {'sender': email['sender'], 'term': word}
         yield sender_term, 1
Esempio n. 14
0
import re
import sys
from collections import defaultdict
from mrjob.protocol import JSONValueProtocol
from term_tools import get_terms

# Tally how often each term occurs across all records of the file named by
# the first command-line argument, then print "term count" for each term.
words = defaultdict(int)

# The with-block closes the file when reading finishes; the handle is named
# email_file (not `input`) so the builtin is not shadowed.
with open(sys.argv[1]) as email_file:
    for line in email_file:
        # read() returns a (key, value) pair; the value is the email dict.
        email = JSONValueProtocol.read(line)[1]
        for term in get_terms(email['text']):
            words[term] += 1

for word, count in words.items():
    # Single-argument form prints "word count" on Python 2 and 3 alike.
    print("%s %s" % (word, count))
Esempio n. 15
0
 def mapper(self, key, email):
     """Emit (term, 1) for every term occurrence in the email text.

     key: unused.
     email: dict whose 'text' entry is passed to ``get_terms``.
     """
     # Per the original note, get_terms gives back a list of terms.
     occurrences = get_terms(email['text'])
     for word in occurrences:
         yield word, 1
Esempio n. 16
0
 def mapper(self, key, email):
     """Emit ({'term', 'sender'}, 1) for every term occurrence in an email.

     key: unused.
     email: dict providing the 'text' and 'sender' entries.
     """
     for word in get_terms(email['text']):
         term_sender = {'term': word, 'sender': email['sender']}
         yield term_sender, 1
Esempio n. 17
0
 def mapper(self, key, email):
     """Emit (term, 1) for each term occurrence; repeats are not collapsed.

     key: unused.
     email: dict whose 'text' entry is passed to ``get_terms``.
     """
     body_terms = get_terms(email['text'])
     for body_term in body_terms:
         yield body_term, 1
Esempio n. 18
0
File: tfidf.py Progetto: fsosa/lab6
#--- Disambiguation ---#
# Approach:
# - Get the distinct sender addresses from the data set.
# - Group them by the first character of the local part of the address
#   (the text before the '@').
# - Build an address -> master dictionary, where the master is the single
#   address under which variants of the same sender are consolidated
#   (e.g. two spellings of one person's address share one master address).
# - Broadcast the dictionary so every node can use it.
unique_emails = json_corpus.map(lambda x: x['sender']).distinct()
lastnames = (unique_emails
             .map(lambda x: (x, x.split("@")[0]))
             .groupBy(lambda x: x[1][0], 500))
# consolidate_emails (defined elsewhere) is expected to yield
# (master, [addresses]) records; invert them into address -> master pairs.
consolidated = (lastnames
                .flatMap(consolidate_emails)
                .flatMap(lambda x: map(lambda y: (y, x[0]), x[1]))
                .collectAsMap())
email_to_master = sc.broadcast(consolidated)

#----- Actual TF-IDF Calculation -----#
# NOTE(review): presumably the number of emails in the corpus -- confirm.
TOTAL_DOCS = 516893.0

# Per-term IDF. Each term is counted once per email (document frequency);
# counting every occurrence instead would inflate the denominator and could
# push the logarithm below zero for very common terms.
term_counts = (json_corpus
               .flatMap(lambda x: set(get_terms(x['text'])))
               .map(lambda y: (y, 1))
               .reduceByKey(add))
per_term_idf = term_counts.map(lambda x: (x[0], math.log(TOTAL_DOCS / x[1]))).cache()

# Sender/term pairs, grouped by term over 500 partitions.
grouped_sender_term_pairs = json_corpus.flatMap(sender_term_pairs).groupBy(lambda x: x['term'], 500)

# Sender-term frequency. NOTE: this rebinds the name sender_tf from the
# helper function passed to flatMap to the resulting RDD.
sender_tf = grouped_sender_term_pairs.flatMap(sender_tf).cache()

# A joined record looks like:
#   (u'talk', ((u'<sender>', 3), 12.056978880153091))
# i.e. (term, ((sender, term frequency), idf)).
tfidf = sender_tf.join(per_term_idf, 500).map(
    lambda x: {'sender': x[1][0][0], 'term': x[0], 'tf-idf': x[1][0][1] * x[1][1]})

output = tfidf.collect()
for x in output:
  # Single-argument form behaves the same on Python 2 and 3.
  print(x)
Esempio n. 19
0
 def mapper(self, key, email):
     """Emit per-occurrence term statistics keyed by (term, message id).

     For every term occurrence yields
     ((term, email['mid']), (1, number of terms in the email, sender)).

     key: unused.
     email: dict providing the 'text', 'mid' and 'sender' entries.
     """
     words = get_terms(email['text'])
     for word in words:
         out_key = (word, email['mid'])
         out_value = (1, len(words), email['sender'])
         yield out_key, out_value
Esempio n. 20
0
 def mapper(self, key, email):
     """Emit (term, 1) once for each distinct term in the email text.

     key: unused.
     email: dict whose "text" entry is passed to ``get_terms``.
     """
     # The set collapses repeats so each term is emitted once per email.
     for unique_term in set(get_terms(email["text"])):
         yield unique_term, 1
 def mapper(self, key, email):
     """Emit ((term, mid), (1, term count of the email, sender)) per occurrence.

     key: unused.
     email: dict providing the 'text', 'mid' and 'sender' entries.
     """
     tokens = get_terms(email['text'])
     for token in tokens:
         term_and_mid = (token, email['mid'])
         stats = (1, len(tokens), email['sender'])
         yield term_and_mid, stats