def sender_term_pairs(email):
    sender = email['sender']
    master_email = sender
    if sender in email_to_master.value:
        master_email = email_to_master.value[sender]
    return map(lambda x: {'sender': master_email, 'term': x}, get_terms(email['text']))
def parse_file(filename):
    words = defaultdict(lambda: 0)
    with open(filename) as input:
        for line in input:
            email = JSONValueProtocol.read(line)[1]
            for term in get_terms(email['text']):
                words[term] += 1
    for word, count in words.items():
        print word, count
def mapper(self, key, email):
    # Only iterate over the unique terms to avoid double-counting
    terms = get_terms(email['text'])
    unique_terms = set(terms)
    for term in unique_terms:
        yield term, 1
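# A minimal sketch of the matching reducer (assumed, not shown in the source):
# because the mapper above emits each term at most once per email, summing the
# 1s gives the term's document frequency. Uses mrjob's standard reducer signature.
def reducer(self, term, counts):
    yield term, sum(counts)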
def mapper(self, key, email):
    for term in get_terms(email['text']):
        yield (email['sender'], term), 1
def mapper(self, key, email):
    terms = set(get_terms(email['text']))
    for term in terms:
        yield term, 1
def map_counts(self, key, email):
    # First let's count terms and authors
    for term in get_terms(email['text']):
        yield {'term': term, 'author': email['sender']}, 1
def mapper(self, key, email):
    for term in set(get_terms(email['text'])):
        yield term, 1
def mapper(self, key, email): for term in get_terms(email["text"]): yield {"term": term, "sender": email["sender"]}, 1
def term_sender_pairs(email):
    sender = email['sender']
    master_email = sender  # default
    if sender in send_to_master.value:
        master_email = send_to_master.value[sender]
    return map(lambda x: {'term': x, 'sender': master_email}, get_terms(email['text']))
### Disambiguate ###
# Get a list of unique sender emails from the corpus
unique_senders = json_lay.map(lambda x: x['sender']).distinct()
# Group by the first letter of names (where name is the stuff before the @ symbol)
sorted_names = unique_senders.map(lambda x: (x, x.split('@')[0])).groupBy(lambda x: x[1][0], 500)
# Create a master dictionary of consolidated emails. Example '*****@*****.**' => ['*****@*****.**', '*****@*****.**']
combined = sorted_names.flatMap(consolidate_emails).flatMap(lambda x: map(lambda y: (y, x[0]), x[1])).collectAsMap()
# Broadcast to all nodes
send_to_master = sc.broadcast(combined)

### Calculate per-term IDF ###
term_count = json_lay.flatMap(lambda email: get_terms(email['text'])).map(lambda term: (term, 1)).reduceByKey(add)
per_term_idf = term_count.map(lambda term: (term[0], math.log(516893.0 / term[1]))).cache()

### Get term/sender pairs ###
term_sender_pairing = json_lay.flatMap(term_sender_pairs).groupBy(lambda x: x['term'], 500)

### Find sender-term freq ###
sender_tf = term_sender_pairing.flatMap(sender_term_freq).cache()

def tfidf_map(sender_tf):
    (term, sender_count_freq) = sender_tf
    (sender_count, freq) = sender_count_freq
    (sender, count) = sender_count
    return {'sender': sender, 'term': term, 'tf-idf': count * freq}

### Find TF-IDF ###
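# The block above stops at the "Find TF-IDF" header. A minimal sketch of that
# final step (assumed, not from the source), mirroring the join used in the
# other Spark variant below: joining sender_tf with per_term_idf on the term
# key yields exactly the (term, ((sender, count), idf)) records that tfidf_map
# destructures.
tfidf = sender_tf.join(per_term_idf, 500).map(tfidf_map)
for record in tfidf.collect():
    print record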
def mapper1(self, key, email):
    for term in get_terms(email['text']):  # terms is a list of terms
        yield {'sender': email['sender'], 'term': term}, 1
import re
import sys
from collections import defaultdict
from mrjob.protocol import JSONValueProtocol
from term_tools import get_terms

input = open(sys.argv[1])
words = defaultdict(lambda: 0)
for line in input:
    email = JSONValueProtocol.read(line)[1]
    for term in get_terms(email['text']):
        words[term] += 1

for word, count in words.items():
    print word, count
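# Usage sketch (assumed, not from the source): if the script above is saved as
# wordcount.py (hypothetical name) and the corpus is a file with one
# JSON-encoded email per line, as JSONValueProtocol expects, it runs under
# Python 2 like this:
#
#   python wordcount.py emails.json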
def mapper(self, key, email):
    for term in get_terms(email['text']):  # terms is a list of terms
        yield term, 1
def mapper(self, key, email):
    for term in get_terms(email['text']):
        yield {'term': term, 'sender': email['sender']}, 1
def mapper(self, key, email):
    for term in get_terms(email['text']):
        yield term, 1
#--- Disambiguation ---#
# Approach:
# - Get a list of distinct emails from the data set
# - Sort them into groups based on the first letter of the name part of the email (i.e. before '@')
# - Create a dictionary of email to master, where master is the email that we will consolidate our searches over,
#   e.g. [email protected] and [email protected] have as their master email [email protected]
# - Broadcast to all nodes so that they know about it
unique_emails = json_corpus.map(lambda x: x['sender']).distinct()
lastnames = unique_emails.map(lambda x: (x, x.split("@")[0])).groupBy(lambda x: x[1][0], 500)
consolidated = lastnames.flatMap(consolidate_emails).flatMap(lambda x: map(lambda y: (y, x[0]), x[1])).collectAsMap()
email_to_master = sc.broadcast(consolidated)

#----- Actual TF-IDF Calculation -----#
# Calculate per-term IDF
term_counts = json_corpus.flatMap(lambda x: get_terms(x['text'])).map(lambda y: (y, 1)).reduceByKey(add)
per_term_idf = term_counts.map(lambda x: (x[0], math.log(516893.0 / x[1]))).cache()

# Get sender/term pairs
grouped_sender_term_pairs = json_corpus.flatMap(sender_term_pairs).groupBy(lambda x: x['term'], 500)

# Calculate sender-term frequency
sender_tf = grouped_sender_term_pairs.flatMap(sender_tf).cache()

# e.g. join: (u'talk', ((u'*****@*****.**', 3), 12.056978880153091))
tfidf = sender_tf.join(per_term_idf, 500).map(lambda x: {'sender': x[1][0][0], 'term': x[0], 'tf-idf': x[1][0][1] * x[1][1]})

output = tfidf.collect()
for x in output:
    print x
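# The helper sender_tf (which the assignment above rebinds to the resulting
# RDD) is referenced but not shown. A plausible sketch, inferred from the join
# example in the comment above (everything here is an assumption, not the
# source's implementation): each grouped element is (term, iterable of
# {'term', 'sender'} dicts), and the output must be (term, (sender, count))
# pairs for the join on term to produce (term, ((sender, count), idf)).
from collections import defaultdict

def sender_tf(term_group):
    (term, pairs) = term_group
    counts = defaultdict(int)
    for pair in pairs:
        counts[pair['sender']] += 1
    return [(term, (sender, count)) for sender, count in counts.items()]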
def mapper(self, key, email):
    email_terms = get_terms(email['text'])
    for term in email_terms:
        yield (term, email['mid']), (1, len(email_terms), email['sender'])
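# A minimal sketch of a reducer to pair with the mapper above (assumed, not
# from the source): for each (term, message-id) key it sums the 1s to get the
# term's count in that message, then divides by the message length (carried in
# every value) to get the term frequency. All values for a key share the same
# total_terms and sender, so the first value supplies both.
def reducer(self, term_mid, values):
    values = list(values)
    count = sum(v[0] for v in values)
    total_terms = values[0][1]
    sender = values[0][2]
    yield term_mid, (float(count) / total_terms, sender)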
def mapper(self, key, email): terms = set(get_terms(email["text"])) for term in terms: yield term, 1