def sender_term_pairs(email):
    sender = email['sender']
    master_email = sender
    if sender in email_to_master.value:
        master_email = email_to_master.value[sender]
    return map(lambda x: {'sender': master_email, 'term': x}, get_terms(email['text']))
def parse_file(filename):
    words = defaultdict(lambda: 0)
    with open(filename) as input:
        for line in input:
            email = JSONValueProtocol.read(line)[1]
            for term in get_terms(email['text']):
                words[term] += 1
    for word, count in words.items():
        print word, count
def mapper(self, key, email):
    # Only iterate over the unique terms to avoid double-counting
    terms = get_terms(email['text'])
    unique_terms = set(terms)
    for term in unique_terms:
        yield term, 1
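# A minimal sketch of the matching reducer (assumed, not shown in the source):
# because the mapper above emits each term at most once per email, summing the
# 1s gives the term's document frequency. Uses mrjob's standard reducer signature.
def reducer(self, term, counts):
    yield term, sum(counts)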
def mapper(self, key, email):
    for term in get_terms(email['text']):
        yield (email['sender'], term), 1
def mapper(self, key, email):
    terms = set(get_terms(email['text']))
    for term in terms:
        yield term, 1
def map_counts(self, key, email):
    # First let's count terms and authors
    for term in get_terms(email['text']):
        yield {'term': term, 'author': email['sender']}, 1
def mapper(self, key, email):
    for term in set(get_terms(email['text'])):
        yield term, 1
def mapper(self, key, email): for term in get_terms(email["text"]): yield {"term": term, "sender": email["sender"]}, 1
def term_sender_pairs(email):
    sender = email['sender']
    master_email = sender  # default
    if sender in send_to_master.value:
        master_email = send_to_master.value[sender]
    return map(lambda x: {'term': x, 'sender': master_email}, get_terms(email['text']))
### Disambiguate ###
# Get a list of unique sender emails from the corpus
unique_senders = json_lay.map(lambda x: x['sender']).distinct()
# Group by the first letter of names (where name is the stuff before the @ symbol)
sorted_names = unique_senders.map(lambda x: (x, x.split('@')[0])).groupBy(lambda x: x[1][0], 500)
# Create a master dictionary of consolidated emails. Example '*****@*****.**' => ['*****@*****.**', '*****@*****.**']
combined = sorted_names.flatMap(consolidate_emails).flatMap(lambda x: map(lambda y: (y, x[0]), x[1])).collectAsMap()
# Broadcast to all nodes
send_to_master = sc.broadcast(combined)

### Calculate per-term IDF ###
term_count = json_lay.flatMap(lambda email: get_terms(email['text'])).map(lambda term: (term, 1)).reduceByKey(add)
per_term_idf = term_count.map(lambda term: (term[0], math.log(516893.0 / term[1]))).cache()

### Get term/sender pairs ###
term_sender_pairing = json_lay.flatMap(term_sender_pairs).groupBy(lambda x: x['term'], 500)

### Find sender-term freq ###
sender_tf = term_sender_pairing.flatMap(sender_term_freq).cache()

def tfidf_map(sender_tf):
    (term, sender_count_freq) = sender_tf
    (sender_count, freq) = sender_count_freq
    (sender, count) = sender_count
    return {'sender': sender, 'term': term, 'tf-idf': count * freq}

### Find TF-IDF ###
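# The block above stops at the "Find TF-IDF" header. A minimal sketch of that
# final step (assumed, not from the source), mirroring the join used in the
# other Spark variant below: joining sender_tf with per_term_idf on the term
# key yields exactly the (term, ((sender, count), idf)) records that tfidf_map
# destructures.
tfidf = sender_tf.join(per_term_idf, 500).map(tfidf_map)
for record in tfidf.collect():
    print record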
def mapper1(self, key, email):
    for term in get_terms(email['text']):  # terms is a list of terms
        yield {'sender': email['sender'], 'term': term}, 1
import re
import sys
from collections import defaultdict
from mrjob.protocol import JSONValueProtocol
from term_tools import get_terms

input = open(sys.argv[1])
words = defaultdict(lambda: 0)
for line in input:
    email = JSONValueProtocol.read(line)[1]
    for term in get_terms(email['text']):
        words[term] += 1

for word, count in words.items():
    print word, count
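# Usage sketch (assumed, not from the source): if the script above is saved as
# wordcount.py (hypothetical name) and the corpus is a file with one
# JSON-encoded email per line, as JSONValueProtocol expects, it runs under
# Python 2 like this:
#
#   python wordcount.py emails.json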
def mapper(self, key, email):
    for term in get_terms(email['text']):  # terms is a list of terms
        yield term, 1
def mapper(self, key, email):
    for term in get_terms(email['text']):
        yield {'term': term, 'sender': email['sender']}, 1
def mapper(self, key, email):
    for term in get_terms(email['text']):
        yield term, 1
#--- Disambiguation ---#
# Approach:
# - Get a list of distinct emails from the data set
# - Sort them into groups based on the first letter of the name part of the email (i.e. before '@')
# - Create a dictionary of email to master, where master is the email that we will consolidate our searches over,
#   e.g. [email protected] and [email protected] have as their master email [email protected]
# - Broadcast to all nodes so that they know about it
unique_emails = json_corpus.map(lambda x: x['sender']).distinct()
lastnames = unique_emails.map(lambda x: (x, x.split("@")[0])).groupBy(lambda x: x[1][0], 500)
consolidated = lastnames.flatMap(consolidate_emails).flatMap(lambda x: map(lambda y: (y, x[0]), x[1])).collectAsMap()
email_to_master = sc.broadcast(consolidated)

#----- Actual TF-IDF Calculation -----#
# Calculate per-term IDF
term_counts = json_corpus.flatMap(lambda x: get_terms(x['text'])).map(lambda y: (y, 1)).reduceByKey(add)
per_term_idf = term_counts.map(lambda x: (x[0], math.log(516893.0 / x[1]))).cache()

# Get sender/term pairs
grouped_sender_term_pairs = json_corpus.flatMap(sender_term_pairs).groupBy(lambda x: x['term'], 500)

# Calculate sender-term frequency
sender_tf = grouped_sender_term_pairs.flatMap(sender_tf).cache()

# e.g. join: (u'talk', ((u'*****@*****.**', 3), 12.056978880153091))
tfidf = sender_tf.join(per_term_idf, 500).map(lambda x: {'sender': x[1][0][0], 'term': x[0], 'tf-idf': x[1][0][1] * x[1][1]})

output = tfidf.collect()
for x in output:
    print x
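# The helper sender_tf (which the assignment above rebinds to the resulting
# RDD) is referenced but not shown. A plausible sketch, inferred from the join
# example in the comment above (everything here is an assumption, not the
# source's implementation): each grouped element is (term, iterable of
# {'term', 'sender'} dicts), and the output must be (term, (sender, count))
# pairs for the join on term to produce (term, ((sender, count), idf)).
from collections import defaultdict

def sender_tf(term_group):
    (term, pairs) = term_group
    counts = defaultdict(int)
    for pair in pairs:
        counts[pair['sender']] += 1
    return [(term, (sender, count)) for sender, count in counts.items()]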
def mapper(self, key, email):
    email_terms = get_terms(email['text'])
    for term in email_terms:
        yield (term, email['mid']), (1, len(email_terms), email['sender'])
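# A minimal sketch of a reducer to pair with the mapper above (assumed, not
# from the source): for each (term, message-id) key it sums the 1s to get the
# term's count in that message, then divides by the message length (carried in
# every value) to get the term frequency. All values for a key share the same
# total_terms and sender, so the first value supplies both.
def reducer(self, term_mid, values):
    values = list(values)
    count = sum(v[0] for v in values)
    total_terms = values[0][1]
    sender = values[0][2]
    yield term_mid, (float(count) / total_terms, sender)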
def mapper(self, key, email): terms = set(get_terms(email["text"])) for term in terms: yield term, 1