Example #1
0
 def reducer_init(self):
     """Load the term -> idf lookup table from every file in DIRECTORY.

     Each line of each file is decoded with JSONValueProtocol; the decoded
     value is indexed by its 'term' and 'idf' keys and the pair is stored
     in ``self.idfs``.
     """
     self.idfs = {}
     for fname in os.listdir(DIRECTORY):
         # The context manager closes each handle as soon as the file has
         # been consumed, instead of leaking one open file per entry.
         with open(os.path.join(DIRECTORY, fname)) as idf_file:
             for line in idf_file:
                 # read() yields a (key, value) pair; only the value is used.
                 term_idf = JSONValueProtocol.read(line)[1]
                 self.idfs[term_idf['term']] = term_idf['idf']
Example #2
0
 def reducer_init(self):
     """Build ``self.idfs`` from the JSON-lines files found in DIRECTORY.

     Every line is decoded with JSONValueProtocol, and the decoded value's
     'term' entry becomes a key of ``self.idfs`` mapped to its 'idf' entry.
     """
     self.idfs = {}
     for fname in os.listdir(DIRECTORY):
         path = os.path.join(DIRECTORY, fname)
         # Open with `with` so the file is closed even if a line fails to
         # parse; the original left every handle open.
         with open(path) as handle:
             for line in handle:
                 # Index 1 of the decoded (key, value) pair is the value.
                 term_idf = JSONValueProtocol.read(line)[1]
                 self.idfs[term_idf['term']] = term_idf['idf']
Example #3
0
def parse_file(filename):
    """Print the number of occurrences of each term found in *filename*.

    Every line of the file is decoded with JSONValueProtocol; the decoded
    value's 'text' entry is passed to get_terms and each term it yields is
    counted. One "<term> <count>" line is printed per distinct term.
    """
    words = defaultdict(int)

    # Named `email_file` rather than `input` so the builtin is not shadowed.
    with open(filename) as email_file:
        for line in email_file:
            email = JSONValueProtocol.read(line)[1]
            for term in get_terms(email['text']):
                words[term] += 1

    # Report once the file has been closed. The single pre-formatted argument
    # makes this line produce the same output under Python 2 and Python 3.
    for word, count in words.items():
        print("%s %s" % (word, count))
Example #4
0
    def reducer_init(self):
        """Fill ``self.idfs`` from the JSON-lines objects stored under the
        S3 location named by the ``idf_loc`` option."""
        self.idfs = table = {}
        opts = self.options

        # Hand credentials to the runner only when both the key id and the
        # secret were provided; otherwise build it with no arguments.
        credentials = {}
        if opts.aws_access_key_id and opts.aws_secret_access_key:
            credentials = {
                'aws_access_key_id': opts.aws_access_key_id,
                'aws_secret_access_key': opts.aws_secret_access_key,
            }
        runner = EMRJobRunner(**credentials)

        for s3_key in runner.get_s3_keys("s3://" + opts.idf_loc):
            # Fetch the complete object before splitting it into lines:
            # reading it in chunks may not break on line boundaries.
            contents = s3_key.get_contents_as_string()
            for raw_line in StringIO(contents):
                # Only the value half of the decoded (key, value) pair is used.
                record = JSONValueProtocol.read(raw_line)[1]
                table[record['term']] = record['idf']
Example #5
0
    def reducer_init(self):
        """Read every object under ``s3://<idf_loc>`` and record each
        decoded line's 'term' -> 'idf' pair in ``self.idfs``."""
        self.idfs = {}

        key_id = self.options.aws_access_key_id
        secret = key_id and self.options.aws_secret_access_key
        if not secret:
            # One or both credentials missing: build the runner without any.
            emr = EMRJobRunner()
        else:
            emr = EMRJobRunner(aws_access_key_id=key_id,
                               aws_secret_access_key=secret)

        bucket_url = "s3://" + self.options.idf_loc
        for key in emr.get_s3_keys(bucket_url):
            # The object is downloaded in full and then iterated line by
            # line, because chunked reads may split a line in two.
            whole_file = StringIO(key.get_contents_as_string())
            for line in whole_file:
                parsed = JSONValueProtocol.read(line)
                term_idf = parsed[1]  # value half of the (key, value) pair
                self.idfs[term_idf['term']] = term_idf['idf']
Example #6
0
 def test_numerical_keys_become_strs(self):
     """A numeric dict key comes back as a string after write + read."""
     encoded = JSONValueProtocol.write(None, {3: 4})
     decoded = JSONValueProtocol.read(encoded)
     # The key 3 is expected back as '3'; the value 4 is unchanged.
     self.assertEqual((None, {'3': 4}), decoded)
Example #7
0
 def test_tuples_become_lists(self):
     """A tuple value comes back as a list after write + read."""
     encoded = JSONValueProtocol.write(None, (3, 4))
     decoded = JSONValueProtocol.read(encoded)
     # The tuple (3, 4) is expected back as the list [3, 4].
     self.assertEqual((None, [3, 4]), decoded)
Example #8
0
    def test_uses_json_format(self):
        """Reading and writing go through the literal JSON text form."""
        plain = {'foo': {'bar': 3}, 'baz': None, 'quz': ['a', 1]}
        as_json = '{"foo": {"bar": 3}, "baz": null, "quz": ["a", 1]}'

        # Decoding the text yields a None key paired with the structure.
        decoded = JSONValueProtocol.read(as_json)
        self.assertEqual((None, plain), decoded)

        # NOTE(review): this compares the exact string, so it presumably
        # relies on the encoder emitting keys in this order -- confirm.
        written = JSONValueProtocol.write(None, plain)
        self.assertEqual(as_json, written)
Example #9
0
 def test_numerical_keys_become_strs(self):
     """Round-tripping {3: 4} is expected to give back {'3': 4}."""
     expected = (None, {'3': 4})
     round_tripped = JSONValueProtocol.read(
         JSONValueProtocol.write(None, {3: 4}))
     # Only the key changes type; the value stays a number.
     self.assertEqual(expected, round_tripped)
Example #10
0
 def test_tuples_become_lists(self):
     """Round-tripping the tuple (3, 4) is expected to give back [3, 4]."""
     expected = (None, [3, 4])
     round_tripped = JSONValueProtocol.read(
         JSONValueProtocol.write(None, (3, 4)))
     # The sequence contents are preserved; only the container type changes.
     self.assertEqual(expected, round_tripped)
Example #11
0
    def test_uses_json_format(self):
        """The protocol's wire form is the JSON text of the value."""
        structure = {'foo': {'bar': 3}, 'baz': None, 'quz': ['a', 1]}
        text = '{"foo": {"bar": 3}, "baz": null, "quz": ["a", 1]}'

        # read(): JSON text -> (None, value)
        self.assertEqual((None, structure), JSONValueProtocol.read(text))

        # write(): (None, value) -> JSON text.
        # NOTE(review): exact-string match; presumably depends on the order
        # in which the encoder emits the keys -- confirm.
        self.assertEqual(text, JSONValueProtocol.write(None, structure))
Example #12
0
import re
import sys
from collections import defaultdict
from mrjob.protocol import JSONValueProtocol
from term_tools import get_terms

# Count how often each term occurs across the emails in the file named by the
# first command-line argument, then print one "<term> <count>" line per term.
words = defaultdict(int)

# The context manager closes the file once it has been read (the original
# never closed it), and the handle no longer shadows the builtin `input`.
with open(sys.argv[1]) as email_file:
    for line in email_file:
        # Only the value half of the decoded (key, value) pair is used.
        email = JSONValueProtocol.read(line)[1]
        for term in get_terms(email['text']):
            words[term] += 1

# A single pre-formatted argument prints identically on Python 2 and 3.
for word, count in words.items():
    print("%s %s" % (word, count))