Exemple #1
0
 def test_numerical_keys_become_strs(self):
     """A numeric dict key is read back as a string after write + read."""
     reader = JSONValueProtocol()
     encoded = JSONValueProtocol().write(None, {3: 4})
     decoded = reader.read(encoded)

     # JSON object keys are always strings, so 3 must come back as '3'
     expected = (None, {'3': 4})
     self.assertEqual(expected, decoded)
Exemple #2
0
    def test_bad_keys_and_values(self):
        """Each value below must be rejected when written with a None key."""
        unencodable_values = [
            # dictionaries have to have strings as keys
            {(1, 2): 3},
            # only unicodes (or bytes in utf-8) are allowed
            '\xe9',
            # sets don't exist in JSON
            set(),
            # Point class has no representation in JSON
            Point(1, 4),
        ]

        for bad_value in unencodable_values:
            self.assertCantEncode(JSONValueProtocol(), None, bad_value)
Exemple #3
0
 def reducer_init(self):
     """Fill self.idfs with term -> idf entries read from DIRECTORY.

     Every file in DIRECTORY is read line by line. Each line is decoded
     with JSONValueProtocol and the decoded value is indexed by its
     'term' and 'idf' keys.
     """
     self.idfs = {}
     for fname in os.listdir(DIRECTORY):
         # Use a context manager so each handle is closed as soon as the
         # file has been consumed, even if a line fails to decode.
         with open(os.path.join(DIRECTORY, fname)) as idf_file:
             for line in idf_file:
                 # read() yields a (key, value) pair; only the value is used
                 term_idf = JSONValueProtocol.read(line)[1]
                 self.idfs[term_idf['term']] = term_idf['idf']
 def reducer_init(self):
     """Fill self.idfs with term -> idf entries read from DIRECTORY.

     Every file in DIRECTORY is read line by line. Each line is decoded
     with a JSONValueProtocol instance and the decoded value is indexed
     by its 'term' and 'idf' keys.
     """
     self.idfs = {}
     # One protocol instance is enough for every line of every file
     protocol = JSONValueProtocol()
     for fname in os.listdir(DIRECTORY):
         # Use a context manager so each handle is closed as soon as the
         # file has been consumed, even if a line fails to decode.
         with open(os.path.join(DIRECTORY, fname)) as idf_file:
             for line in idf_file:
                 # read() yields a (key, value) pair; only the value is used
                 term_idf = protocol.read(line)[1]
                 self.idfs[term_idf['term']] = term_idf['idf']
Exemple #5
0
def encode_document(doc_id, text):
    """Return `doc_id` and `text` encoded as one JSON dictionary.

    The result is meant to be read by MRNgramIDFUtility. `doc_id` is
    intended to be a business/product/entity ID rather than the ID of an
    individual review.
    """
    # NOTE: `text` is deliberately not passed through unicode() here,
    # because some amazon reviews won't encode.
    document = {'doc_id': doc_id, 'text': text}
    return JSONValueProtocol.write(None, document)
def root_to_json(root_dir, output_file):
    """Write every email found by EmailWalker(root_dir) to `output_file`.

    Each email is written as one line encoded with JSONValueProtocol.
    The email's 'date' entry is replaced in place by its str() form
    before encoding.

    Args:
    root_dir -- directory handed to EmailWalker
    output_file -- path of the file to (over)write
    """
    walker = EmailWalker(root_dir)

    # The context manager closes the output even when walking or
    # encoding raises part-way through.
    with open(output_file, "w") as output:
        for email in walker:
            email['date'] = str(email['date'])
            output.write(JSONValueProtocol.write(None, email) + '\n')
def root_to_json(root_dir, output_file):
    """Dump the emails yielded by EmailWalker(root_dir) into `output_file`.

    One line is written per email, encoded with JSONValueProtocol. Each
    email's 'date' entry is overwritten with its str() form first.

    Args:
    root_dir -- directory handed to EmailWalker
    output_file -- path of the file to (over)write
    """
    walker = EmailWalker(root_dir)

    # `with` guarantees the file is flushed and closed even if the walker
    # or the encoder raises in the middle of the loop.
    with open(output_file, "w") as output:
        for email in walker:
            email['date'] = str(email['date'])
            line = JSONValueProtocol.write(None, email) + '\n'
            output.write(line)
Exemple #8
0
def parse_file(filename):
    """Print "<term> <count>" for each term found in `filename`.

    Each line of the file is decoded with JSONValueProtocol. The terms
    come from applying get_terms() to the decoded value's 'text' entry.
    """
    term_counts = defaultdict(int)

    with open(filename) as email_lines:
        for raw_line in email_lines:
            decoded = JSONValueProtocol.read(raw_line)
            email = decoded[1]
            for term in get_terms(email['text']):
                term_counts[term] += 1

        for term, total in term_counts.items():
            # A single pre-formatted argument prints the same text as
            # the statement form `print term, total`.
            print("%s %s" % (term, total))
Exemple #9
0
    def data(self, minimum=1, **kw):
        """Run MRWordFreqJSON over TEXT and collect the qualifying output.

        Args:
        minimum -- only rows whose value, as an int, is >= int(minimum)
            are kept
        **kw -- accepted but not used

        Returns a dict whose 'data' entry is a list of [key, value] lists.
        """
        job = MRWordFreqJSON()
        job.stdin = [JSONValueProtocol().write(None, text_line)
                     for text_line in TEXT]

        rows = []
        with job.make_runner() as runner:
            runner.run()
            parsed = (job.parse_output_line(out_line)
                      for out_line in runner.stream_output())
            rows.extend([key, value] for key, value in parsed
                        if int(value) >= int(minimum))

        return {'data': rows}
def encode_document(text, cats=None, id=None):
    """Encode a document as a JSON line so that MRTextClassifier can read it.

    Args:
    text -- the text of the document. A unicode is used as-is; anything
        else is decoded with unicode(text, errors="ignore"), which drops
        bytes that cannot be decoded.
    cats -- a dictionary mapping a category name (e.g. 'sports') to True if
        the document is in the category, and False if it's not. None
        indicates that we have no information about this document's
        categories, and is encoded as an empty dictionary.
    id -- a unique ID for the document (any kind of JSON-able value should
        work). It is stored under 'docid' unchanged; this function does
        not generate an ID when it is None.

    Returns the encoded document followed by a newline.
    """
    # unicode(<unicode>, errors=...) raises "TypeError: decoding Unicode is
    # not supported", so only decode values that are not already unicode.
    if not isinstance(text, unicode):
        text = unicode(text, errors="ignore")
    cats = dict((unicode(cat), bool(is_in_cat))
                for cat, is_in_cat in (cats or {}).iteritems())

    document = {"document": text, "cats": cats, "docid": id, "type": "document"}
    return JSONValueProtocol.write(None, document) + "\n"
    def reducer_init(self):
        """Fill self.idfs with term -> idf entries read from S3.

        The keys under "s3://" + self.options.idf_loc are listed through
        an EMRJobRunner. The runner is given the AWS credentials from
        self.options when both are set, and no arguments otherwise.
        """
        self.idfs = {}

        opts = self.options
        if opts.aws_access_key_id and opts.aws_secret_access_key:
            credentials = {
                'aws_access_key_id': opts.aws_access_key_id,
                'aws_secret_access_key': opts.aws_secret_access_key,
            }
        else:
            credentials = {}
        emr = EMRJobRunner(**credentials)

        for key in emr.get_s3_keys("s3://" + self.options.idf_loc):
            # Load the whole file first, then read it line-by-line: otherwise,
            # chunks may not be even lines
            contents = key.get_contents_as_string()
            for line in StringIO(contents):
                # read() yields a (key, value) pair; only the value is used
                decoded = JSONValueProtocol.read(line)
                term_idf = decoded[1]
                self.idfs[term_idf['term']] = term_idf['idf']
    def reducer_init(self):
        """Build the self.idfs lookup (term -> idf) from files stored on S3.

        An EMRJobRunner lists the keys under "s3://" + self.options.idf_loc.
        It is created with the credentials from self.options only when
        both the access key id and the secret access key are set.
        """
        self.idfs = {}

        opts = self.options
        if not opts.aws_access_key_id or not opts.aws_secret_access_key:
            emr = EMRJobRunner()
        else:
            emr = EMRJobRunner(
                aws_access_key_id=opts.aws_access_key_id,
                aws_secret_access_key=opts.aws_secret_access_key)

        bucket_location = "s3://" + self.options.idf_loc
        for key in emr.get_s3_keys(bucket_location):
            # Load the whole file first, then read it line-by-line: otherwise,
            # chunks may not be even lines
            whole_file = StringIO(key.get_contents_as_string())
            for line in whole_file:
                # parse the line as a JSON object; index 1 is the value
                term_idf = JSONValueProtocol.read(line)[1]
                self.idfs[term_idf['term']] = term_idf['idf']
Exemple #13
0
def encode_document(text, cats=None, id=None):
    """Encode a document as a JSON line so that MRTextClassifier can read it.

    Args:
    text -- the text of the document; it is converted with unicode()
    cats -- a dictionary mapping a category name (e.g. 'sports') to True if
        the document is in the category, and False if it's not. None
        indicates that we have no information about this document's
        categories, and is encoded as an empty dictionary.
    id -- a unique ID for the document (any kind of JSON-able value should
        work); it is stored under 'id' exactly as given.

    Returns the encoded document followed by a newline.
    """
    text = unicode(text)

    # Normalise category names to unicode and membership flags to bool
    categories = {}
    for cat, is_in_cat in (cats or {}).iteritems():
        categories[unicode(cat)] = bool(is_in_cat)

    document = {'text': text, 'cats': categories, 'id': id}
    return JSONValueProtocol.write(None, document) + '\n'
 def test_numerical_keys_become_strs(self):
     """A numeric dict key is read back as a string after write + read."""
     encoded = JSONValueProtocol.write(None, {3: 4})
     decoded = JSONValueProtocol.read(encoded)

     # JSON object keys are always strings, so 3 must come back as '3'
     self.assertEqual((None, {'3': 4}), decoded)
 def test_tuples_become_lists(self):
     """A tuple value is read back as a list after write + read."""
     encoded = JSONValueProtocol.write(None, (3, 4))
     decoded = JSONValueProtocol.read(encoded)

     # JSON has no tuple type, so (3, 4) must come back as [3, 4]
     self.assertEqual((None, [3, 4]), decoded)
    def test_uses_json_format(self):
        """read() and write() must map between a value and its JSON text."""
        value = {'foo': {'bar': 3}, 'baz': None, 'quz': ['a', 1]}
        encoded = '{"foo": {"bar": 3}, "baz": null, "quz": ["a", 1]}'

        # decoding direction
        decoded = JSONValueProtocol.read(encoded)
        self.assertEqual((None, value), decoded)

        # encoding direction
        written = JSONValueProtocol.write(None, value)
        self.assertEqual(encoded, written)
Exemple #17
0
# S3 locations used for intermediate job data. The "*_relative" values are
# the same paths without the "s3://joeloren/" bucket prefix.
#s3_input_path = "s3://joeloren//iceval_out//input//datasets//"
tmp_dir_out = "s3://joeloren/interim_out/"
tmp_dir_in = "s3://joeloren/interim_in/"
tmp_dir_in_relative = "interim_in/"
tmp_dir_out_relative = "interim_out/"

from mrjob.protocol import JSONValueProtocol, JSONProtocol
jvp = JSONValueProtocol()
jp = JSONProtocol()

from boto.s3.connection import S3Connection
import sys

# SECURITY: an AWS access key and secret used to be hard-coded here. Secrets
# must never live in source control; the old key pair should be treated as
# compromised and rotated. With no arguments, boto looks the credentials up
# itself (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables or
# the boto config file).
c = S3Connection()

bucket = c.get_bucket("joeloren")
datasets_bucket = c.get_bucket('joel_datasets')
Exemple #18
0
 def test_tuples_become_lists(self):
     """Writing a tuple and reading it back must produce a list."""
     original = (3, 4)
     round_tripped = JSONValueProtocol.read(
         JSONValueProtocol.write(None, original))

     # JSON only has arrays, so the tuple comes back as a list
     expected = (None, [3, 4])
     self.assertEqual(expected, round_tripped)
Exemple #19
0
    def test_uses_json_format(self):
        """The value must be stored as plain JSON text, in both directions."""
        plain = {'foo': {'bar': 3}, 'baz': None, 'quz': ['a', 1]}
        as_json = '{"foo": {"bar": 3}, "baz": null, "quz": ["a", 1]}'

        # JSON text -> (None, value)
        self.assertEqual((None, plain),
                         JSONValueProtocol.read(as_json))
        # value -> JSON text
        self.assertEqual(as_json,
                         JSONValueProtocol.write(None, plain))
Exemple #20
0
    def test_uses_json_format(self):
        """A value must map to and from its JSON form as a bytes literal."""
        value = {'foo': 'bar'}
        encoded = b'{"foo": "bar"}'

        # decoding direction
        decoded = JSONValueProtocol().read(encoded)
        self.assertEqual((None, value), decoded)

        # encoding direction
        written = JSONValueProtocol().write(None, value)
        self.assertEqual(encoded, written)
Exemple #21
0
 def test_bad_data(self):
     """Input that is not valid JSON must fail to decode."""
     malformed = '{@#$@#!^&*$%^'
     protocol = JSONValueProtocol()
     self.assertCantDecode(protocol, malformed)
Exemple #22
0
 def test_round_trip_with_trailing_tab(self):
     """Run the trailing-tab round-trip check on each sample value.

     Only the value half of each JSON_KEYS_AND_VALUES pair is used; the
     key passed to the check is always None.
     """
     for _unused_key, value in JSON_KEYS_AND_VALUES:
         protocol = JSONValueProtocol()
         self.assertRoundTripWithTrailingTabOK(protocol, None, value)
Exemple #23
0
 def test_round_trip(self):
     """Run the round-trip check on each sample value.

     Only the value half of each JSON_KEYS_AND_VALUES pair is used; the
     key passed to the check is always None.
     """
     for _unused_key, value in JSON_KEYS_AND_VALUES:
         protocol = JSONValueProtocol()
         self.assertRoundTripOK(protocol, None, value)
import re
import sys
from collections import defaultdict
from mrjob.protocol import JSONValueProtocol
from term_tools import get_terms

# Count how often each term occurs in the 'text' entry of the JSON-encoded
# emails stored one per line in the file named by the first command-line
# argument, then print "<term> <count>" for every term.
words = defaultdict(int)

# The context manager closes the file once it is read; the handle is no
# longer bound to a global called `input`, which shadowed the builtin.
with open(sys.argv[1]) as email_lines:
    for line in email_lines:
        # read() yields a (key, value) pair; only the value is used
        email = JSONValueProtocol.read(line)[1]
        for term in get_terms(email['text']):
            words[term] += 1

for word, count in words.items():
    # A single pre-formatted argument prints the same text whether `print`
    # is a statement or a function.
    print("%s %s" % (word, count))