Python JSONValueProtocol Exemples, mrjob.protocol.JSONValueProtocol Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : test_protocol.py Projet : zwd199032/mrjob

 def test_numerical_keys_become_strs(self):
     """Numeric dict keys come back as strings after a write/read round trip."""
     encoded = JSONValueProtocol().write(None, {3: 4})
     decoded = JSONValueProtocol().read(encoded)
     expected = (None, {'3': 4})
     self.assertEqual(expected, decoded)

Exemple #2

0

Afficher le fichier

Fichier : test_protocol.py Projet : zwd199032/mrjob

    def test_bad_keys_and_values(self):
        """Each value below must be rejected when written as a JSON value."""
        unencodable = [
            # dictionaries have to have strings as keys
            {(1, 2): 3},
            # only unicodes (or bytes in utf-8) are allowed
            '\xe9',
            # sets don't exist in JSON
            set(),
            # Point class has no representation in JSON
            Point(1, 4),
        ]
        for bad_value in unencodable:
            self.assertCantEncode(JSONValueProtocol(), None, bad_value)

Exemple #3

0

Afficher le fichier

Fichier : mr_wc.py Projet : abhishek-ch/evolveML

 def reducer_init(self):
     """Populate self.idfs from the JSON-lines files found in DIRECTORY.

     Every line of every file is decoded with JSONValueProtocol; the decoded
     value's 'term' entry becomes the key and its 'idf' entry the value.
     """
     self.idfs = {}
     # Calling read() through an instance works whether the protocol
     # exposes it as a classmethod or as a regular method.
     protocol = JSONValueProtocol()
     for fname in os.listdir(DIRECTORY):
         # The with-block closes each file, even if decoding a line raises.
         with open(os.path.join(DIRECTORY, fname)) as idf_file:
             for line in idf_file:
                 # read() yields a (key, value) pair; only the value is used.
                 term_idf = protocol.read(line)[1]
                 self.idfs[term_idf['term']] = term_idf['idf']

Exemple #4

0

Afficher le fichier

Fichier : mr_tfidf_per_sender.py Projet : zhang-yd15/Data-Mining-Python

 def reducer_init(self):
     """Populate self.idfs from the JSON-lines files found in DIRECTORY.

     Every line of every file is decoded with JSONValueProtocol; the decoded
     value's 'term' entry becomes the key and its 'idf' entry the value.
     """
     self.idfs = {}
     # One protocol instance is enough; there is no need to build one per line.
     protocol = JSONValueProtocol()
     for fname in os.listdir(DIRECTORY):
         # The with-block closes each file, even if decoding a line raises.
         with open(os.path.join(DIRECTORY, fname)) as idf_file:
             for line in idf_file:
                 # read() yields a (key, value) pair; only the value is used.
                 term_idf = protocol.read(line)[1]
                 self.idfs[term_idf['term']] = term_idf['idf']

Exemple #5

0

Afficher le fichier

Fichier : mr_ngram_idf.py Projet : harlandu/thumbsup

def encode_document(doc_id, text):
    """Serialize a document as the JSON dictionary MRNgramIDFUtility reads.

    `doc_id` is meant to identify a business/product/entity, not a single
    review.
    """
    # text is intentionally not coerced to unicode: some amazon reviews
    # won't encode.
    payload = {'doc_id': doc_id, 'text': text}
    return JSONValueProtocol.write(None, payload)

Exemple #6

0

Afficher le fichier

Fichier : emails_to_json.py Projet : Ericbaba/MIT_dataiap

def root_to_json(root_dir, output_file):
    """Write every email yielded by EmailWalker(root_dir) as one JSON line.

    Args:
    root_dir -- directory handed to EmailWalker
    output_file -- path of the file to (over)write with the encoded emails
    """
    walker = EmailWalker(root_dir)

    # The with-block closes the output file even if walking or encoding an
    # email raises part-way through.
    with open(output_file, "w") as output:
        for email in walker:
            # The date is replaced by its string form before encoding
            # (presumably because the raw value is not JSON-serializable).
            email['date'] = str(email['date'])
            output.write(JSONValueProtocol.write(None, email) + '\n')

Exemple #7

0

Afficher le fichier

Fichier : emails_to_json.py Projet : DevasenaInupakutika/MIT_OCW_DATASCIENCE

def root_to_json(root_dir, output_file):
    """Write every email yielded by EmailWalker(root_dir) as one JSON line.

    Args:
    root_dir -- directory handed to EmailWalker
    output_file -- path of the file to (over)write with the encoded emails
    """
    walker = EmailWalker(root_dir)

    # The with-block closes the output file even if walking or encoding an
    # email raises part-way through.
    with open(output_file, "w") as output:
        for email in walker:
            # The date is replaced by its string form before encoding
            # (presumably because the raw value is not JSON-serializable).
            email['date'] = str(email['date'])
            output.write(JSONValueProtocol.write(None, email) + '\n')

Exemple #8

0

Afficher le fichier

def parse_file(filename):
    """Count the terms in a JSON-lines email file and print one total per term.

    Each line of `filename` is decoded with JSONValueProtocol; the terms
    returned by get_terms() for the decoded value's 'text' entry are tallied,
    then every "term count" pair is printed on its own line.
    """
    words = defaultdict(int)

    with open(filename) as email_file:
        for line in email_file:
            # read() yields a (key, value) pair; only the value is used.
            email = JSONValueProtocol.read(line)[1]
            for term in get_terms(email['text']):
                words[term] += 1

    # A single pre-formatted argument prints "term count" on both Python 2
    # and Python 3; the bare `print a, b` statement is a SyntaxError on 3.
    for word, count in words.items():
        print("%s %s" % (word, count))

Exemple #9

0

Afficher le fichier

    def data(self, minimum=1, **kw):
        """Run MRWordFreqJSON over TEXT and return the sufficiently frequent rows.

        Every entry of TEXT is encoded with JSONValueProtocol and fed to the
        job as stdin. Output pairs whose value, as an int, is at least
        `minimum` are collected as [key, value] lists.

        Returns a dict with the collected rows under the 'data' key.
        """
        job = MRWordFreqJSON()
        job.stdin = [JSONValueProtocol().write(None, entry) for entry in TEXT]

        rows = []
        with job.make_runner() as runner:
            runner.run()
            for output_line in runner.stream_output():
                key, value = job.parse_output_line(output_line)
                if int(value) < int(minimum):
                    continue
                rows.append([key, value])

        return {'data': rows}

Exemple #10

0

Afficher le fichier

Fichier : encode_emails.py Projet : TanayGahlot/HowDataAffectDecision

def encode_document(text, cats=None, id=None):
    """Serialize a document as one JSON line for MRTextClassifier.

    Args:
    text -- the document text; converted to unicode, ignoring decode errors
    cats -- a dictionary mapping a category name (e.g. 'sports') to a truthy
        value if the document is in the category and a falsy one if not.
        None is treated as an empty mapping.
    id -- an ID for the document; stored as given under the "docid" key.

    Returns the encoded record followed by a newline.
    """
    text = unicode(text, errors="ignore")

    normalized_cats = {}
    for cat, is_in_cat in (cats or {}).iteritems():
        normalized_cats[unicode(cat)] = bool(is_in_cat)

    record = {"document": text, "cats": normalized_cats, "docid": id, "type": "document"}
    return JSONValueProtocol.write(None, record) + "\n"

Exemple #11

0

Afficher le fichier

Fichier : mr_tfidf_per_sender_aws.py Projet : myw/dataiap

    def reducer_init(self):
        """Fill self.idfs from the JSON-lines objects under the idf_loc S3 path."""
        self.idfs = {}

        # Use the user-supplied credentials only when both halves are present;
        # otherwise let EMRJobRunner fall back to its own defaults.
        have_credentials = (self.options.aws_access_key_id
                            and self.options.aws_secret_access_key)
        if not have_credentials:
            emr = EMRJobRunner()
        else:
            emr = EMRJobRunner(aws_access_key_id=self.options.aws_access_key_id,
                               aws_secret_access_key=self.options.aws_secret_access_key)

        idf_uri = "s3://" + self.options.idf_loc
        for key in emr.get_s3_keys(idf_uri):
            # Fetch the complete object before splitting it into lines:
            # otherwise, chunks may not be even lines
            contents = StringIO(key.get_contents_as_string())
            for line in contents:
                # read() yields a (key, value) pair; only the value is used.
                _, term_idf = JSONValueProtocol.read(line)[:2]
                self.idfs[term_idf['term']] = term_idf['idf']

Exemple #12

0

Afficher le fichier

Fichier : mr_tfidf_per_sender_aws.py Projet : myw/dataiap

    def reducer_init(self):
        """Fill self.idfs from the JSON-lines objects under the idf_loc S3 path."""
        self.idfs = {}

        # Pass explicit credentials only when both were provided by the user;
        # otherwise build the runner with no arguments.
        runner_kwargs = {}
        if self.options.aws_access_key_id and self.options.aws_secret_access_key:
            runner_kwargs = {
                'aws_access_key_id': self.options.aws_access_key_id,
                'aws_secret_access_key': self.options.aws_secret_access_key,
            }
        emr = EMRJobRunner(**runner_kwargs)

        for key in emr.get_s3_keys("s3://" + self.options.idf_loc):
            # Load the whole file first, then read it line-by-line: otherwise,
            # chunks may not be even lines
            whole_file = key.get_contents_as_string()
            for line in StringIO(whole_file):
                decoded = JSONValueProtocol.read(line)
                term_idf = decoded[1]  # the value half of the (key, value) pair
                self.idfs[term_idf['term']] = term_idf['idf']

Exemple #13

0

Afficher le fichier

def encode_document(text, cats=None, id=None):
    """Serialize a document as one JSON line for MRTextClassifier.

    Args:
    text -- the document text; converted with unicode()
    cats -- a dictionary mapping a category name (e.g. 'sports') to a truthy
        value if the document is in the category and a falsy one if not.
        None is treated as an empty mapping.
    id -- an ID for the document; stored as given under the 'id' key.

    Returns the encoded record followed by a newline.
    """
    text = unicode(text)

    normalized_cats = {}
    for cat, is_in_cat in (cats or {}).iteritems():
        normalized_cats[unicode(cat)] = bool(is_in_cat)

    record = {'text': text, 'cats': normalized_cats, 'id': id}
    return JSONValueProtocol.write(None, record) + '\n'

Exemple #14

0

Afficher le fichier

Fichier : test_protocol.py Projet : leafsummer/keeplearning

 def test_numerical_keys_become_strs(self):
     """Numeric dict keys come back as strings after a write/read round trip."""
     encoded = JSONValueProtocol.write(None, {3: 4})
     decoded = JSONValueProtocol.read(encoded)
     self.assertEqual((None, {'3': 4}), decoded)

Exemple #15

0

Afficher le fichier

Fichier : test_protocol.py Projet : leafsummer/keeplearning

 def test_tuples_become_lists(self):
     """A tuple value comes back as a list after a write/read round trip."""
     encoded = JSONValueProtocol.write(None, (3, 4))
     decoded = JSONValueProtocol.read(encoded)
     self.assertEqual((None, [3, 4]), decoded)

Exemple #16

0

Afficher le fichier

Fichier : test_protocol.py Projet : leafsummer/keeplearning

    def test_uses_json_format(self):
        """Reading and writing both use the plain JSON text of the value."""
        value = {'foo': {'bar': 3}, 'baz': None, 'quz': ['a', 1]}
        encoded = '{"foo": {"bar": 3}, "baz": null, "quz": ["a", 1]}'

        decoded = JSONValueProtocol.read(encoded)
        self.assertEqual((None, value), decoded)

        written = JSONValueProtocol.write(None, value)
        self.assertEqual(encoded, written)

Exemple #17

0

Afficher le fichier

#s3_input_path = "s3://joeloren//iceval_out//input//datasets//"
# S3 locations (absolute URIs and bucket-relative prefixes) for interim data.
tmp_dir_out = "s3://joeloren/interim_out/"
tmp_dir_in = "s3://joeloren/interim_in/"
tmp_dir_in_relative = "interim_in/"
tmp_dir_out_relative = "interim_out/"

from mrjob.protocol import JSONValueProtocol, JSONProtocol
# Shared protocol instances for encoding/decoding job lines.
jvp = JSONValueProtocol()
jp = JSONProtocol()

from boto.s3.connection import S3Connection
import sys

# SECURITY: AWS credentials must never be embedded in source code. Called
# without arguments, boto resolves them from the environment
# (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY) or from its config file.
# The key pair that used to be hard-coded here has been exposed and must be
# revoked and rotated.
c = S3Connection()

bucket = c.get_bucket("joeloren")
datasets_bucket = c.get_bucket('joel_datasets')

Exemple #18

0

Afficher le fichier

 def test_tuples_become_lists(self):
     """A tuple value comes back as a list after a write/read round trip."""
     encoded = JSONValueProtocol.write(None, (3, 4))
     decoded = JSONValueProtocol.read(encoded)
     self.assertEqual((None, [3, 4]), decoded)

Exemple #19

0

Afficher le fichier

    def test_uses_json_format(self):
        """Reading and writing both use the plain JSON text of the value."""
        value = {'foo': {'bar': 3}, 'baz': None, 'quz': ['a', 1]}
        encoded = '{"foo": {"bar": 3}, "baz": null, "quz": ["a", 1]}'

        decoded = JSONValueProtocol.read(encoded)
        self.assertEqual((None, value), decoded)

        written = JSONValueProtocol.write(None, value)
        self.assertEqual(encoded, written)

Exemple #20

0

Afficher le fichier

    def test_uses_json_format(self):
        """Reading and writing both use the JSON bytes of the value."""
        value = {'foo': 'bar'}
        encoded = b'{"foo": "bar"}'

        decoded = JSONValueProtocol().read(encoded)
        self.assertEqual((None, value), decoded)

        written = JSONValueProtocol().write(None, value)
        self.assertEqual(encoded, written)

Exemple #21

0

Afficher le fichier

Fichier : test_protocol.py Projet : zwd199032/mrjob

 def test_bad_data(self):
     """Input that is not valid JSON must fail to decode."""
     garbage = '{@#$@#!^&*$%^'
     self.assertCantDecode(JSONValueProtocol(), garbage)

Exemple #22

0

Afficher le fichier

Fichier : test_protocol.py Projet : zwd199032/mrjob

 def test_round_trip_with_trailing_tab(self):
     """Every sample value passes the trailing-tab round-trip assertion."""
     for _key, sample_value in JSON_KEYS_AND_VALUES:
         protocol = JSONValueProtocol()
         self.assertRoundTripWithTrailingTabOK(protocol, None, sample_value)

Exemple #23

0

Afficher le fichier

Fichier : test_protocol.py Projet : zwd199032/mrjob

 def test_round_trip(self):
     """Every sample value passes the round-trip assertion."""
     for _key, sample_value in JSON_KEYS_AND_VALUES:
         protocol = JSONValueProtocol()
         self.assertRoundTripOK(protocol, None, sample_value)

Exemple #24

0

Afficher le fichier

Fichier : simple_wordcount.py Projet : Ericbaba/MIT_dataiap

import re
import sys
from collections import defaultdict
from mrjob.protocol import JSONValueProtocol
from term_tools import get_terms

# Tally how often each term appears across the JSON-lines email file named by
# the first command-line argument, then print one "term count" line per term.
words = defaultdict(int)

# The with-block closes the file once counting is done (the original never
# closed it). The module-level name `input` is kept as-is in case later code
# refers to it, even though it shadows the builtin.
with open(sys.argv[1]) as input:
    for line in input:
        # read() yields a (key, value) pair; only the value is used.
        email = JSONValueProtocol.read(line)[1]
        for term in get_terms(email['text']):
            words[term] += 1

# A single pre-formatted argument prints "term count" on both Python 2 and
# Python 3; the bare `print a, b` statement is a SyntaxError on Python 3.
for word, count in words.items():
    print("%s %s" % (word, count))