Ejemplo n.º 1
0
    def test_uses_json_format(self):
        # The wire format is the JSON-encoded key, a tab, then the
        # JSON-encoded value, all as a single bytes object.
        key = ['a', 1]
        value = {'foo': 'bar'}
        encoded = b'["a", 1]\t{"foo": "bar"}'

        # Decoding the encoded line gives back the (key, value) pair ...
        decoded = JSONProtocol().read(encoded)
        self.assertEqual((key, value), decoded)

        # ... and encoding the pair reproduces the line exactly.
        written = JSONProtocol().write(key, value)
        self.assertEqual(encoded, written)
Ejemplo n.º 2
0
    def test_uses_json_format(self):
        # Nested objects and None must map onto nested JSON objects and
        # null, with key and value separated by a single tab.
        key = ['a', 1]
        value = {'foo': {'bar': 3}, 'baz': None}
        encoded = '["a", 1]\t{"foo": {"bar": 3}, "baz": null}'

        read_back = JSONProtocol().read(encoded)
        self.assertEqual((key, value), read_back)

        serialized = JSONProtocol().write(key, value)
        self.assertEqual(encoded, serialized)
Ejemplo n.º 3
0
 def test_numerical_keys_become_strs(self):
     # JSON object keys are always strings, so numeric dict keys are
     # expected to come back as their string form after a round trip.
     encoded = JSONProtocol().write({1: 2}, {3: 4})
     round_tripped = JSONProtocol().read(encoded)

     expected = ({'1': 2}, {'3': 4})
     self.assertEqual(expected, round_tripped)
Ejemplo n.º 4
0
    def test_bad_keys_and_values(self):
        # Every (key, value) pair listed here contains something that has
        # no JSON representation, so encoding it is expected to fail.
        unencodable_pairs = [
            # dictionaries have to have strings as keys
            ({(1, 2): 3}, None),
            # only unicodes (or bytes in utf-8) are allowed
            ('0\xa2', '\xe9'),
            # sets don't exist in JSON
            (set([1]), set()),
            # Point class has no representation in JSON
            (Point(2, 3), Point(1, 4)),
        ]

        for bad_key, bad_value in unencodable_pairs:
            self.assertCantEncode(JSONProtocol(), bad_key, bad_value)
    def test_encode(self):
        # Smoke test: an encoded LinearRegression instance must be
        # writable with JSONProtocol. The serialized line is printed for
        # manual inspection; nothing about its content is asserted.
        linRegFactory = LinearRegressionFactory(11)
        linReg = linRegFactory.get_instance()
        encoded = linRegFactory.encode(linReg)

        protocol = JSONProtocol()
        # Parenthesised single-argument print is valid on both Python 2
        # and Python 3; the bare print statement is a SyntaxError on 3.
        print(protocol.write(0, encoded))
def encode_node(node_id, links=None, score=1):
    """Serialize one graph node as a newline-terminated JSONProtocol line.

    :param node_id: key written in front of the node data.
    :param links: optional mapping; when non-empty its items are stored,
                  sorted, under ``'links'``. ``None`` or an empty mapping
                  leaves the ``'links'`` entry out entirely.
    :param score: value stored under ``'score'`` (defaults to 1).
    :return: ``JSONProtocol().write(node_id, node)`` followed by a newline
             of the same string type as the encoded line.
    """
    node = {}
    if links:
        # sorted() keeps the output deterministic regardless of dict order
        node['links'] = sorted(links.items())
    node['score'] = score

    encoded = JSONProtocol().write(node_id, node)
    # write() may return bytes (Python 3) or str (Python 2, where bytes is
    # str); pick the matching newline so the concatenation never mixes
    # text and bytes.
    newline = b'\n' if isinstance(encoded, bytes) else '\n'
    return encoded + newline
Ejemplo n.º 7
0
    def parse_output(self, protocol=None):
        """.. deprecated:: 0.4.2

        Decode everything the sandboxed job has written to ``self.stdout``.

        Intended only for exercising individual mappers/reducers in tests
        without going through a runner; ordinary code should call
        :py:meth:`runner.stream_output()
        <mrjob.runner.MRJobRunner.stream_output()>` instead.

        :type protocol: protocol
        :param protocol: protocol instance whose ``read()`` decodes each
                         output line. ``JSONProtocol()`` is used when this
                         is ``None``.
        :return: a list holding ``protocol.read(line)`` for every line of
                 the captured output, in order.
        :raises AssertionError: if ``self.stdout`` equals ``sys.stdout``.
        """
        if self.stdout == sys.stdout:
            raise AssertionError('You must call sandbox() first;'
                                 ' parse_output() is for testing only.')

        log.warning(
            'parse_output() is deprecated and will be removed in v0.5.0')

        reader = JSONProtocol() if protocol is None else protocol

        # Re-wrap the captured text so it can be iterated line by line.
        captured = StringIO(self.stdout.getvalue())

        decoded = []
        for line in captured:
            decoded.append(reader.read(line))
        return decoded
Ejemplo n.º 8
0
 def test_encode(self):
     '''
     Check that an encoded network can be written with JSONProtocol
     (used as mrjob internal protocol).
     '''
     factory = PredictionNNFactory([3, 2, 1])
     network = factory.get_instance()
     encoded_network = factory.encode(network)

     # Writing must simply not raise; the serialized output is not inspected.
     JSONProtocol().write("test_decode", encoded_network)
    def test_decode(self):
        # Send an encoded LinearRegression through a JSONProtocol
        # write/read cycle, then check the factory can decode the result.
        factory = LinearRegressionFactory(11)
        encoded_model = factory.encode(factory.get_instance())

        protocol = JSONProtocol()
        key_and_value = protocol.read(protocol.write(0, encoded_model))

        # Only the value half (index 1) of what read() returned is decoded.
        decoded_models = factory.decode([key_and_value[1]])
        assert type(decoded_models) == list, "decoded not as a list"
        assert type(decoded_models[0]) == LinearRegression, \
            "decoded not as LinearRegression"
Ejemplo n.º 10
0
 def test_decode(self):
     '''
     Test whether an encoded algorithm can be decoded again after a
     round trip through JSONProtocol (used as mrjob internal protocol).
     '''
     layerSizes = [3, 2, 1]
     nnFactory = PredictionNNFactory(layerSizes)
     nn = nnFactory.get_instance()
     # encode
     obj_encoded = nnFactory.encode(nn)
     # serialize with the JSON protocol and read the line back
     protocol = JSONProtocol()
     json_encoded = protocol.write("test_decode", obj_encoded)
     obj_encoded = protocol.read(json_encoded)

     # only the value half (index 1) of what read() returned is decoded
     nnArr = nnFactory.decode([obj_encoded[1]])
     assert type(nnArr) == list, "decoded not as a list"
     # the failure message names the class that is actually checked
     assert type(nnArr[0]) == MultilayerPerceptron, \
         "decoded not as MultilayerPerceptron"
Ejemplo n.º 11
0
#s3_input_path = "s3://joeloren//iceval_out//input//datasets//"
# S3 locations used for intermediate job data, as full URIs and as
# bucket-relative prefixes.
tmp_dir_out = "s3://joeloren/interim_out/"
tmp_dir_in = "s3://joeloren/interim_in/"
tmp_dir_in_relative = "interim_in/"
tmp_dir_out_relative = "interim_out/"

from mrjob.protocol import JSONValueProtocol, JSONProtocol
# Module-level protocol instances.
jvp = JSONValueProtocol()
jp = JSONProtocol()

from boto.s3.connection import S3Connection
import sys

# SECURITY: AWS credentials must never be embedded in source code. With no
# explicit keys, boto resolves them from the AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY environment variables or from its config file.
# Any key pair that was ever committed to this file should be treated as
# compromised and rotated.
c = S3Connection()

bucket = c.get_bucket("joeloren")
datasets_bucket = c.get_bucket('joel_datasets')
Ejemplo n.º 12
0
 def input_protocol(self):
     """Choose the input protocol from ``self.options.job_to_run``.

     Returns a ``JSONProtocol`` when the job to run is ``'stats'`` and a
     ``RawValueProtocol`` for every other job, logging the choice at
     debug level.
     """
     if self.options.job_to_run == 'stats':
         LOG.debug('Reading JSON input from count job')
         return JSONProtocol()

     LOG.debug('Reading text input from cdx files')
     return RawValueProtocol()
Ejemplo n.º 13
0
    ["Ut", "pulvinar", "lectus", "quis", "feugiat", "adipiscing"],
    ["Nunc", "vulputate", "mauris", "congue", "diam", "ultrices", "aliquet"],
    ["Nulla", "pharetra", "laoreet", "est", "quis", "vestibulum"],
    ["Quisque", "feugiat", "pharetra", "sagittis"],
    ["Phasellus", "nulla", "massa", "sodales", "a", "suscipit", "blandit", "facilisis", "eu", "augue"],
    ["Cras", "mi", "massa", "ullamcorper", "nec", "tristique", "at", "convallis", "quis", "eros"],
    ["Mauris", "non", "fermentum", "lacus", "vitae", "tristique", "tellus"],
    ["In", "volutpat", "metus", "augue", "nec", "laoreet", "ante", "hendrerit", "vitae"],
    ["Vivamus", "id", "lacus", "nec", "orci", "tristique", "vulputate"]
]

logging.basicConfig(level=logging.INFO)

mr_job = MRWordCounter()
# JSONProtocol needs a key as well as a value, so every entry of TEXT is
# keyed by its index. (JSONValueProtocol would not need a key:
# JSONValueProtocol().write(None, line).)
mr_job.stdin = [JSONProtocol().write(linenum, line) for linenum, line in enumerate(TEXT)]

# Collect the job output as a key -> value mapping.
result = {}
with mr_job.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        key, value = mr_job.parse_output_line(line)
        result[key] = value

# Print the output in JSON. The parenthesised single-argument form is valid
# on both Python 2 and Python 3.
print(json.dumps(result))
Ejemplo n.º 14
0
 def test_round_trip(self):
     # Run the round-trip check for every sample pair in
     # JSON_KEYS_AND_VALUES, using a fresh protocol instance each time.
     for key, value in JSON_KEYS_AND_VALUES:
         self.assertRoundTripOK(JSONProtocol(), key, value)
Ejemplo n.º 15
0
    NUMBER_RE = re.compile(r"[-?\d']+")
    input_file = 'sample_input.txt'

    # Split every line into whitespace-separated fields. Blank lines yield
    # no fields and are dropped, since they carry no node id.
    with open(input_file, 'r') as in_file:
        data = [fields
                for fields in (row.split() for row in in_file.read().splitlines())
                if fields]

    # Map each source node id to its adjacency list.
    nodes = {}
    for fields in data:
        node_id = int(fields[0])
        if len(fields) == 1:
            # Dangling node: stored with an empty adjacency list.
            # NOTE(review): this also clears edges collected from earlier
            # lines for the same node -- confirm that is intended.
            nodes[node_id] = []
        else:
            # Only the first neighbour on a line is recorded.
            nodes.setdefault(node_id, []).append(int(fields[1]))

    unique_node_count = len(nodes)
    # 1.0 forces true division: on Python 2, ``1 / n`` is integer division
    # and would give every node a pagerank of 0. The guard avoids a
    # ZeroDivisionError on empty input, where nothing is written anyway.
    initial_pagerank = 1.0 / unique_node_count if unique_node_count else 0.0

    j = JSONProtocol()

    # Binary mode: each encoded record is followed by a newline byte.
    with open("preprocessed_" + input_file, "wb+") as out_file:
        for _id, adj in nodes.items():
            out_file.write(j.write(_id, (adj, initial_pagerank)))
            out_file.write(b'\n')
Ejemplo n.º 16
0
 def test_bad_data(self):
     # Input that is not valid JSON is expected to be undecodable.
     malformed = '{@#$@#!^&*$%^'
     self.assertCantDecode(JSONProtocol(), malformed)
Ejemplo n.º 17
0
 def INTERNAL_PROTOCOL(self):
     """Return a new ``JSONProtocol`` instance."""
     protocol = JSONProtocol()
     return protocol
Ejemplo n.º 18
0
 def test_round_trip_with_trailing_tab(self):
     # Run the trailing-tab round-trip check for every sample pair in
     # JSON_KEYS_AND_VALUES, using a fresh protocol instance each time.
     for key, value in JSON_KEYS_AND_VALUES:
         self.assertRoundTripWithTrailingTabOK(JSONProtocol(), key, value)
Ejemplo n.º 19
0
 def INPUT_PROTOCOL(self):
     """Return a new ``JSONProtocol`` instance."""
     protocol = JSONProtocol()
     return protocol
Ejemplo n.º 20
0
 def test_tuples_become_lists(self):
     # JSON has no tuple type, so tuples are expected to come back from a
     # write/read round trip as lists.
     encoded = JSONProtocol().write((1, 2), (3, 4))
     round_tripped = JSONProtocol().read(encoded)

     self.assertEqual(([1, 2], [3, 4]), round_tripped)