Example #1
0
 def test_array_stream(self):
     """Stream JSON arrays of strings, including nested arrays, and check
     that elements are yielded in document order."""
     # Empty array yields nothing.
     arr = stream_array(tokenize(StringIO('[]')))
     self.assertListEqual(list(arr), [])
     # Flat array of strings streams each string in order.
     arr = stream_array(tokenize(StringIO('["People", "Places", "Things"]')))
     self.assertListEqual(list(arr), ["People", "Places", "Things"])
     # Nested array at the tail is yielded as a single list element.
     arr = stream_array(tokenize(StringIO('["Apples", "Bananas", ["Pears", "Limes"]]')))
     self.assertListEqual(list(arr), ["Apples", "Bananas", ["Pears", "Limes"]])
     # Nested array in the middle keeps its position.
     arr = stream_array(tokenize(StringIO('["Apples", ["Pears", "Limes"], "Bananas"]')))
     self.assertListEqual(list(arr), ["Apples", ["Pears", "Limes"], "Bananas"])
Example #2
0
 def test_array_stream(self):
     """Stream JSON arrays of strings, nested arrays, and embedded objects,
     checking that elements are yielded in document order."""
     # Empty array yields nothing.
     arr = stream_array(tokenize(StringIO('[]')))
     self.assertListEqual(list(arr), [])
     # Flat array of strings streams each string in order.
     arr = stream_array(tokenize(StringIO('["People", "Places", "Things"]')))
     self.assertListEqual(list(arr), ["People", "Places", "Things"])
     # Nested array at the tail is yielded as a single list element.
     arr = stream_array(tokenize(StringIO('["Apples", "Bananas", ["Pears", "Limes"]]')))
     self.assertListEqual(list(arr), ["Apples", "Bananas", ["Pears", "Limes"]])
     # Nested array in the middle keeps its position.
     arr = stream_array(tokenize(StringIO('["Apples", ["Pears", "Limes"], "Bananas"]')))
     self.assertListEqual(list(arr), ["Apples", ["Pears", "Limes"], "Bananas"])
     # An embedded object is yielded as a dict element.
     arr = stream_array(tokenize(StringIO('["Apples", {"key":"value"}, "Bananas"]')))
     self.assertListEqual(list(arr), ["Apples", {"key": "value"}, "Bananas"])
Example #3
0
    def test_sequence(self):
        """Tokenize complete JSON documents and check the exact token stream.

        Tokens are (type, value) pairs.  The same stream must be produced
        with and without whitespace between tokens, and two adjacent
        literals with no separator must raise ValueError.
        """
        result = list(tokenize(StringIO("123 \"abc\":{}")))
        self.assertEqual(result, [(2, 123), (1, 'abc'), (0, ':'), (0, '{'), (0, '}')])

        # Borrowed from http://en.wikipedia.org/wiki/JSON
        big_file = """{
          "firstName": "John",
          "lastName": "Smith",
          "isAlive": true,
          "age": 25,
          "height_cm": 167.6,
          "address": {
            "streetAddress": "21 2nd Street",
            "city": "New York",
            "state": "NY",
            "postalCode": "10021-3100"
          },
          "phoneNumbers": [
            {
              "type": "home",
              "number": "212 555-1234"
            },
            {
              "type": "office",
              "number": "646 555-4567"
            }
          ],
          "children": [],
          "spouse": null
        }"""
        result = list(tokenize(StringIO(big_file)))
        expected = [(0, '{'), (1, 'firstName'), (0, ':'), (1, 'John'), (0, ','), (1, 'lastName'), (0, ':'),
                    (1, 'Smith'), (0, ','), (1, 'isAlive'), (0, ':'), (3, True), (0, ','), (1, 'age'), (0, ':'),
                    (2, 25), (0, ','), (1, 'height_cm'), (0, ':'), (2, 167.6), (0, ','), (1, 'address'), (0, ':'),
                    (0, '{'), (1, 'streetAddress'), (0, ':'), (1, '21 2nd Street'), (0, ','), (1, 'city'), (0, ':'),
                    (1, 'New York'), (0, ','), (1, 'state'), (0, ':'), (1, 'NY'), (0, ','), (1, 'postalCode'),
                    (0, ':'), (1, '10021-3100'), (0, '}'), (0, ','), (1, 'phoneNumbers'), (0, ':'), (0, '['), (0, '{'),
                    (1, 'type'), (0, ':'), (1, 'home'), (0, ','), (1, 'number'), (0, ':'), (1, '212 555-1234'),
                    (0, '}'), (0, ','), (0, '{'), (1, 'type'), (0, ':'), (1, 'office'), (0, ','), (1, 'number'),
                    (0, ':'), (1, '646 555-4567'), (0, '}'), (0, ']'), (0, ','), (1, 'children'), (0, ':'), (0, '['),
                    (0, ']'), (0, ','), (1, 'spouse'), (0, ':'), (4, None), (0, '}')]
        self.assertListEqual(result, expected)
        # Whitespace between tokens must not change the token stream.
        big_file_no_space = '{"firstName":"John","lastName":"Smith","isAlive":true,"age":25,"height_cm":167.6,"addres' \
                            's":{"streetAddress":"21 2nd Street","city":"New York","state":"NY","postalCode":"10021-3' \
                            '100"},"phoneNumbers":[{"type":"home","number":"212 555-1234"},{"type":"office","number":' \
                            '"646 555-4567"}],"children":[],"spouse":null}'
        result = list(tokenize(StringIO(big_file_no_space)))
        self.assertListEqual(result, expected)
        result = list(tokenize(StringIO("854.6,123")))
        self.assertEqual(result, [(2, 854.6), (0, ','), (2, 123)])
        # Adjacent literals with no separator are malformed JSON.
        self.assertRaises(ValueError, self.tokenize_sequence, "123\"text\"")
        self.assertRaises(ValueError, self.tokenize_sequence, "23.9e10true")
        self.assertRaises(ValueError, self.tokenize_sequence, "\"test\"56")
Example #4
0
    def test_sequence(self):
        """Tokenize complete JSON documents and check the exact token stream.

        Tokens are (type, value) pairs.  The same stream must be produced
        with and without whitespace between tokens, and two adjacent
        literals with no separator must raise ValueError.
        """
        result = list(tokenize(StringIO("123 \"abc\":{}")))
        self.assertEqual(result, [(2, 123), (1, 'abc'), (0, ':'), (0, '{'), (0, '}')])

        # Borrowed from http://en.wikipedia.org/wiki/JSON
        big_file = """{
          "firstName": "John",
          "lastName": "Smith",
          "isAlive": true,
          "age": 25,
          "height_cm": 167.6,
          "address": {
            "streetAddress": "21 2nd Street",
            "city": "New York",
            "state": "NY",
            "postalCode": "10021-3100"
          },
          "phoneNumbers": [
            {
              "type": "home",
              "number": "212 555-1234"
            },
            {
              "type": "office",
              "number": "646 555-4567"
            }
          ],
          "children": [],
          "spouse": null
        }"""
        result = list(tokenize(StringIO(big_file)))
        expected = [(0, '{'), (1, 'firstName'), (0, ':'), (1, 'John'), (0, ','), (1, 'lastName'), (0, ':'),
                    (1, 'Smith'), (0, ','), (1, 'isAlive'), (0, ':'), (3, True), (0, ','), (1, 'age'), (0, ':'),
                    (2, 25), (0, ','), (1, 'height_cm'), (0, ':'), (2, 167.6), (0, ','), (1, 'address'), (0, ':'),
                    (0, '{'), (1, 'streetAddress'), (0, ':'), (1, '21 2nd Street'), (0, ','), (1, 'city'), (0, ':'),
                    (1, 'New York'), (0, ','), (1, 'state'), (0, ':'), (1, 'NY'), (0, ','), (1, 'postalCode'),
                    (0, ':'), (1, '10021-3100'), (0, '}'), (0, ','), (1, 'phoneNumbers'), (0, ':'), (0, '['), (0, '{'),
                    (1, 'type'), (0, ':'), (1, 'home'), (0, ','), (1, 'number'), (0, ':'), (1, '212 555-1234'),
                    (0, '}'), (0, ','), (0, '{'), (1, 'type'), (0, ':'), (1, 'office'), (0, ','), (1, 'number'),
                    (0, ':'), (1, '646 555-4567'), (0, '}'), (0, ']'), (0, ','), (1, 'children'), (0, ':'), (0, '['),
                    (0, ']'), (0, ','), (1, 'spouse'), (0, ':'), (4, None), (0, '}')]
        self.assertListEqual(result, expected)
        # Whitespace between tokens must not change the token stream.
        big_file_no_space = '{"firstName":"John","lastName":"Smith","isAlive":true,"age":25,"height_cm":167.6,"addres' \
                            's":{"streetAddress":"21 2nd Street","city":"New York","state":"NY","postalCode":"10021-3' \
                            '100"},"phoneNumbers":[{"type":"home","number":"212 555-1234"},{"type":"office","number":' \
                            '"646 555-4567"}],"children":[],"spouse":null}'
        result = list(tokenize(StringIO(big_file_no_space)))
        self.assertListEqual(result, expected)
        result = list(tokenize(StringIO("854.6,123")))
        self.assertEqual(result, [(2, 854.6), (0, ','), (2, 123)])
        # Adjacent literals with no separator are malformed JSON.
        self.assertRaises(ValueError, self.tokenize_sequence, "123\"text\"")
        self.assertRaises(ValueError, self.tokenize_sequence, "23.9e10true")
        self.assertRaises(ValueError, self.tokenize_sequence, "\"test\"56")
Example #5
0
 def assertStringEquals(self, expected, actual):
     """Tokenize *actual* wrapped in double quotes and assert it produces
     exactly one STRING token whose value equals *expected*."""
     token_list = list(tokenize(StringIO('"{}"'.format(actual))))
     self.assertEqual(1, len(token_list))
     ttype, token = token_list[0]
     self.assertEqual(expected, token)
     self.assertEqual(ttype, TOKEN_TYPE.STRING)
Example #6
0
 def test_array_stream_of_documents_with_incomplete_json(self):
     """Documents streamed before a truncated element survive; the
     truncation itself surfaces as a ValueError from the generator."""
     collected = []
     source = StringIO('[{"key": "value"}, {"key": "value"}, {"INCOMPLETE')
     with self.assertRaises(ValueError):
         # Explicit loop (not a comprehension / list()) so the documents
         # consumed before the error are retained in `collected`.
         for document in stream_array(tokenize(source)):
             collected.append(document)
     self.assertListEqual(collected, [{"key": "value"}, {"key": "value"}])
Example #7
0
def load_data(filename):
    """Stream a JSON array of QA records from *filename* and build
    train/validation/test splits.

    Each record contributes two rows: the record's 'question' text with
    label 1, and the first sentence of its 'answer' text with label 0.

    Returns:
        (train_df, val_df, test_df, classes) — DataFrames with columns
        'x' (text) and 'y' (label), and classes = np.array([0, 1]).
    """
    print('Loading data from:', filename)
    x = []
    y = []
    nlp = spacy.load('en_core_web_sm')

    # noinspection SpellCheckingInspection
    def handle_message(item):
        # Positive example: the question itself.
        x.append(item['question'])
        y.append(1)
        # Negative example: first sentence of the answer.
        doc = nlp(item['answer'])
        first_sent = next(doc.sents)
        # Span.text replaces Span.string (removed in spaCy 3.x); after
        # .strip() the two are equivalent.
        x.append(first_sent.text.strip())
        y.append(0)

    with open(filename, 'r') as f:
        print('Processing stream...')
        # stream_array yields documents lazily, so the whole file is
        # never held in memory at once.
        for message in stream_array(tokenize(f)):
            handle_message(message)

    # 10% held out for test, then 10% of the remainder for validation.
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.1,
                                                        shuffle=True)
    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.1,
                                                      shuffle=True)
    train_df = pd.DataFrame({'x': x_train, 'y': y_train})
    val_df = pd.DataFrame({'x': x_val, 'y': y_val})
    test_df = pd.DataFrame({'x': x_test, 'y': y_test})
    classes = np.array([0, 1])
    print('Lengths Train: {}, Val: {}, Test: {}, Classes: {}'.format(
        len(train_df), len(val_df), len(test_df), len(classes)))
    return train_df, val_df, test_df, classes
Example #8
0
 def tokenize_sequence(self, string):
     """Return all tokens produced by tokenizing *string*, as a list."""
     return list(tokenize(StringIO(string)))
Example #9
0
 def assertStringEquals(self, expected, actual):
     """Tokenize *actual* wrapped in double quotes and assert it produces
     exactly one STRING token whose value equals *expected*."""
     token_list = list(tokenize(StringIO('"{}"'.format(actual))))
     self.assertEqual(1, len(token_list))
     ttype, token = token_list[0]
     self.assertEqual(expected, token)
     self.assertEqual(ttype, TOKEN_TYPE.STRING)
Example #10
0
 def tokenize_sequence(self, string):
     """Return all tokens produced by tokenizing *string*, as a list."""
     return list(tokenize(StringIO(string)))
Example #11
0
 def test_array_stream_of_values_with_extra_invalid_json(self):
     """Trailing garbage after the closing bracket is ignored: only the
     values inside the array are streamed."""
     result = list(stream_array(tokenize(StringIO('["People", "Places"], EXTRA'))))
     self.assertListEqual(result, ["People", "Places"])
Example #12
0
 def test_array_stream_of_values_with_incomplete_json(self):
     """Values seen before the truncation point are still streamed."""
     truncated = '["People", "Places", "INCOMPLETE'
     result = list(stream_array(tokenize(StringIO(truncated))))
     self.assertListEqual(result, ["People", "Places"])
Example #13
0
 def test_array_stream_of_documents_with_extra_invalid_json(self):
     """Content after the closing bracket does not affect the documents
     streamed from the array."""
     payload = '[{"key": "value"}, {"key": "value"}] EXTRA'
     documents = stream_array(tokenize(StringIO(payload)))
     self.assertListEqual(list(documents), [{"key": "value"}, {"key": "value"}])
Example #14
0
 def test_array_stream_of_documents(self):
     """A well-formed array of objects streams each object as a dict."""
     payload = '[{"key": "value"}, {"key": "value"}]'
     documents = stream_array(tokenize(StringIO(payload)))
     self.assertListEqual(list(documents), [{"key": "value"}, {"key": "value"}])