Example #1
def test_array_stream(self):
    arr = stream_array(tokenize(StringIO('[]')))
    self.assertListEqual(list(arr), [])
    arr = stream_array(tokenize(StringIO('["People", "Places", "Things"]')))
    self.assertListEqual(list(arr), ["People", "Places", "Things"])
    arr = stream_array(tokenize(StringIO('["Apples", "Bananas", ["Pears", "Limes"]]')))
    self.assertListEqual(list(arr), ["Apples", "Bananas", ["Pears", "Limes"]])
    arr = stream_array(tokenize(StringIO('["Apples", ["Pears", "Limes"], "Bananas"]')))
    self.assertListEqual(list(arr), ["Apples", ["Pears", "Limes"], "Bananas"])
Example #2
def test_array_stream(self):
    arr = stream_array(tokenize(StringIO('[]')))
    self.assertListEqual(list(arr), [])
    arr = stream_array(tokenize(StringIO('["People", "Places", "Things"]')))
    self.assertListEqual(list(arr), ["People", "Places", "Things"])
    arr = stream_array(tokenize(StringIO('["Apples", "Bananas", ["Pears", "Limes"]]')))
    self.assertListEqual(list(arr), ["Apples", "Bananas", ["Pears", "Limes"]])
    arr = stream_array(tokenize(StringIO('["Apples", ["Pears", "Limes"], "Bananas"]')))
    self.assertListEqual(list(arr), ["Apples", ["Pears", "Limes"], "Bananas"])
    arr = stream_array(tokenize(StringIO('["Apples", {"key":"value"}, "Bananas"]')))
    self.assertListEqual(list(arr), ["Apples", {"key": "value"}, "Bananas"])
Example #3
def test_array_stream_of_documents_with_incomplete_json(self):
    arr = []
    with self.assertRaises(ValueError):
        messages = stream_array(tokenize(StringIO(
            '[{"key": "value"}, {"key": "value"}, {"INCOMPLETE')))
        for message in messages:
            arr.append(message)
    self.assertListEqual(arr, [{"key": "value"}, {"key": "value"}])
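Example #3 shows the consumption pattern that makes streaming useful with truncated input: items yielded before the parse error are kept, and the ValueError only surfaces once iteration reaches the bad region. A hypothetical helper (not part of any library shown here) that packages that pattern:

def collect_valid(stream):
    """Drain a streaming array, keeping every item parsed before an error.

    Hypothetical convenience wrapper around a stream_array generator;
    it swallows the ValueError raised on incomplete or invalid trailing JSON
    and returns whatever was successfully parsed up to that point.
    """
    items = []
    try:
        for item in stream:
            items.append(item)
    except ValueError:
        pass  # truncated/invalid tail: keep the items we already have
    return items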
Example #4
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split

# Assumption: tokenize/stream_array come from a streaming JSON parser such
# as naya; the test snippets above use the same pair of functions.
from naya.json import stream_array, tokenize


def load_data(filename):
    print('Loading data from:', filename)
    x = []
    y = []
    nlp = spacy.load('en_core_web_sm')

    def handle_message(item):
        # Questions are labeled 1; the first sentence of each answer is labeled 0.
        x.append(item['question'])
        y.append(1)
        doc = nlp(item['answer'])
        first_sent = next(doc.sents)
        x.append(first_sent.text.strip())  # Span.string was removed in spaCy v3; use .text
        y.append(0)

    with open(filename, 'r') as f:
        print('Processing stream...')
        messages = stream_array(tokenize(f))
        for message in messages:
            handle_message(message)

    # Hold out 10% for test, then 10% of the remainder for validation.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, shuffle=True)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, shuffle=True)
    train_df = pd.DataFrame({'x': x_train, 'y': y_train})
    val_df = pd.DataFrame({'x': x_val, 'y': y_val})
    test_df = pd.DataFrame({'x': x_test, 'y': y_test})
    classes = np.array([0, 1])
    print('Lengths Train: {}, Val: {}, Test: {}, Classes: {}'.format(
        len(train_df), len(val_df), len(test_df), len(classes)))
    return train_df, val_df, test_df, classes
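Judging by handle_message, load_data expects its input file to contain a JSON array of objects with 'question' and 'answer' keys; because the array is streamed token by token, the file never has to fit in memory at once. A usage sketch with a hypothetical filename:

train_df, val_df, test_df, classes = load_data('qa_pairs.json')  # hypothetical file
print(train_df.head())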
Example #5
def test_array_stream_of_values_with_extra_invalid_json(self):
    arr = stream_array(tokenize(StringIO('["People", "Places"], EXTRA')))
    self.assertListEqual(list(arr), ["People", "Places"])
Example #6
def test_array_stream_of_values_with_incomplete_json(self):
    arr = stream_array(
        tokenize(StringIO('["People", "Places", "INCOMPLETE')))
    self.assertListEqual(list(arr), ["People", "Places"])
Example #7
def test_array_stream_of_documents_with_extra_invalid_json(self):
    arr = stream_array(
        tokenize(StringIO('[{"key": "value"}, {"key": "value"}] EXTRA')))
    self.assertListEqual(list(arr), [{"key": "value"}, {"key": "value"}])
Example #8
def test_array_stream_of_documents(self):
    arr = stream_array(
        tokenize(StringIO('[{"key": "value"}, {"key": "value"}]')))
    self.assertListEqual(list(arr), [{"key": "value"}, {"key": "value"}])
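None of the test snippets above include their imports. The tokenize/stream_array pair matches the naya streaming JSON parser, though that is an inference from the names rather than something the snippets state; a minimal self-contained harness under that assumption:

import unittest
from io import StringIO

# Assumption: these come from naya (https://github.com/danielyule/naya);
# adjust the import if your project sources them elsewhere.
from naya.json import stream_array, tokenize


class ArrayStreamTests(unittest.TestCase):
    def test_array_stream(self):
        # stream_array yields one element at a time from the token stream.
        arr = stream_array(tokenize(StringIO('["People", "Places"]')))
        self.assertListEqual(list(arr), ["People", "Places"])


if __name__ == '__main__':
    unittest.main()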