class BootStrapTester(unittest.TestCase):
    """Unit tests for MutualBootStrapper and OpenNLPChunkerWrapper.

    Every test runs against the fixture built in setUp: a bootstrapper
    created from five short sentences and two seed words, on which
    find_patterns(), set_counter_arrays() and find_seeds() have already
    been called, plus a chunker wrapper pointing at a local OpenNLP install.
    """

    # External resources the fixture depends on. The OpenNLP paths are
    # machine-specific absolute paths.
    COMMENT_DATA_PATH = "comment_data/jobs-1.json"
    OPENNLP_BIN = "/home/keelan/lib/apache-opennlp-1.5.3/bin/opennlp"
    CHUNKER_MODEL = "/home/keelan/lib/en-chunker.bin"

    def setUp(self):
        """Load the JSON comment data and build the bootstrapper and chunker."""
        # Use a context manager so the file handle is closed after every
        # test instead of being left for the garbage collector.
        with open(self.COMMENT_DATA_PATH) as json_file:
            self.json_data = json.load(json_file)
        self.all_data = list(self.json_data.values())
        self.slash_word = "60,000/yr"
        self.data = ["I am a lawyer.", "I am a doctor.", "I work as a lawyer.",
                     "I became a doctor.", "I am a nun."]
        self.seeds = ["doctor", "lawyer"]

        self.mbs = MutualBootStrapper(self.data, self.seeds)
        # The tests below inspect state produced by these three calls,
        # made in this order.
        self.mbs.find_patterns()
        self.mbs.set_counter_arrays()
        self.mbs.find_seeds()
        self.pattern_index = self.mbs.pattern_alphabet.get_index(
            ("am", "a", "<x>"))
        self.chunker = OpenNLPChunkerWrapper(self.OPENNLP_BIN,
                                             self.CHUNKER_MODEL)

    def test_nested_tokenize(self):
        """_nested_tokenize returns one inner list per sentence."""
        # Comment "cfy2a30" must come back as exactly one sentence that
        # holds more than one token.
        single_sent = self.mbs._nested_tokenize(self.json_data["cfy2a30"])
        self.assertEqual(len(single_sent), 1)
        self.assertGreater(len(single_sent[0]), 1)
        # Comment "cfy9b0t" must come back as more than one sentence.
        multi_sents = self.mbs._nested_tokenize(self.json_data["cfy9b0t"])
        self.assertGreater(len(multi_sents), 1)

    def test_postprocess_tokenized(self):
        """_postprocess_tokenized_text puts spaces around a slash in a token."""
        nested = [[self.slash_word]]
        # The return value is ignored: the nested list itself is inspected,
        # so the method has to modify it in place.
        self.mbs._postprocess_tokenized_text(nested)
        self.assertEqual(nested[0][0], "60,000 / yr")

    def test_pattern_finder(self):
        """find_patterns fills the pattern alphabet, including 'as a <x>'."""
        self.assertGreater(self.mbs.pattern_alphabet.size(), 0)
        self.assertTrue(
            self.mbs.pattern_alphabet.has_label(("as", "a", "<x>")))

    def test_counter_arrays(self):
        """set_counter_arrays leaves n_pattern_array non-empty."""
        self.assertGreater(self.mbs.n_pattern_array.size, 0)

    def test_find_seeds(self):
        """find_seeds fills the counters and candidate seeds for 'am a <x>'."""
        self.assertEqual(self.mbs.n_pattern_array[self.pattern_index], 4)
        self.assertEqual(self.mbs.f_pattern_array[self.pattern_index], 2)
        # "nun" is not one of the seeds; it must be proposed as a candidate
        # and be associated with the "am a <x>" pattern index.
        self.assertIn("nun", self.mbs.candidate_seeds)
        self.assertIn(self.pattern_index, self.mbs.candidate_seeds["nun"])

    def test_opennlp_chunker(self):
        """chunk_sent returns a non-empty sequence whose first item is a Chunk."""
        tagged_sentence = [("Rockwell", "NNP"), ("said", "VBD"),
                           ("the", "DT"), ("agreement", "NN"), (".", ".")]

        chunked_sentence = self.chunker.chunk_sent(tagged_sentence)
        # The result must be non-empty for the element check below to be
        # meaningful; asserting an empty result and then indexing element 0
        # could never pass.
        self.assertGreater(len(chunked_sentence), 0)
        self.assertIsInstance(chunked_sentence[0], Chunk)