Example #1
0
    def test_read_config(self):
        '''
        Tests :py:func:`bella.helper.read_config`

        Three behaviours are checked:

        1. A key that is in the test configuration returns a value that
           contains the expected path.
        2. A key that is not in the configuration raises ``ValueError``.
        3. A configuration file path that does not exist raises
           ``FileNotFoundError``.
        '''

        dong_test_fp = 'tests/test_data/dong_test_data.txt'
        # assertIn rather than a bare `assert` so that the check is not
        # stripped when Python runs with -O and so that both operands are
        # reported on failure.
        self.assertIn(dong_test_fp, read_config('dong_data_test', CONFIG_FP))
        with self.assertRaises(ValueError,
                               msg='nothing here should not be in the '
                               'config.yaml'):
            read_config('nothing here', CONFIG_FP)
        test_config_name = Path('./doesnotexist')
        with self.assertRaises(FileNotFoundError,
                               msg='there should be no file named '
                               f'{test_config_name}'):
            read_config('test_data', test_config_name)
Example #2
0
def tweebo(texts):
    '''
    Given a list of Strings will tokenise, pos tag and then dependency parse
    the text using `Tweebo <https://github.com/ikekonglp/TweeboParser>`_
    a Tweet specific parser.

    The Tweebo parser cannot handle no strings therefore a special empty string
    symbol is written in place of any text that is empty or only whitespace.
    The post processing step is expected to turn that symbol back into an
    empty list for that index of the returned list.

    The texts are written one per line to a temporary file, the Tweebo
    ``python_run.sh`` script is run on that file and the ``.predict`` file
    next to it is read back and post processed.

    :param texts: The texts that are to be parsed
    :type texts: list
    :returns: A list of of a list of DependencyToken instances. A list per text \
    in the texts argument.
    :rtype: list
    :raises SystemError: If the Tweebo run script exits with a non-zero \
    return code.
    '''
    def no_text(text):
        '''
        Given a String checks if it is empty if so returns an empty_token else
        the text that was given.

        :param text: Text to be checked
        :type text: String
        :returns: The text if it is not empty or empty token if it is.
        :rtype: String
        '''

        empty_token = '$$$EMPTY$$$'
        if text.strip() == '':
            return empty_token
        return text

    with tempfile.TemporaryDirectory() as working_dir, \
            tempfile.TemporaryDirectory() as temp_dir:
        text_file_path = os.path.join(temp_dir, 'text_file.txt')
        result_file_path = os.path.join(temp_dir, 'text_file.txt.predict')
        tweebo_dir = full_path(
            read_config('depdency_parsers')['tweebo_dir'])
        with open(text_file_path, 'w+') as text_file:
            for text in texts:
                text_file.write(no_text(text))
                text_file.write('\n')
        run_script = os.path.join(tweebo_dir, 'python_run.sh')
        completed = subprocess.run(
            ['bash', run_script, text_file_path, working_dir])
        # subprocess.run returns a CompletedProcess which is always truthy,
        # so the exit status has to be inspected explicitly to detect a
        # failed run.
        if completed.returncode != 0:
            raise SystemError('Could not run the Tweebo run script {}'
                              .format(run_script))
        with open(result_file_path, 'r') as result_file:
            return tweebo_post_process(result_file.read())
Example #3
0
    def get_lexicon(self):
        '''
        Overrides :py:func:`bella.lexicons.Lexicon.get_lexicon`

        Reads the ``positive-words.txt`` and ``negative-words.txt`` files
        found in the folder configured under ``lexicons -> hu_liu``.

        :returns: A list of (word, category) tuples where the category is \
        either `positive` or `negative`.
        :rtype: list
        '''

        lexicon_dir = full_path(read_config('lexicons')['hu_liu'])
        entries = []
        for polarity in ['positive', 'negative']:
            words_fp = os.path.join(lexicon_dir,
                                    '{}-words.txt'.format(polarity))
            with open(words_fp, 'r', encoding='cp1252') as words_file:
                # Lines that begin with a non-word character are skipped:
                # this covers the `;` comment lines as well as blank lines,
                # which consist of only a newline.
                entries.extend((line.strip(), polarity)
                               for line in words_file
                               if not re.match(r'\W', line))
        return entries
Example #4
0
    def get_lexicon(self):
        '''
        Overrides :py:func:`bella.lexicons.Lexicon.get_lexicon`

        Reads the tab separated file configured under
        ``lexicons -> nrc_emotion``. The first column of a row is used as
        the word, the second as the category and the third as an integer
        association flag.

        :returns: A list of (word, category) tuples for every row whose \
        association flag is non-zero.
        :rtype: list
        '''

        lexicon_fp = full_path(read_config('lexicons')['nrc_emotion'])
        with open(lexicon_fp, 'r', newline='') as lexicon_file:
            rows = csv.reader(lexicon_file, delimiter='\t')
            # Empty rows are skipped before any column is accessed.
            return [(row[0], row[1]) for row in rows
                    if row and int(row[2])]
Example #5
0
def tweebo_install(tweebo_func):
    '''
    Python decorator that ensures that
    `TweeboParser <https://github.com/ikekonglp/TweeboParser>`_ is installed,
    before running the function it wraps. Returns the given function.

    The check, and the install if required, happen when the decorator is
    applied; the function itself is returned unchanged.

    :param tweebo_func: A function that uses the Tweebo Parser.
    :type tweebo_func: function
    :returns: The given function
    :rtype: function
    '''

    parser_dir = full_path(read_config('depdency_parsers')['tweebo_dir'])
    # The models archive acts as the marker that an install has already been
    # attempted, whether or not it succeeded.
    models_archive = os.path.join(parser_dir, 'pretrained_models.tar.gz')
    if os.path.isfile(models_archive):
        return tweebo_func
    subprocess.run(['bash', os.path.join(parser_dir, 'install.sh')])
    return tweebo_func
Example #6
0
 def get_lexicon(self):
     '''
     Overrides :py:func:`bella.lexicons.Lexicon.get_lexicon`

     Parses the file configured under ``lexicons -> mpqa``. Every non-blank
     line is read as whitespace separated ``key=value`` fields, of which
     ``word1`` and ``priorpolarity`` are required.

     :returns: A list of (word, category) tuples, one per non-blank line.
     :rtype: list
     '''
     lexicon_fp = full_path(read_config('lexicons')['mpqa'])
     entries = []
     with open(lexicon_fp, 'r') as lexicon_file:
         for raw_line in lexicon_file:
             fields = raw_line.split()
             if not fields:
                 continue
             # Fields without an `=` have no key and are ignored.
             attributes = {key: value for key, value in
                           (field.split('=') for field in fields
                            if '=' in field)}
             word = attributes['word1']
             polarity = attributes['priorpolarity']
             # When the prior polarity is `weakneg` the category is taken
             # from the `polarity` field instead.
             if polarity == 'weakneg':
                 polarity = attributes['polarity']
             entries.append((word, polarity))
     return entries
Example #7
0
    def test_gensim_word2vec(self):
        '''
        Tests the :py:class:`bella.word_vectors.GensimVectors`

        The test is in four parts:

        1. Vector lookups (including an out of vocabulary word and an
           argument that is not a String) using a ``VoVectors`` instance.
        2. That the ``index2word``, ``word2index`` and ``index2vector``
           attributes agree with each other and with ``lookup_vector``.
        3. That the constructor rejects invalid argument combinations.
        4. That vectors can be created from text data and then loaded again
           from the file path that was given when creating them.
        '''

        # Test loading word vectors from a file
        vo_zhang = VoVectors(skip_conf=True)

        self.assertEqual(vo_zhang.vector_size, 100,
                         msg='Vector size should be equal to 100 not {}'
                         .format(vo_zhang.vector_size))
        # Check zero vectors work for OOV words
        zero_vector = np.zeros(100)
        oov_word = 'thisssssdoesssssnotexists'
        oov_vector = vo_zhang.lookup_vector(oov_word)
        self.assertTrue(np.array_equal(oov_vector, zero_vector),
                        msg='This word {} should not exists and have a zero '
                        'vector and not {}'.format(oov_word, oov_vector))
        # Check it does get word vectors
        the_vector = vo_zhang.lookup_vector('the')
        self.assertFalse(np.array_equal(the_vector, zero_vector),
                         msg='The word `the` should have a non-zero vector.')

        with self.assertRaises(ValueError,
                               msg='Should raise a ValueError for any param '
                               'that is not a String and this is a list'):
            vo_zhang.lookup_vector(['the'])

        # Check if the word, index and vector lookups match
        index_word = vo_zhang.index2word
        word_index = vo_zhang.word2index
        the_index = word_index['the']
        self.assertEqual('the', index_word[the_index],
                         msg='index2word and word2index do not match for '
                         'the word `the`')
        index_vector = vo_zhang.index2vector
        the_vectors_match = np.array_equal(index_vector[the_index],
                                           vo_zhang.lookup_vector('the'))
        self.assertTrue(the_vectors_match,
                        msg='index2vector does not match lookup_vector '
                        'func for the word `the`')

        # Test the constructor
        test_file_path = 'this'
        with self.assertRaises(Exception,
                               msg='The file path should have no saved '
                               'word vector file {} and there is no '
                               'training data'.format(test_file_path)):
            GensimVectors(test_file_path, 'fake data', model='word2vec')
        with self.assertRaises(Exception,
                               msg='Should not accept neither no saved '
                               'word vector model nor no training data'):
            GensimVectors(None, None, model='word2vec')
        with self.assertRaises(Exception,
                               msg='Should only accept the following models'
                               ' {}'.format(['word2vec', 'fasttext'])):
            GensimVectors(None, [['hello', 'how', 'are']],
                          model='nothing',
                          min_count=1)

        # Test creating vectors from data
        data_path = os.path.abspath(
            read_config('sherlock_holmes_test', CONFIG_FP))
        with open(data_path, 'r') as data_file:
            # The tokenised lines get their own name so that the open file
            # handle is not rebound inside its own `with` block.
            tokenised_lines = map(tokenisers.whitespace, data_file)
            with tempfile.NamedTemporaryFile() as temp_file:
                data_vector = GensimVectors(temp_file.name,
                                            tokenised_lines,
                                            model='word2vec',
                                            size=200,
                                            name='sherlock')
                d_vec_size = data_vector.vector_size
                self.assertEqual(d_vec_size, 200,
                                 msg='Vector size should be 200 not {}'
                                 .format(d_vec_size))
                sherlock_vec = data_vector.lookup_vector('sherlock')
                self.assertFalse(np.array_equal(zero_vector, sherlock_vec),
                                 msg='Sherlock should be a non-zero vector')
                # Test that it saved the trained model
                saved_vector = GensimVectors(temp_file.name,
                                             None,
                                             model='word2vec')
                s_vec_size = saved_vector.vector_size
                self.assertEqual(s_vec_size, 200,
                                 msg='Vector size should be 200 not {}'
                                 .format(s_vec_size))
                equal_sherlocks = np.array_equal(
                    sherlock_vec, saved_vector.lookup_vector('sherlock'))
                self.assertTrue(equal_sherlocks,
                                msg='The saved model and the trained model '
                                'should have the same vectors')
                # Ensure the name attributes works
                self.assertEqual('sherlock', data_vector.name,
                                 msg='The name of the instance should be '
                                 'sherlock and not {}'
                                 .format(data_vector.name))
Example #8
0
    def test_dong(self):
        '''
        Tests :py:func:`bella.parsers.dong`

        Covers a file that does not exist, a well formed file, a file with an
        invalid sentiment value, a target that occurs more than once in the
        text and a multi word target written without its space in the text.
        '''
        def check_results(expected_results, test_results):
            '''
            Asserts that every expected dictionary matches the dictionary at
            the same index in the results of the function being tested. Only
            the keys of the expected dictionaries are compared, and `spans`
            are compared after sorting by their first element.

            :param expected_results: A list of dictionaries containing expected
            values
            :param test_results: A list of dictionaries containing results from
            the function that is being tested
            :type expected_results: list
            :type test_results: list
            :returns: Nothing but fails the test if the results do not match
            :rtype: None
            '''

            for position, expected in enumerate(expected_results):
                actual = test_results[position]
                for field in expected:
                    expected_value = expected[field]
                    actual_value = actual[field]
                    type_msg = ('The expected value : {} is not of the '
                                'same type as the tested value : {}'
                                .format(type(expected_value),
                                        type(actual_value)))
                    self.assertIsInstance(expected_value, type(actual_value),
                                          msg=type_msg)
                    if field == 'spans':
                        # Span order is not part of the contract.
                        actual_value = sorted(actual_value,
                                              key=lambda span: span[0])
                        expected_value = sorted(expected_value,
                                                key=lambda span: span[0])

                    value_msg = 'Expected {} returned {}'.format(
                        expected_value, actual_value)
                    self.assertEqual(expected_value, actual_value,
                                     msg=value_msg)

        missing_fp = 'anything'
        missing_msg = 'there should be no file named {}'.format(missing_fp)
        with self.assertRaises(FileNotFoundError, msg=missing_msg):
            dong(missing_fp)

        standard_fp = './tests/test_data/dong_test_data.txt'
        standard_expected = [
            {
                'target_id': 'dong_test_data0',
                'sentence_id': 'dong_test_data0',
                'sentiment': -1,
                'text': ('This is a fake news article that is to represent '
                         'a Tweet!!!!'),
                'target': 'news article',
                'spans': [(15, 27)],
            },
            {
                'target_id': 'dong_test_data1',
                'sentence_id': 'dong_test_data1',
                'sentiment': 1,
                'text': ('I had a great day however I did not get much '
                         'work done'),
                'target': 'day',
                'spans': [(14, 17)],
            },
            {
                'target_id': 'dong_test_data2',
                'sentence_id': 'dong_test_data2',
                'sentiment': 0,
                'text': ('I cycled in today and it was ok as it was not '
                         'raining.'),
                'target': 'cycled',
                'spans': [(2, 8)],
            },
        ]
        check_results(standard_expected, dong(standard_fp).data())

        bad_sentiment_fp = read_config('dong_bad_sent_data_test', CONFIG_FP)
        bad_sentiment_msg = ('It should not accept sentiment '
                             'values that are not 1, 0, or -1')
        with self.assertRaises(ValueError, msg=bad_sentiment_msg):
            dong(bad_sentiment_fp)

        # Ensure that it can handle the same target with multiple spans
        multiple_fp = read_config('dong_multiple_offsets_data_test',
                                  CONFIG_FP)
        multiple_expected = [
            {
                'target_id': 'dong_test_multiple_offsets_data0',
                'sentence_id': 'dong_test_multiple_offsets_data0',
                'sentiment': -1,
                'text': ('This is a fake news article that is to represent '
                         'a Tweet!!!! and it was an awful News Article I '
                         'think.'),
                'target': 'news article',
                'spans': [(15, 27), (81, 93)],
            },
            {
                'target_id': 'dong_test_multiple_offsets_data1',
                'sentence_id': 'dong_test_multiple_offsets_data1',
                'sentiment': 1,
                'text': ('I had a great Day however I did not get much '
                         'work done in the day'),
                'target': 'day',
                'spans': [(14, 17), (62, 65)],
            },
        ]
        check_results(multiple_expected, dong(multiple_fp).data())

        # Test that multi word targets that should have a space between them
        # are still detected
        mwe_fp = read_config('dong_mwe_offsets_data_test', CONFIG_FP)
        mwe_expected = [
            {
                'target_id': 'dong_test_mwe_offsets_data0',
                'sentence_id': 'dong_test_mwe_offsets_data0',
                'sentiment': -1,
                'text': ('This is a fake news article that is to represent '
                         'a Tweet!!!! and it was an awful NewsArticle I '
                         'think.'),
                'target': 'news article',
                'spans': [(15, 27), (81, 92)],
            },
        ]
        check_results(mwe_expected, dong(mwe_fp).data())