Example #1
0
def dong(file_path):
    '''
    Given file path to the
    `Li Dong <https://github.com/bluemonk482/tdparse/tree/master/data/lidong>`_
    sentiment data it will parse the data and return it as a list of dictionaries.

    :param file_path: File Path to the annotated data
    :type file_path: String
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    '''

    file_path = os.path.abspath(file_path)
    if not os.path.isfile(file_path):
        raise FileNotFoundError('This file does not exist {}'.format(file_path))
    file_name, _ = os.path.splitext(os.path.basename(file_path))
    sentiment_range = [-1, 0, 1]

    sentiment_data = TargetCollection()
    with open(file_path, 'r') as dong_file:
        sent_dict = {}
        for index, line in enumerate(dong_file):
            divisible = index + 1
            line = line.strip()
            if divisible % 3 == 1:
                sent_dict['text'] = line
            elif divisible % 3 == 2:
                sent_dict['target'] = line
            elif divisible % 3 == 0:
                sentiment = int(line)
                if sentiment not in sentiment_range:
                    raise ValueError('The sentiment has to be one of the '\
                                     'following values {} not {}'\
                                     .format(sentiment_range, sentiment))
                sent_dict['sentiment'] = int(line)
                text = sent_dict['text'].lower()
                target = sent_dict['target'].lower()
                offsets = [match.span() for match in re.finditer(target, text)]
                if len(target.split()) > 1:
                    joined_target = ''.join(target.split())
                    offsets.extend([match.span()
                                    for match in re.finditer(joined_target, text)])
                sent_dict['spans'] = offsets
                sent_id = file_name + str(len(sentiment_data))
                # Sentence ID is the same as the target as there is only one
                # target per sentence
                sent_dict['sentence_id'] = sent_id
                sent_dict['target_id'] = sent_id
                sent_target = Target(**sent_dict)
                sentiment_data.add(sent_target)
                sent_dict = {}
            else:
                raise Exception('Problem')
    return sentiment_data
Example #2
0
    def test_targetcoll_add(self):
        '''
        Test the add function of TargetCollection
        '''

        target_col = TargetCollection()
        target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                  'text with Iphone', 1)
        target_example_1 = Target([(1, 5)], '3', 'Iphone',
                                  'text with Iphone', 1)
        # Ensure the normal case works
        target_col.add(target_example_0)
        self.assertEqual(target_col['1'], target_example_0, msg='Test that {}' \
                         ' has been added to {}'\
                         .format(target_example_0, target_col))

        with self.assertRaises(TypeError, msg='Should not be able to add a dict'):
            target_col.add({'target_id' : '2'})

        with self.assertRaises(ValueError, msg='Should not be able to add a '\
                               'Target that has no `id`'):
            del target_example_1['target_id']
            if 'target_id' in target_example_1:
                raise KeyError('{} should not contain `id` key'\
                .format(target_example_1))
            target_col.add(target_example_1)
Example #3
0
    def test_targetcoll_data(self):
        '''
        Test the data function of TargetCollection
        '''

        target_col = TargetCollection()
        target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                  'text with Iphone', 1)
        target_example_1 = Target([(1, 5)], '3', 'Iphone',
                                  'text with Iphone', 1)
        target_col.add(target_example_0)
        target_col.add(target_example_1)

        all_data = target_col.data()
        self.assertEqual(target_example_0, all_data[0], msg='First data '\
                         'returned should be the first inserted {} and not '\
                         '{}'.format(target_example_0, all_data[0]))
        self.assertEqual(target_example_1, all_data[1], msg='Second data '\
                         'returned should be the second inserted {} and not '\
                         '{}'.format(target_example_1, all_data[1]))

        target_example_2 = Target([(1, 2)], '2', 'Iphone',
                                  'text with Iphone', 1)
        del target_col['1']
        target_col.add(target_example_2)
        all_data = target_col.data()
        self.assertEqual(target_example_1, all_data[0], msg='First data '\
                         'returned should be the second inserted {} and not '\
                         '{} as the first has been removed'\
                         .format(target_example_1, all_data[0]))
        self.assertEqual(target_example_2, all_data[1], msg='Second data '\
                         'returned should be the third inserted {} and not '\
                         '{} as the first has been removed'\
                         .format(target_example_2, all_data[1]))
        self.assertEqual(2, len(all_data), msg='The length of the data returned'\
                         'shoudl be 2 and not {}'.format(len(all_data)))
Example #4
0
def _semeval_extract_data(sentences, file_name, conflict=False,
                          sentence_ids_skip=None):
    '''
    :param sentences: A `sentences` named element
    :param file_name: Name of the file being parsed
    :param conflict: Determine if to keep the target data that has a conflict \
    sentiment label.
    :param sentence_ids_skip: IDs of sentences that should be skipped
    :type sentences: xml.etree.ElementTree.Element
    :type file_name: String
    :type conflict: bool. Defailt False
    :type sentence_ids_skip: list. Default None
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    '''

    # Converts the sentiment tags from Strings to ints
    sentiment_mapper = {'conflict' : -2, 'negative' : -1,
                        'neutral' : 0, 'positive' : 1}

    def extract_aspect_terms(aspect_terms, sentence_id):
        '''
        :param aspect_terms: An aspectTerms element within the xml tree
        :param sentence_id: Id of the sentence that the aspects came from.
        :type aspect_terms: xml.etree.ElementTree.Element
        :type sentence_id: String
        :returns: A list of dictioanries containg id, span, sentiment and \
        target
        :rtype: list
        '''

        aspect_terms_data = []
        for index, aspect_term in enumerate(aspect_terms):
            aspect_term = aspect_term.attrib
            aspect_term_data = {}
            sentiment = sentiment_mapper[aspect_term['polarity']]
            if sentiment == -2 and not conflict:
                continue
            aspect_id = '{}{}'.format(sentence_id, index)
            aspect_term_data['target_id'] = aspect_id
            if 'term' in aspect_term:
                aspect_term_data['target'] = aspect_term['term']
            elif 'target' in aspect_term:
                aspect_term_data['target'] = aspect_term['target']
            else:
                raise KeyError('There is no `target` attribute in the opinions '\
                               'element {}'.format(aspect_term))
            aspect_term_data['sentiment'] = sentiment
            aspect_term_data['spans'] = [(int(aspect_term['from']),
                                          int(aspect_term['to']))]
            aspect_term_data['sentence_id'] = sentence_id
            # If the target is NULL then there is no target
            if aspect_term_data['target'] == 'NULL':
                continue
            aspect_terms_data.append(aspect_term_data)
        return aspect_terms_data

    def add_text(aspect_data, text):
        '''
        :param aspect_data: A list of dicts containing `span`, `target` and \
        `sentiment` keys.
        :param text: The text of the sentence that is associated to all of the \
        aspects in the aspect_data list
        :type aspect_data: list
        :type text: String
        :returns: The list of dicts in the aspect_data parameter but with a \
        `text` key with the value that the text parameter contains
        :rtype: list
        '''

        for data in aspect_data:
            data['text'] = text
        return aspect_data

    all_aspect_term_data = TargetCollection()
    for sentence in sentences:
        aspect_term_data = None
        text_index = None
        sentence_id = file_name + sentence.attrib['id']
        # Allow the parser to skip certain sentences
        if sentence_ids_skip is not None:
            if sentence.attrib['id'] in sentence_ids_skip:
                continue
        for index, data in enumerate(sentence):
            if data.tag == 'sentence':
                raise Exception(sentence.attrib['id'])
            if data.tag == 'text':
                text_index = index
            elif data.tag == 'aspectTerms' or data.tag == 'Opinions':
                aspect_term_data = extract_aspect_terms(data, sentence_id)
        if aspect_term_data is None:
            continue
        if text_index is None:
            raise ValueError('A semeval sentence should always have text '\
                             'semeval file {} sentence id {}'\
                             .format(file_name, sentence.attrib['id']))
        sentence_text = sentence[text_index].text
        aspect_term_data = add_text(aspect_term_data, sentence_text)
        for aspect in aspect_term_data:
            sent_target = Target(**aspect)
            all_aspect_term_data.add(sent_target)
    return all_aspect_term_data
Example #5
0
def hu_liu(file_path):
    '''
    Parser for the datasets from the following two papers (DOES NOT WORK):

    1. `A Holistic Lexicon-Based Approach to Opinion Mining \
    <https://www.cs.uic.edu/~liub/FBS/opinion-mining-final-WSDM.pdf>`_
    2. `Mining and Summarizing Customer Reviews \
    <https://www.cs.uic.edu/~liub/publications/kdd04-revSummary.pdf>`_

    Currently this does not work. This is due to the dataset not containing
    enough data to determine where the targets are in the text.

    :param file_path: The path to a file containing annotations in the format \
    of hu and liu sentiment datasets.
    :type file_path: String
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    '''
    file_path = os.path.abspath(file_path)
    file_name = os.path.basename(file_path)
    sentiment_data = TargetCollection()

    with open(file_path, 'r', encoding='cp1252') as annotations:
        for sentence_index, annotation in enumerate(annotations):
            # If it does not contain ## then not a sentence
            if '##' not in annotation:
                continue
            targets_text = annotation.split('##')
            if len(targets_text) > 2 or len(targets_text) < 1:
                raise ValueError('The annotation {} when split on `##` should '\
                                 'contain at least the sentence text and at'\
                                 ' most the text and the targets and not {}'\
                                 .format(annotation, targets_text))
            # If it just contains the sentence text then go to next
            elif len(targets_text) == 1:
                continue
            elif targets_text[0].strip() == '':
                continue
            targets, text = targets_text
            targets = targets.strip()
            text = text.strip()
            sentence_id = file_name + '#{}'.format(sentence_index)

            targets = targets.split(',')
            for target_index, target in enumerate(targets):
                target = target.strip()
                sentiment_match = re.search(r'\[[+-]\d\]$', target)
                is_implicit = re.search(r'\[[up]\]', target)
                if is_implicit:
                    print('Target {} is implicit {}'.format(target, text))
                    continue
                if not sentiment_match:
                    raise ValueError('Target {} does not have a corresponding'\
                                     ' sentiment value. annotation {}'\
                                     .format(target, annotation))
                target_text = target[:sentiment_match.start()].strip()
                sentiment_text = sentiment_match.group().strip().strip('[]')
                sentiment_value = int(sentiment_text)

                target_matches = list(re.finditer(target_text, text))
                if len(target_matches) != 1:
                    print('The Target {} can only occur once in the '\
                          'text {}'.format(target_text, text))
                    continue
                    raise ValueError('The Target {} can only occur once in the '\
                                     'text {}'.format(target_text, text))
                target_span = target_matches[0].span()
                target_id = sentence_id + '#{}'.format(target_index)

                data_dict = {}
                data_dict['spans'] = [target_span]
                data_dict['target'] = target_text
                data_dict['sentiment'] = sentiment_value
                data_dict['text'] = text
                data_dict['sentence_id'] = sentence_id
                data_dict['target_id'] = target_id
                sentiment_data.add(Target(**data_dict))
    return sentiment_data