import os
import re

# NOTE: `Target` and `TargetCollection` are assumed to be provided by the
# surrounding project; they are used but not defined in this section.


def dong(file_path):
    '''
    Given a file path to the `Li Dong
    <https://github.com/bluemonk482/tdparse/tree/master/data/lidong>`_
    sentiment data, parses the data and returns it as a TargetCollection of
    Target instances.

    :param file_path: File path to the annotated data
    :type file_path: String
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    '''

    file_path = os.path.abspath(file_path)
    if not os.path.isfile(file_path):
        raise FileNotFoundError('This file does not exist {}'
                                .format(file_path))
    file_name, _ = os.path.splitext(os.path.basename(file_path))
    sentiment_range = [-1, 0, 1]

    sentiment_data = TargetCollection()
    with open(file_path, 'r') as dong_file:
        sent_dict = {}
        for index, line in enumerate(dong_file):
            divisible = index + 1
            line = line.strip()
            if divisible % 3 == 1:
                sent_dict['text'] = line
            elif divisible % 3 == 2:
                sent_dict['target'] = line
            else:
                sentiment = int(line)
                if sentiment not in sentiment_range:
                    raise ValueError('The sentiment has to be one of the '
                                     'following values {} not {}'
                                     .format(sentiment_range, sentiment))
                sent_dict['sentiment'] = sentiment
                text = sent_dict['text'].lower()
                target = sent_dict['target'].lower()
                # Escape the target so that any regex meta characters in it
                # are matched literally.
                offsets = [match.span() for match in
                           re.finditer(re.escape(target), text)]
                if len(target.split()) > 1:
                    joined_target = ''.join(target.split())
                    offsets.extend([match.span() for match in
                                    re.finditer(re.escape(joined_target),
                                                text)])
                sent_dict['spans'] = offsets
                sent_id = file_name + str(len(sentiment_data))
                # Sentence ID is the same as the target ID as there is only
                # one target per sentence
                sent_dict['sentence_id'] = sent_id
                sent_dict['target_id'] = sent_id
                sent_target = Target(**sent_dict)
                sentiment_data.add(sent_target)
                sent_dict = {}
    return sentiment_data
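# A minimal runnable sketch of how `dong` is driven, inferred from the
# parsing loop above: the file holds one example per three lines (sentence
# text, then target, then sentiment as -1, 0 or 1), and the target string is
# expected to occur in the sentence text so that its spans can be found.
# The helper name and the example sentence are made up for illustration.
def _demo_dong():
    import tempfile

    example = 'i love my iphone\niphone\n1\n'
    with tempfile.NamedTemporaryFile('w', suffix='.txt',
                                     delete=False) as dong_file:
        dong_file.write(example)
    target_collection = dong(dong_file.name)
    for target in target_collection.data():
        # Expected output: iphone 1 [(10, 16)]
        print(target['target'], target['sentiment'], target['spans'])
    os.remove(dong_file.name)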
def test_targetcoll_add(self):
    '''
    Test the add function of TargetCollection
    '''

    target_col = TargetCollection()
    target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                              'text with Iphone', 1)
    target_example_1 = Target([(1, 5)], '3', 'Iphone',
                              'text with Iphone', 1)
    # Ensure the normal case works
    target_col.add(target_example_0)
    self.assertEqual(target_col['1'], target_example_0,
                     msg='Test that {} has been added to {}'
                         .format(target_example_0, target_col))

    with self.assertRaises(TypeError,
                           msg='Should not be able to add a dict'):
        target_col.add({'target_id': '2'})
    with self.assertRaises(ValueError,
                           msg='Should not be able to add a Target that '
                               'has no `id`'):
        del target_example_1['target_id']
        if 'target_id' in target_example_1:
            raise KeyError('{} should not contain `id` key'
                           .format(target_example_1))
        target_col.add(target_example_1)
def test_targetcoll_data(self):
    '''
    Test the data function of TargetCollection
    '''

    target_col = TargetCollection()
    target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                              'text with Iphone', 1)
    target_example_1 = Target([(1, 5)], '3', 'Iphone',
                              'text with Iphone', 1)
    target_col.add(target_example_0)
    target_col.add(target_example_1)

    all_data = target_col.data()
    self.assertEqual(target_example_0, all_data[0],
                     msg='First data returned should be the first inserted '
                         '{} and not {}'
                         .format(target_example_0, all_data[0]))
    self.assertEqual(target_example_1, all_data[1],
                     msg='Second data returned should be the second '
                         'inserted {} and not {}'
                         .format(target_example_1, all_data[1]))

    target_example_2 = Target([(1, 2)], '2', 'Iphone',
                              'text with Iphone', 1)
    del target_col['1']
    target_col.add(target_example_2)
    all_data = target_col.data()
    self.assertEqual(target_example_1, all_data[0],
                     msg='First data returned should be the second '
                         'inserted {} and not {} as the first has been '
                         'removed'.format(target_example_1, all_data[0]))
    self.assertEqual(target_example_2, all_data[1],
                     msg='Second data returned should be the third '
                         'inserted {} and not {} as the first has been '
                         'removed'.format(target_example_2, all_data[1]))
    self.assertEqual(2, len(all_data),
                     msg='The length of the data returned should be 2 '
                         'and not {}'.format(len(all_data)))
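# The two tests above pin down the behaviour that `Target` and
# `TargetCollection` must provide: Target is built positionally from
# (spans, target_id, target, text, sentiment) and supports dict-style key
# access and deletion; TargetCollection keeps insertion order, is indexed
# and deleted by target_id, rejects non-Target values with a TypeError and
# id-less Targets with a ValueError. The classes below are a minimal sketch
# of that contract inferred from the tests, NOT the project's actual
# implementation; any detail the tests do not assert is an assumption.
from collections import OrderedDict


class Target(dict):
    '''A dict-like record with fixed fields (illustrative sketch).'''

    def __init__(self, spans, target_id, target, text, sentiment,
                 sentence_id=None):
        super().__init__(spans=spans, target_id=target_id, target=target,
                         text=text, sentiment=sentiment,
                         sentence_id=sentence_id)


class TargetCollection:
    '''An insertion-ordered mapping from target_id to Target (sketch).'''

    def __init__(self):
        self._storage = OrderedDict()

    def add(self, value):
        # Only Target instances may be added ...
        if not isinstance(value, Target):
            raise TypeError('Can only add Target instances, not {}'
                            .format(type(value)))
        # ... and they must still carry their `target_id`.
        if 'target_id' not in value or value['target_id'] is None:
            raise ValueError('The Target has no `target_id`: {}'
                             .format(value))
        self._storage[value['target_id']] = value

    def data(self):
        # All stored Targets, in insertion order.
        return list(self._storage.values())

    def __getitem__(self, key):
        return self._storage[key]

    def __delitem__(self, key):
        del self._storage[key]

    def __len__(self):
        return len(self._storage)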
def _semeval_extract_data(sentences, file_name, conflict=False,
                          sentence_ids_skip=None):
    '''
    :param sentences: A `sentences` named element
    :param file_name: Name of the file being parsed
    :param conflict: Whether to keep the target data that has a conflict \
    sentiment label.
    :param sentence_ids_skip: IDs of sentences that should be skipped
    :type sentences: xml.etree.ElementTree.Element
    :type file_name: String
    :type conflict: bool. Default False
    :type sentence_ids_skip: list. Default None
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    '''

    # Converts the sentiment tags from Strings to ints
    sentiment_mapper = {'conflict': -2, 'negative': -1,
                        'neutral': 0, 'positive': 1}

    def extract_aspect_terms(aspect_terms, sentence_id):
        '''
        :param aspect_terms: An aspectTerms element within the xml tree
        :param sentence_id: Id of the sentence that the aspects came from.
        :type aspect_terms: xml.etree.ElementTree.Element
        :type sentence_id: String
        :returns: A list of dictionaries containing id, span, sentiment and \
        target
        :rtype: list
        '''

        aspect_terms_data = []
        for index, aspect_term in enumerate(aspect_terms):
            aspect_term = aspect_term.attrib
            aspect_term_data = {}
            sentiment = sentiment_mapper[aspect_term['polarity']]
            if sentiment == -2 and not conflict:
                continue
            aspect_id = '{}{}'.format(sentence_id, index)
            aspect_term_data['target_id'] = aspect_id
            if 'term' in aspect_term:
                aspect_term_data['target'] = aspect_term['term']
            elif 'target' in aspect_term:
                aspect_term_data['target'] = aspect_term['target']
            else:
                raise KeyError('There is no `target` attribute in the '
                               'opinions element {}'.format(aspect_term))
            aspect_term_data['sentiment'] = sentiment
            aspect_term_data['spans'] = [(int(aspect_term['from']),
                                          int(aspect_term['to']))]
            aspect_term_data['sentence_id'] = sentence_id
            # If the target is NULL then there is no target
            if aspect_term_data['target'] == 'NULL':
                continue
            aspect_terms_data.append(aspect_term_data)
        return aspect_terms_data

    def add_text(aspect_data, text):
        '''
        :param aspect_data: A list of dicts containing `span`, `target` and \
        `sentiment` keys.
        :param text: The text of the sentence that is associated to all of \
        the aspects in the aspect_data list
        :type aspect_data: list
        :type text: String
        :returns: The list of dicts in the aspect_data parameter but with a \
        `text` key whose value is the text parameter
        :rtype: list
        '''

        for data in aspect_data:
            data['text'] = text
        return aspect_data

    all_aspect_term_data = TargetCollection()
    for sentence in sentences:
        aspect_term_data = None
        text_index = None
        sentence_id = file_name + sentence.attrib['id']
        # Allow the parser to skip certain sentences
        if sentence_ids_skip is not None:
            if sentence.attrib['id'] in sentence_ids_skip:
                continue
        for index, data in enumerate(sentence):
            if data.tag == 'sentence':
                raise Exception(sentence.attrib['id'])
            if data.tag == 'text':
                text_index = index
            elif data.tag == 'aspectTerms' or data.tag == 'Opinions':
                aspect_term_data = extract_aspect_terms(data, sentence_id)
        if aspect_term_data is None:
            continue
        if text_index is None:
            raise ValueError('A semeval sentence should always have text, '
                             'semeval file {} sentence id {}'
                             .format(file_name, sentence.attrib['id']))
        sentence_text = sentence[text_index].text
        aspect_term_data = add_text(aspect_term_data, sentence_text)
        for aspect in aspect_term_data:
            sent_target = Target(**aspect)
            all_aspect_term_data.add(sent_target)
    return all_aspect_term_data
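# A minimal runnable sketch of how `_semeval_extract_data` is driven,
# reconstructed from the loop above: it walks a `sentences` element whose
# `sentence` children each hold a `text` child and an `aspectTerms` (or
# `Opinions`) child. The XML snippet and the helper name below are made up
# for illustration.
def _demo_semeval_extract():
    import xml.etree.ElementTree as ET

    example_xml = (
        '<sentences>'
        '<sentence id="42">'
        '<text>The battery life is great but the screen is dim.</text>'
        '<aspectTerms>'
        '<aspectTerm term="battery life" polarity="positive" '
        'from="4" to="16"/>'
        '<aspectTerm term="screen" polarity="negative" from="34" to="40"/>'
        '</aspectTerms>'
        '</sentence>'
        '</sentences>')
    sentences = ET.fromstring(example_xml)
    targets = _semeval_extract_data(sentences, 'example_file')
    for target in targets.data():
        # Spans come straight from the from/to attributes, and the polarity
        # strings are mapped to ints (positive -> 1, negative -> -1).
        print(target['target'], target['sentiment'], target['spans'])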
def hu_liu(file_path):
    '''
    Parser for the datasets from the following two papers (DOES NOT WORK):

    1. `A Holistic Lexicon-Based Approach to Opinion Mining \
    <https://www.cs.uic.edu/~liub/FBS/opinion-mining-final-WSDM.pdf>`_
    2. `Mining and Summarizing Customer Reviews \
    <https://www.cs.uic.edu/~liub/publications/kdd04-revSummary.pdf>`_

    Currently this does not work. This is due to the dataset not containing
    enough data to determine where the targets are in the text.

    :param file_path: The path to a file containing annotations in the \
    format of the Hu and Liu sentiment datasets.
    :type file_path: String
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    '''

    file_path = os.path.abspath(file_path)
    file_name = os.path.basename(file_path)
    sentiment_data = TargetCollection()
    with open(file_path, 'r', encoding='cp1252') as annotations:
        for sentence_index, annotation in enumerate(annotations):
            # If it does not contain ## then it is not a sentence
            if '##' not in annotation:
                continue
            targets_text = annotation.split('##')
            if len(targets_text) > 2 or len(targets_text) < 1:
                raise ValueError('The annotation {} when split on `##` '
                                 'should contain at least the sentence text '
                                 'and at most the text and the targets and '
                                 'not {}'.format(annotation, targets_text))
            # If it just contains the sentence text then go to the next
            elif len(targets_text) == 1:
                continue
            elif targets_text[0].strip() == '':
                continue
            targets, text = targets_text
            targets = targets.strip()
            text = text.strip()
            sentence_id = file_name + '#{}'.format(sentence_index)
            targets = targets.split(',')
            for target_index, target in enumerate(targets):
                target = target.strip()
                sentiment_match = re.search(r'\[[+-]\d\]$', target)
                is_implicit = re.search(r'\[[up]\]', target)
                if is_implicit:
                    print('Target {} is implicit {}'.format(target, text))
                    continue
                if not sentiment_match:
                    raise ValueError('Target {} does not have a '
                                     'corresponding sentiment value. '
                                     'annotation {}'
                                     .format(target, annotation))
                target_text = target[:sentiment_match.start()].strip()
                sentiment_text = sentiment_match.group().strip().strip('[]')
                sentiment_value = int(sentiment_text)
                # Escape the target text so that any regex meta characters
                # in it are matched literally.
                target_matches = list(re.finditer(re.escape(target_text),
                                                  text))
                if len(target_matches) != 1:
                    print('The Target {} can only occur once in the '
                          'text {}'.format(target_text, text))
                    continue
                target_span = target_matches[0].span()
                target_id = sentence_id + '#{}'.format(target_index)

                data_dict = {}
                data_dict['spans'] = [target_span]
                data_dict['target'] = target_text
                data_dict['sentiment'] = sentiment_value
                data_dict['text'] = text
                data_dict['sentence_id'] = sentence_id
                data_dict['target_id'] = target_id
                sentiment_data.add(Target(**data_dict))
    return sentiment_data
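# A minimal runnable sketch of the Hu and Liu annotation format that the
# parser above expects: `target[sentiment]##sentence`, with multiple
# comma-separated targets allowed before the `##`. Note that the parser only
# succeeds when every target occurs verbatim exactly once in the sentence,
# which (as the docstring warns) the real datasets often violate. The review
# line and the helper name below are made up for illustration.
def _demo_hu_liu():
    import tempfile

    line = ('battery life[+2],screen[-1]##the battery life is great '
            'but the screen is dim\n')
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False,
                                     encoding='cp1252') as review_file:
        review_file.write(line)
    target_collection = hu_liu(review_file.name)
    for target in target_collection.data():
        # Expected: two Targets, `battery life` with sentiment 2 and
        # `screen` with sentiment -1.
        print(target['target'], target['sentiment'], target['spans'])
    os.remove(review_file.name)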