# Assumed imports for this snippet (not shown in the original excerpt):
# import os
# from nltk.translate.bleu_score import sentence_bleu
# from sacremoses import MosesTokenizer   # any Moses tokenizer with a .tokenize() method works
def own_bleu_score(predictions, references, max_order=4, smooth=False):
    '''
    reference_corpus = []
    prediction_corpus = []
    for instance_id, reference_sents in references.items():
        try:
            prediction_sent = predictions[instance_id]
        except KeyError:
            logging.error("Missing prediction for instance '%s'.", instance_id)
            sys.exit(EXIT_STATUS_PREDICTION_MISSING)
        del predictions[instance_id]
        prediction_corpus.append(prediction_sent)
        reference_corpus.append(reference_sents)

    if len(predictions) > 0:
        logging.error("Found %d extra predictions, for example: %s",
                      len(predictions), ", ".join(list(predictions.keys())[:3]))
        sys.exit(EXIT_STATUS_PREDICTIONS_EXTRA)

    reference_length = 0
    translation_length = 0
    scores = []
    counter = 0
    for (references, translation) in zip(reference_corpus, prediction_corpus):
        if counter <= 4:
            print("Reference: ", references, "\nPrediction: ", translation, "\n")
            counter += 1
        scores.append(sentence_bleu(references, translation, weights=(0, 0, 0, 1)))
    '''
    # to be able to load the punkt tokenizer from a local folder even when running on a cluster
    original_dir = os.getcwd()
    execution_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(execution_dir)
    '''
    compl_ref = ""
    for ref in references:
        compl_ref += ref + " "
    references = nltk.word_tokenize(compl_ref)
    '''
    # predictions = nltk.word_tokenize(predictions[0].strip('.'))
    tokenizer = MosesTokenizer('en')
    predictions = tokenizer.tokenize(predictions[0].lower())
    references = [tokenizer.tokenize(reference.lower()) for reference in references]
    # change directory back after the nltk tokenizers have been applied
    os.chdir(original_dir)
    # the original BLEU score uses constant weights
    # print(references[0])
    # scores = corpus_bleu([references], [predictions])
    scores = sentence_bleu(references, predictions, weights=(0.33, 0.33, 0.33))
    return scores
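# A minimal usage sketch for own_bleu_score with toy data (hypothetical sentences;
# the function expects `predictions` as a list whose first element is the predicted
# sentence and `references` as a list of reference sentences):
predictions = ["The cat sat on the mat ."]
references = ["A cat was sitting on the mat .", "The cat sat on a mat ."]
print(own_bleu_score(predictions, references))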
def read_sentence14_target(file_path, max_offset_len=83):
    """Read a SemEval-2014-style ABSA XML file and yield one example per sentence
    that contains aspect terms. Each example carries the raw sentence, the
    (aspect term, polarity) pairs, TD-LSTM style left/right contexts, and RAN
    style relative-position offsets padded to `max_offset_len`."""
    tk = MosesTokenizer()
    with open(file_path, 'rb') as fopen:
        raw = fopen.read()
        root = etree.fromstring(raw)
        for sentence in root:
            example = dict()
            example["sentence"] = sentence.find('text').text.lower()
            # for RAN
            tokens = tk.tokenize(example['sentence'])
            terms = sentence.find('aspectTerms')
            if terms is None:
                continue
            example["aspect_sentiment"] = []
            example["left_right"] = []
            example['offset'] = []
            for c in terms:
                target = c.attrib['term'].lower()
                example["aspect_sentiment"].append((target, c.attrib['polarity']))
                # for TD-LSTM: left context including the target, and target plus right context
                left_index = int(c.attrib['from'])
                right_index = int(c.attrib['to'])
                example["left_right"].append(
                    (example['sentence'][:right_index],
                     example['sentence'][left_index:],
                     c.attrib['polarity']))
                # for RAN: relative position weights (target tokens get 0,
                # context tokens a distance-based weight, padding -1)
                left_word_offset = len(tk.tokenize(example['sentence'][:left_index]))
                right_word_offset = len(tk.tokenize(example['sentence'][right_index:]))
                token_index = list(range(len(tokens)))
                token_length = float(len(token_index))
                for i in range(len(tokens)):
                    if i < left_word_offset:
                        token_index[i] = 1 - (left_word_offset - token_index[i]) / token_length
                    elif i >= len(tokens) - right_word_offset:
                        # tokens to the right of the target span
                        token_index[i] = 1 - (token_index[i] - (len(tokens) - right_word_offset) + 1) / token_length
                    else:
                        token_index[i] = 0
                token_index += [-1.] * (max_offset_len - len(tokens))
                example['offset'].append((token_index, target, c.attrib['polarity']))
            yield example
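# A minimal usage sketch for read_sentence14_target; the file name below is a
# placeholder and must point at a SemEval-2014 ABSA XML file on disk.
for example in read_sentence14_target("Restaurants_Train_v2.xml"):
    print(example["sentence"])
    print(example["aspect_sentiment"])  # list of (aspect term, polarity) tuples
    break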
def print_unrolled_stats(unrolled_data):
    """Print a per-aspect breakdown of sentiment label counts and return the counters."""
    counter = dict()
    sentiment_counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()
    aspects = set()
    for x in unrolled_data:
        aspects.add(x['aspect'])
    for a in aspects:
        counter[a] = defaultdict(int)
    for e in unrolled_data:
        counter[e['aspect']][e['sentiment']] += 1
        length_list.append(len(tk.tokenize(e['sentence'])))
    for aspect in sorted(counter.keys()):
        total = 0
        for sentiment in sorted(counter[aspect].keys()):
            print('# {}\t\t{}:\t{}'.format(aspect, sentiment, counter[aspect][sentiment]))
            total += counter[aspect][sentiment]
            sentiment_counter[sentiment] += counter[aspect][sentiment]
        counter[aspect]['total'] = total
        print('# {}\t\t{}:\t{}'.format(aspect, 'total', total))
    print()
    print(sentiment_counter)
    return counter
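# A minimal usage sketch with toy data (the dict keys mirror what the function
# reads: 'aspect', 'sentiment', and 'sentence'):
unrolled_data = [
    {'aspect': 'food', 'sentiment': 'positive', 'sentence': 'the pasta was great'},
    {'aspect': 'food', 'sentiment': 'negative', 'sentence': 'the soup was cold'},
    {'aspect': 'service', 'sentiment': 'positive', 'sentence': 'friendly staff'},
]
print_unrolled_stats(unrolled_data)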
def print_unrolled_stats_atsa(unrolled_data):
    """Print overall sentiment label counts for an ATSA-style (aspect-term) dataset and return them."""
    counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()
    for e in unrolled_data:
        counter[e['sentiment']] += 1
        length_list.append(len(tk.tokenize(e['sentence'])))
    for sentiment in sorted(counter.keys()):
        print('#{}:\t{}'.format(sentiment, counter[sentiment]))
    return counter
class MosesTokenizer(Tokenizer):
    def __init__(self):
        super().__init__()
        self._tokenizer = NLTKMosesTokenizer()
        self._detokenizer = MosesDetokenizer()

    def tokenize(self, sentence):
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces escape sequences like ``&#91;`` with the original characters
        (such as '['), so they better align to the original text.
        """
        return [self._detokenizer.unescape_xml(t) for t in tokens]
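# A minimal usage sketch of the wrapper above (it assumes the base `Tokenizer`,
# `NLTKMosesTokenizer`, and `MosesDetokenizer` classes it builds on are importable
# in this module):
tok = MosesTokenizer()
tokens = tok.tokenize("The cat isn't on the mat [sic].")
print(tokens)                  # Moses-escaped tokens, e.g. '[' appears as '&#91;'
print(tok.detokenize(tokens))  # same token list with the Moses punctuation escapes undone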
import sys

from mosestokenizer import MosesTokenizer

# Re-tokenize the first column of a tab-separated "tokens<TAB>tags" file with Moses
# and project the tags onto the new tokenization; usage: python <script>.py INPUT_FILE
TOK = MosesTokenizer()

fi = open(sys.argv[1], "r")
fo = open(sys.argv[1] + ".moses", "w")

for line in fi:
    parts = line.strip().split("\t")
    old_toks = parts[0].split()
    new_toks = TOK.tokenize(parts[0])
    tags = parts[1].split()
    new_tags = []
    tag_counter = 0
    next_covered = 0
    for index, word in enumerate(new_toks):
        if next_covered > 0:
            next_covered -= 1
            continue
        # Moses escapes characters such as '&' and "'", so escape the original
        # token the same way before comparing it with the Moses token.
        if word == old_toks[tag_counter].replace("&", "&amp;").replace("'", "&apos;"):
            new_tags.append(tags[tag_counter])
            tag_counter += 1
        else:
            # The tokenizer may have split one original token into several new ones;
            # try to match `word` joined with up to 7 following Moses tokens.
            for i in range(7):
                if word + "".join(new_toks[index + 1:index + 1 + i + 1]) == \
                        old_toks[tag_counter].replace("&", "&amp;").replace("'", "&apos;"):
                    # NOTE: the original excerpt breaks off inside this loop; the lines
                    # below are an assumed continuation that copies the tag to every
                    # sub-token of the split word.
                    new_tags.extend([tags[tag_counter]] * (i + 2))
                    next_covered = i + 1
                    tag_counter += 1
                    break
    # (the remainder of the script, e.g. writing `new_tags` to `fo`, is not part of this excerpt)
def read_sentence1516_target(file_path, max_offset_len=83):
    """Read a SemEval-2015/16-style ABSA XML file (reviews -> sentences -> Opinions)
    and yield one example per sentence with at least one non-NULL opinion target.
    Each example carries the raw sentence, a target->polarity mapping, TD-LSTM style
    left/right contexts, and RAN style relative-position offsets padded to `max_offset_len`."""
    tk = MosesTokenizer()
    with open(file_path, 'rb') as fopen:
        raw = fopen.read()
        root = etree.fromstring(raw)
        for review_xml in root:
            sentences_xml = review_xml.find("sentences")
            for sentence_xml in sentences_xml:
                example = dict()
                example["sentence"] = sentence_xml.find('text').text.lower()
                # for RAN
                tokens = tk.tokenize(example['sentence'])
                opinions_xml = sentence_xml.find('Opinions')
                if opinions_xml is None:
                    continue
                example["aspect_sentiment"] = {}
                example['left_right'] = []
                example['offset'] = []
                for opinion_xml in opinions_xml:
                    target = opinion_xml.attrib["target"].lower()
                    if target == 'null':
                        continue
                    example["aspect_sentiment"][target] = opinion_xml.attrib["polarity"]
                    # for TD-LSTM: left and right contexts around the target
                    left_index = int(opinion_xml.attrib['from'])
                    right_index = int(opinion_xml.attrib['to'])
                    example["left_right"].append(
                        (example['sentence'][:left_index],
                         example['sentence'][right_index:],
                         opinion_xml.attrib['polarity']))
                    # for RAN: relative position weights (target tokens get 0,
                    # context tokens a distance-based weight, padding -1)
                    left_word_offset = len(tk.tokenize(example['sentence'][:left_index]))
                    right_word_offset = len(tk.tokenize(example['sentence'][right_index:]))
                    token_index = list(range(len(tokens)))
                    token_length = float(len(token_index))
                    for i in range(len(tokens)):
                        if i < left_word_offset:
                            token_index[i] = 1 - (left_word_offset - token_index[i]) / token_length
                        elif i >= len(tokens) - right_word_offset:
                            token_index[i] = 1 - (token_index[i] - (len(tokens) - right_word_offset) + 1) / token_length
                        else:
                            token_index[i] = 0
                    token_index += [-1.] * (max_offset_len - len(tokens))
                    example['offset'].append((token_index, target, opinion_xml.attrib['polarity']))
                if len(example["aspect_sentiment"]) == 0:
                    continue
                yield example