# Assumed imports for this snippet (not shown in the original excerpt):
# import os
# from nltk.translate.bleu_score import sentence_bleu
# from sacremoses import MosesTokenizer   # any Moses tokenizer with a .tokenize() method works
def own_bleu_score(predictions, references, max_order=4, smooth=False):
    '''
    reference_corpus = []
    prediction_corpus = []
    for instance_id, reference_sents in references.items():
        try:
            prediction_sent = predictions[instance_id]
        except KeyError:
            logging.error("Missing prediction for instance '%s'.", instance_id)
            sys.exit(EXIT_STATUS_PREDICTION_MISSING)
        del predictions[instance_id]
        prediction_corpus.append(prediction_sent)
        reference_corpus.append(reference_sents)

    if len(predictions) > 0:
        logging.error("Found %d extra predictions, for example: %s",
                      len(predictions), ", ".join(list(predictions.keys())[:3]))
        sys.exit(EXIT_STATUS_PREDICTIONS_EXTRA)

    reference_length = 0
    translation_length = 0
    scores = []
    counter = 0
    for (references, translation) in zip(reference_corpus, prediction_corpus):
        if counter <= 4:
            print("Reference: ", references, "\nPrediction: ", translation, "\n")
            counter += 1
        scores.append(sentence_bleu(references, translation, weights=(0, 0, 0, 1)))
    '''
    # to be able to load the punkt tokenizer from a local folder even when running on a cluster
    original_dir = os.getcwd()
    execution_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(execution_dir)
    '''
    compl_ref = ""
    for ref in references:
        compl_ref += ref + " "
    references = nltk.word_tokenize(compl_ref)
    '''
    # predictions = nltk.word_tokenize(predictions[0].strip('.'))
    tokenizer = MosesTokenizer('en')
    predictions = tokenizer.tokenize(predictions[0].lower())
    references = [tokenizer.tokenize(reference.lower()) for reference in references]
    # change directory back after the nltk tokenizers have been applied
    os.chdir(original_dir)
    # the original BLEU score uses constant weights
    # print(references[0])
    # scores = corpus_bleu([references], [predictions])
    scores = sentence_bleu(references, predictions, weights=(0.33, 0.33, 0.33))
    return scores
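# A minimal usage sketch for own_bleu_score with toy data (hypothetical sentences;
# the function expects `predictions` as a list whose first element is the predicted
# sentence and `references` as a list of reference sentences):
predictions = ["The cat sat on the mat ."]
references = ["A cat was sitting on the mat .", "The cat sat on a mat ."]
print(own_bleu_score(predictions, references))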
def read_sentence14_target(file_path, max_offset_len=83):
    """Read a SemEval-2014-style ABSA XML file and yield one example per sentence
    that contains aspect terms. Each example carries the raw sentence, the
    (aspect term, polarity) pairs, TD-LSTM style left/right contexts, and RAN
    style relative-position offsets padded to `max_offset_len`."""
    tk = MosesTokenizer()
    with open(file_path, 'rb') as fopen:
        raw = fopen.read()
        root = etree.fromstring(raw)
        for sentence in root:
            example = dict()
            example["sentence"] = sentence.find('text').text.lower()
            # for RAN
            tokens = tk.tokenize(example['sentence'])
            terms = sentence.find('aspectTerms')
            if terms is None:
                continue
            example["aspect_sentiment"] = []
            example["left_right"] = []
            example['offset'] = []
            for c in terms:
                target = c.attrib['term'].lower()
                example["aspect_sentiment"].append((target, c.attrib['polarity']))
                # for TD-LSTM: left context including the target, and target plus right context
                left_index = int(c.attrib['from'])
                right_index = int(c.attrib['to'])
                example["left_right"].append(
                    (example['sentence'][:right_index],
                     example['sentence'][left_index:],
                     c.attrib['polarity']))
                # for RAN: relative position weights (target tokens get 0,
                # context tokens a distance-based weight, padding -1)
                left_word_offset = len(tk.tokenize(example['sentence'][:left_index]))
                right_word_offset = len(tk.tokenize(example['sentence'][right_index:]))
                token_index = list(range(len(tokens)))
                token_length = float(len(token_index))
                for i in range(len(tokens)):
                    if i < left_word_offset:
                        token_index[i] = 1 - (left_word_offset - token_index[i]) / token_length
                    elif i >= len(tokens) - right_word_offset:
                        # tokens to the right of the target span
                        token_index[i] = 1 - (token_index[i] - (len(tokens) - right_word_offset) + 1) / token_length
                    else:
                        token_index[i] = 0
                token_index += [-1.] * (max_offset_len - len(tokens))
                example['offset'].append((token_index, target, c.attrib['polarity']))
            yield example
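# A minimal usage sketch for read_sentence14_target; the file name below is a
# placeholder and must point at a SemEval-2014 ABSA XML file on disk.
for example in read_sentence14_target("Restaurants_Train_v2.xml"):
    print(example["sentence"])
    print(example["aspect_sentiment"])  # list of (aspect term, polarity) tuples
    break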
def print_unrolled_stats(unrolled_data):
    """Print a per-aspect breakdown of sentiment label counts and return the counters."""
    counter = dict()
    sentiment_counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()
    aspects = set()
    for x in unrolled_data:
        aspects.add(x['aspect'])
    for a in aspects:
        counter[a] = defaultdict(int)
    for e in unrolled_data:
        counter[e['aspect']][e['sentiment']] += 1
        length_list.append(len(tk.tokenize(e['sentence'])))
    for aspect in sorted(counter.keys()):
        total = 0
        for sentiment in sorted(counter[aspect].keys()):
            print('# {}\t\t{}:\t{}'.format(aspect, sentiment, counter[aspect][sentiment]))
            total += counter[aspect][sentiment]
            sentiment_counter[sentiment] += counter[aspect][sentiment]
        counter[aspect]['total'] = total
        print('# {}\t\t{}:\t{}'.format(aspect, 'total', total))
    print()
    print(sentiment_counter)
    return counter
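# A minimal usage sketch with toy data (the dict keys mirror what the function
# reads: 'aspect', 'sentiment', and 'sentence'):
unrolled_data = [
    {'aspect': 'food', 'sentiment': 'positive', 'sentence': 'the pasta was great'},
    {'aspect': 'food', 'sentiment': 'negative', 'sentence': 'the soup was cold'},
    {'aspect': 'service', 'sentiment': 'positive', 'sentence': 'friendly staff'},
]
print_unrolled_stats(unrolled_data)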
def print_unrolled_stats_atsa(unrolled_data):
    """Print overall sentiment label counts for an ATSA-style (aspect-term) dataset and return them."""
    counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()
    for e in unrolled_data:
        counter[e['sentiment']] += 1
        length_list.append(len(tk.tokenize(e['sentence'])))
    for sentiment in sorted(counter.keys()):
        print('#{}:\t{}'.format(sentiment, counter[sentiment]))
    return counter
class MosesTokenizer(Tokenizer):
    def __init__(self):
        super().__init__()
        self._tokenizer = NLTKMosesTokenizer()
        self._detokenizer = MosesDetokenizer()

    def tokenize(self, sentence):
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces escape sequences like ``&#91;`` with the original characters
        (such as '['), so they better align to the original text.
        """
        return [self._detokenizer.unescape_xml(t) for t in tokens]
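# A minimal usage sketch of the wrapper above (it assumes the base `Tokenizer`,
# `NLTKMosesTokenizer`, and `MosesDetokenizer` classes it builds on are importable
# in this module):
tok = MosesTokenizer()
tokens = tok.tokenize("The cat isn't on the mat [sic].")
print(tokens)                  # Moses-escaped tokens, e.g. '[' appears as '&#91;'
print(tok.detokenize(tokens))  # same token list with the Moses punctuation escapes undone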
import sys

from mosestokenizer import MosesTokenizer

# Re-tokenize the first column of a tab-separated "tokens<TAB>tags" file with Moses
# and project the tags onto the new tokenization; usage: python <script>.py INPUT_FILE
TOK = MosesTokenizer()

fi = open(sys.argv[1], "r")
fo = open(sys.argv[1] + ".moses", "w")

for line in fi:
    parts = line.strip().split("\t")
    old_toks = parts[0].split()
    new_toks = TOK.tokenize(parts[0])
    tags = parts[1].split()
    new_tags = []
    tag_counter = 0
    next_covered = 0
    for index, word in enumerate(new_toks):
        if next_covered > 0:
            next_covered -= 1
            continue
        # Moses escapes characters such as '&' and "'", so escape the original
        # token the same way before comparing it with the Moses token.
        if word == old_toks[tag_counter].replace("&", "&amp;").replace("'", "&apos;"):
            new_tags.append(tags[tag_counter])
            tag_counter += 1
        else:
            # The tokenizer may have split one original token into several new ones;
            # try to match `word` joined with up to 7 following Moses tokens.
            for i in range(7):
                if word + "".join(new_toks[index + 1:index + 1 + i + 1]) == \
                        old_toks[tag_counter].replace("&", "&amp;").replace("'", "&apos;"):
                    # NOTE: the original excerpt breaks off inside this loop; the lines
                    # below are an assumed continuation that copies the tag to every
                    # sub-token of the split word.
                    new_tags.extend([tags[tag_counter]] * (i + 2))
                    next_covered = i + 1
                    tag_counter += 1
                    break
    # (the remainder of the script, e.g. writing `new_tags` to `fo`, is not part of this excerpt)
def read_sentence1516_target(file_path, max_offset_len=83):
    """Read a SemEval-2015/16-style ABSA XML file (reviews -> sentences -> Opinions)
    and yield one example per sentence with at least one non-NULL opinion target.
    Each example carries the raw sentence, a target->polarity mapping, TD-LSTM style
    left/right contexts, and RAN style relative-position offsets padded to `max_offset_len`."""
    tk = MosesTokenizer()
    with open(file_path, 'rb') as fopen:
        raw = fopen.read()
        root = etree.fromstring(raw)
        for review_xml in root:
            sentences_xml = review_xml.find("sentences")
            for sentence_xml in sentences_xml:
                example = dict()
                example["sentence"] = sentence_xml.find('text').text.lower()
                # for RAN
                tokens = tk.tokenize(example['sentence'])
                opinions_xml = sentence_xml.find('Opinions')
                if opinions_xml is None:
                    continue
                example["aspect_sentiment"] = {}
                example['left_right'] = []
                example['offset'] = []
                for opinion_xml in opinions_xml:
                    target = opinion_xml.attrib["target"].lower()
                    if target == 'null':
                        continue
                    example["aspect_sentiment"][target] = opinion_xml.attrib["polarity"]
                    # for TD-LSTM: left and right contexts around the target
                    left_index = int(opinion_xml.attrib['from'])
                    right_index = int(opinion_xml.attrib['to'])
                    example["left_right"].append(
                        (example['sentence'][:left_index],
                         example['sentence'][right_index:],
                         opinion_xml.attrib['polarity']))
                    # for RAN: relative position weights (target tokens get 0,
                    # context tokens a distance-based weight, padding -1)
                    left_word_offset = len(tk.tokenize(example['sentence'][:left_index]))
                    right_word_offset = len(tk.tokenize(example['sentence'][right_index:]))
                    token_index = list(range(len(tokens)))
                    token_length = float(len(token_index))
                    for i in range(len(tokens)):
                        if i < left_word_offset:
                            token_index[i] = 1 - (left_word_offset - token_index[i]) / token_length
                        elif i >= len(tokens) - right_word_offset:
                            token_index[i] = 1 - (token_index[i] - (len(tokens) - right_word_offset) + 1) / token_length
                        else:
                            token_index[i] = 0
                    token_index += [-1.] * (max_offset_len - len(tokens))
                    example['offset'].append((token_index, target, opinion_xml.attrib['polarity']))
                if len(example["aspect_sentiment"]) == 0:
                    continue
                yield example