def modified_recall(references, hypothesis, n):
    """
    Calculate modified ngram recall.

    :param references: A list of (reference, weights) pairs, where ``reference``
        is a reference translation and ``weights`` maps tokens to their weights.
    :type references: list(tuple(list(str), dict))
    :param hypothesis: A hypothesis translation.
    :type hypothesis: list(str)
    :param n: The ngram order.
    :type n: int
    :return: The numerator and denominator of the modified recall for the
        nth order ngram.
    :rtype: tuple(int, int)
    """
    # Extract all ngrams in the hypothesis.
    # Set an empty Counter if the hypothesis is empty.
    numerator = 0
    denominator = 0
    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
    for reference_and_weights in references:
        reference = reference_and_weights[0]
        weights = reference_and_weights[1]
        reference_counts = (
            Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
        )
        # Clip each reference count by the corresponding hypothesis count.
        clipped_counts = {
            ngram: min(count, counts[ngram])
            for ngram, count in reference_counts.items()
        }
        # Reweight unigram counts when per-token weights are provided.
        if n == 1 and len(weights) == len(reference_counts):

            def weighted_sum(weights, counts):
                sum_counts = 0
                for ngram, count in counts.items():
                    sum_counts += count * weights.get(ngram[0], 1)
                return sum_counts

            numerator += weighted_sum(weights, clipped_counts)
            denominator += max(1, weighted_sum(weights, reference_counts))
        else:
            numerator += sum(clipped_counts.values())
            denominator += max(1, sum(reference_counts.values()))
    return numerator, denominator
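# A minimal usage sketch for `modified_recall` above. It assumes `Counter` is
# collections.Counter and `ngrams` behaves like nltk.util.ngrams; the
# (reference, weights) pairing and the example sentences are illustrative only.
from collections import Counter
from fractions import Fraction
from nltk.util import ngrams

hypothesis = 'the cat sat on the mat'.split()
reference = 'the cat is on the mat'.split()
weights = {}  # empty dict -> falls back to unweighted counting

num, den = modified_recall([(reference, weights)], hypothesis, n=1)
print(Fraction(num, den))  # 5/7: five of the seven reference unigrams are matched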
def test_ngrams_padding_left():
    text = 'hello'
    assert [('h',), ('e',), ('l',), ('l',), ('o',)] == \
        utils.ngrams(text, 1, padding_left=True)
    assert [(utils.START, 'h'), ('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')] == \
        utils.ngrams(text, 2, padding_left=True)
    assert [(utils.START, utils.START, 'h'), (utils.START, 'h', 'e'),
            ('h', 'e', 'l'), ('e', 'l', 'l'), ('l', 'l', 'o')] == \
        utils.ngrams(text, 3, padding_left=True)
def test_ngrams_padding_right():
    text = 'hello'
    assert [('h',), ('e',), ('l',), ('l',), ('o',)] == \
        utils.ngrams(text, 1, padding_right=True)
    assert [('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o'), ('o', utils.END)] == \
        utils.ngrams(text, 2, padding_right=True)
    assert [('h', 'e', 'l'), ('e', 'l', 'l'), ('l', 'l', 'o'),
            ('l', 'o', utils.END), ('o', utils.END, utils.END)] == \
        utils.ngrams(text, 3, padding_right=True)
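# The padding tests above assume a `utils.ngrams` with START/END sentinels and
# optional left/right padding. A minimal sketch of that behaviour (the sentinel
# values are assumptions; the real `utils` module may use different ones):
START = '<s>'
END = '</s>'

def ngrams(sequence, n, padding_left=False, padding_right=False):
    """Return the list of n-grams (as tuples) over `sequence`."""
    items = list(sequence)
    if padding_left:
        items = [START] * (n - 1) + items
    if padding_right:
        items = items + [END] * (n - 1)
    return [tuple(items[i:i + n]) for i in range(len(items) - n + 1)]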
def generate_batch_text_grams(self):
    start = self.start
    end = self.end
    self.texts_train = np.array([])
    self.labels_train = np.array([])
    data_split = self.ids[start:end]
    for i in range(len(data_split)):
        ids_index = data_split[i][0].split(" ")
        id = int(ids_index[0])
        index = int(ids_index[1])
        labels = self.labels[index][0]
        split_labels = labels.split(" ")
        labels_temp = np.zeros(config.label_size)
        for j in range(1, len(split_labels)):
            try:
                label_index = utils.find_label_index(split_labels[j])
                labels_temp[label_index] = 1.0
            except ValueError:
                print("Missing label:", split_labels[j])
        self.labels_train = np.append(self.labels_train, labels_temp)
        text_name = str(id) + "newsML.xml"
        reuters = et.parse(
            "data/rcv1-2/train-text/" + text_name,
            et.XMLParser(encoding='ISO-8859-1'),
        ).getroot()
        # Concatenate the title and all body paragraphs of the article.
        temp_text = ""
        for text in reuters.findall("title"):
            temp_text = temp_text + text.text
        for text in reuters.findall("text"):
            for p in text.findall("p"):
                temp_text = temp_text + p.text
        temp_text = utils.ngrams(temp_text)
        self.texts_train = np.append(self.texts_train, temp_text)
def compute_counts(self, sentences, targets):
    """
    Estimate emission, transition and tag-unigram counts for a bigram HMM.

    :param sentences: list of token lists
    :param targets: list of tag lists, aligned with ``sentences``
    """
    for i in range(len(sentences)):
        words = sentences[i]
        tags = targets[i]
        # Update emission counts for each (word, tag) pair.
        for j in range(len(tags)):
            self.emission_counts[(words[j], tags[j])] = self.emission_counts.get(
                (words[j], tags[j]), 0.0) + 1
        # Prepend <START> so we also estimate P(t_i | START), usually denoted pi.
        # `ngrams(..., r=2, pad=False)` is expected to yield grams of order 1 and 2.
        pos_bigrams = ngrams(['<START>'] + tags, r=2, pad=False)
        for gram in pos_bigrams.values():
            if len(gram) == 1:
                # Unigram counts.
                self.pos_unigram_counts[gram[0]] = self.pos_unigram_counts.get(
                    gram[0], 0.0) + 1
            else:
                # Bigram transition counts.
                self.transition_counts[gram] = self.transition_counts.get(
                    gram, 0.0) + 1
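# A sketch (an assumption, not part of the snippet) of how the raw counts
# collected above could be turned into a maximum-likelihood HMM parameter:
def transition_prob(self, prev_tag, tag):
    # P(tag | prev_tag) = count(prev_tag, tag) / count(prev_tag)
    bigram = self.transition_counts.get((prev_tag, tag), 0.0)
    unigram = self.pos_unigram_counts.get(prev_tag, 0.0)
    return bigram / unigram if unigram else 0.0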
def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
    """
    Smoothing method 6:
    Interpolates the maximum likelihood estimate of the precision *p_n* with
    a prior estimate *pi0*. The prior is estimated by assuming that the ratio
    between p_n and p_{n-1} will be the same as that between p_{n-1} and
    p_{n-2}; from Gao and He (2013) Training MRF-Based Phrase Translation
    Models using Gradient Ascent. In NAACL.
    """
    hyp_len = hyp_len if hyp_len else len(hypothesis)
    # This smoothing only works when p_1 and p_2 are non-zero.
    # Raise an error with an appropriate message when the input is too short
    # to use this smoothing technique.
    assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
    for i, p_i in enumerate(p_n):
        if i in [0, 1]:  # Skips the first 2 orders of ngrams.
            continue
        else:
            pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
            # No. of ngrams in translation that matches the reference.
            m = p_i.numerator
            # No. of ngrams in translation.
            l = sum(1 for _ in ngrams(hypothesis, i + 1))
            # Calculates the interpolated precision.
            p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
    return p_n
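# A small worked illustration of the interpolation above, with hypothetical
# precisions and a hypothetical smoothing weight `alpha`:
p1, p2 = 0.9, 0.6                     # unigram and bigram precisions
pi0 = p2 ** 2 / p1                    # prior for the trigram precision: 0.4
m, l, alpha = 2, 10, 5                # matched trigrams, total trigrams, weight
p3 = (m + alpha * pi0) / (l + alpha)  # (2 + 2.0) / 15 = 0.2666...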
def test_ngrams_simple():
    text = 'hello'
    assert [('h',), ('e',), ('l',), ('l',), ('o',)] == utils.ngrams(text, 1)
    assert [('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')] == \
        utils.ngrams(text, 2)
    assert [('h', 'e', 'l'), ('e', 'l', 'l'), ('l', 'l', 'o')] == \
        utils.ngrams(text, 3)
    text2 = 'hello world'
    assert [('h',), ('e',), ('l',), ('l',), ('o',), (' ',),
            ('w',), ('o',), ('r',), ('l',), ('d',)] == \
        utils.ngrams(text2, 1)
    assert [('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o'), ('o', ' '),
            (' ', 'w'), ('w', 'o'), ('o', 'r'), ('r', 'l'), ('l', 'd')] == \
        utils.ngrams(text2, 2)
def greedy_search(abs_list, doc_list, budget=3):
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    max_rouge = 0.0
    abs_tokens = _rouge_clean(' '.join(flatten(abs_list))).split()
    sents = [_rouge_clean(' '.join(sent)).split() for sent in doc_list]
    hyp_unigrams = [ngrams(sent, 1) for sent in sents]
    hyp_bigrams = [ngrams(sent, 2) for sent in sents]
    ref_unigrams = ngrams(abs_tokens, 1)
    ref_bigrams = ngrams(abs_tokens, 2)

    selected_idxs = []
    for _ in range(budget):
        curr_max_rouge = max_rouge
        curr_id = -1
        for (i, sent) in enumerate(sents):
            if i in selected_idxs:
                continue
            candidate_idxs = selected_idxs + [i]
            candidate_unigrams = set.union(
                *[set(hyp_unigrams[idx]) for idx in candidate_idxs])
            candidate_bigrams = set.union(
                *[set(hyp_bigrams[idx]) for idx in candidate_idxs])
            rouge1 = approx_rouge(candidate_unigrams, ref_unigrams)
            rouge2 = approx_rouge(candidate_bigrams, ref_bigrams)
            rouge_score = rouge1 + rouge2
            if rouge_score > curr_max_rouge:
                curr_max_rouge = rouge_score
                curr_id = i
        if curr_id == -1:
            return (list(sorted(selected_idxs)), max_rouge)
        selected_idxs.append(curr_id)
        max_rouge = curr_max_rouge
    return (list(sorted(selected_idxs)), max_rouge)
def preprocess_data(dataset, embeddings_path, topn, min_cos_sim):
    path = './' + dataset.dataset_name + '/examples/'
    embeddings = load_embeddings(embeddings_path)

    # Training part
    examples = set((ngram, ngram[1]) for sentence in dataset.get_train_sentences
                   for ngram in ngrams(sentence) if ngram[1] in embeddings)
    save_examples(examples, path, 'examples')
    # examples = load_examples(path+'examples.pkl')
    na_dataset = PerClassDataset(examples)

    # Validation part
    valid_examples = set((ngram, ngram[1]) for sentence in dataset.get_valid_sentences
                         for ngram in ngrams(sentence)
                         if ngram[1] not in na_dataset and ngram[1] in embeddings)
    save_examples(valid_examples, path, 'valid_examples')
    valid_dataset = PerClassDataset(valid_examples)
    tr_val_dataset = na_dataset | valid_dataset

    # Test part
    test_examples = set((ngram, ngram[1]) for sentence in dataset.get_test_sentences
                        for ngram in ngrams(sentence)
                        if ngram[1] not in tr_val_dataset and ngram[1] in embeddings)
    save_examples(test_examples, path, 'test_examples')
    test_dataset = PerClassDataset(test_examples)
    # valid_examples = load_examples(path+'valid_examples.pkl')
    # test_examples = load_examples(path+'test_examples.pkl')
    save_examples(test_examples | valid_examples, path, 'valid_test_examples')

    # OOV part
    all_sentences = (dataset.get_train_sentences + dataset.get_valid_sentences
                     + dataset.get_test_sentences)
    oov_examples = set((ngram, ngram[1]) for sentence in all_sentences
                       for ngram in ngrams(sentence) if ngram[1] not in embeddings)
    save_examples(oov_examples, path, 'oov_examples')

    # Augmented part
    all_dataset = tr_val_dataset | test_dataset
    filter_cond = lambda label: label not in all_dataset
    augmented_examples = augment_data(examples, embeddings_path,
                                      filter_cond=filter_cond, topn=topn,
                                      min_cos_sim=min_cos_sim)
    augmented_examples |= examples  # Union
    save_examples(augmented_examples, path,
                  'augmented_examples_topn{topn}_cos_sim{cs}'.format(
                      topn=topn, cs=min_cos_sim))
def get_mentions(dbr, question):
    tris = str_to_trigrams_dict(dbr.lower())
    sets = []
    best_guess = MinStore()
    current_mention = MentionSet()
    for index, word in enumerate(question.split(' ')):
        word_trigrams = ngrams(word.lower())
        prob = overlap_trigrams_score(tris, word_trigrams)
        if not current_mention.append(word, index, prob):
            mention = MentionSet(mention=[word], indexes=[index], prob=prob)
            best_guess.store(mention, prob)
            if not current_mention.is_empty():
                sets.append(current_mention)
            current_mention = MentionSet()
    if not current_mention.is_empty():
        sets.append(current_mention)
    if len(sets) == 0:
        # No match found, fall back to the best guess.
        best_guess = best_guess.get_item()
        sets = [best_guess]
    return sets
def character_based_4gram(text):
    return utils.ngrams(text, 4, padding_left=True, padding_right=True)
def character_based_4gram(text, n=4, padding=True):
    # `n` and `padding` were undefined free variables in the original; expose
    # them as keyword arguments so the helper is self-contained.
    return utils.ngrams(text, n, padding_left=padding, padding_right=padding)
def trigram_overlap(x, y):
    x_tri = ngrams(x, 3)
    y_tri = ngrams(y, 3)
    return any(_y_tri in x_tri for _y_tri in y_tri)
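# Example usage of `trigram_overlap`, assuming `ngrams` returns a list (or set)
# of n-gram tuples rather than a one-shot generator:
x = 'the quick brown fox jumps'.split()
y = 'a quick brown fox sleeps'.split()
print(trigram_overlap(x, y))  # True: both share ('quick', 'brown', 'fox')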
def test(test_dataset):
    model.load_state_dict(torch.load(args.ckpt_path))
    model.eval()
    test_loader = tqdm(
        load(test_dataset, args.batch_size, shuffle=False),
        ncols=100,
    )
    true_list = []
    pred_list = []
    avgs = []
    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            batch_probs = F.softmax(model(batch), 1)
            batch_labels = batch['label_ids']
            for j in range(batch_probs.size(0)):
                probs = unpack(batch_probs[j, 1])
                labels = unpack(batch_labels[j])
                _probs = []
                for (prob, label) in zip(probs, labels):
                    if label != -1:
                        true_list.append(label)
                        pred_list.append(prob)
                        _probs.append(prob)
                abs_list = test_dataset.abs_lists[i][0]
                doc_list = test_dataset.doc_lists[i][0]
                node_list = test_dataset.node_lists[i]
                pred_label_list = [0] * len(doc_list)
                opt_nodes = []
                for (node, prob) in zip(node_list, _probs):
                    if prob > 0.7:
                        for opt_node in node_list:
                            if opt_node['group'] == node['group']:
                                opt_nodes.append(opt_node)
                        for kk in range(node['start_index'], node['end_index']):
                            pred_label_list[kk] = 1
                for node in opt_nodes:
                    for kk in range(node['start_index'], node['end_index']):
                        pred_label_list[kk] = 1
                words = [
                    word for (word, lab) in zip(doc_list, pred_label_list)
                    if lab == 0
                ]
                abs_ngrams = set(ngrams(abs_list, 1))
                doc_ngrams = set(ngrams(words, 1))
                rouge = compute_rouge(abs_ngrams, doc_ngrams)
                avgs.append(rouge)
    results_dict = {'F1': np.mean(avgs)}
    return results_dict
def modified_precision(references, hypothesis, n):
    """
    Calculate modified ngram precision.

    The normal precision method may lead to some wrong translations with
    high-precision, e.g., the translation, in which a word of reference
    repeats several times, has very high precision.

    This function only returns the Fraction object that contains the numerator
    and denominator necessary to calculate the corpus-level precision.
    To calculate the modified precision for a single pair of hypothesis and
    references, cast the Fraction object into a float.

    The famous "the the the ... " example shows that you can get BLEU precision
    by duplicating high frequency words.

    >>> reference1 = 'the cat is on the mat'.split()
    >>> reference2 = 'there is a cat on the mat'.split()
    >>> hypothesis1 = 'the the the the the the the'.split()
    >>> references = [reference1, reference2]
    >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
    0.2857...

    In the modified n-gram precision, a reference word will be considered
    exhausted after a matching hypothesis word is identified, e.g.

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will',
    ...               'forever', 'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']
    >>> hypothesis = 'of the'.split()
    >>> references = [reference1, reference2, reference3]
    >>> float(modified_precision(references, hypothesis, n=1))
    1.0
    >>> float(modified_precision(references, hypothesis, n=2))
    1.0

    An example of a normal machine translation hypothesis:

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...                'ensures', 'that', 'the', 'military', 'always',
    ...                'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...                'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...                'that', 'party', 'direct']
    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will',
    ...               'forever', 'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']
    >>> references = [reference1, reference2, reference3]
    >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
    0.9444...
    >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
    0.5714...
    >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
    0.5882352941176471
    >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
    0.07692...

    :param references: A list of reference translations.
    :type references: list(list(str))
    :param hypothesis: A hypothesis translation.
    :type hypothesis: list(str)
    :param n: The ngram order.
    :type n: int
    :return: BLEU's modified precision for the nth order ngram.
    :rtype: Fraction
    """
    # Extracts all ngrams in hypothesis
    # Set an empty Counter if hypothesis is empty.
    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
    # Extract a union of references' counts.
    # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
    max_counts = {}
    for reference in references:
        reference_counts = (
            Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
        )
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

    # Assigns the intersection between hypothesis and references' counts.
    clipped_counts = {
        ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
    }

    numerator = sum(clipped_counts.values())
    # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
    # Usually this happens when the ngram order is > len(reference).
    denominator = max(1, sum(counts.values()))

    return Fraction(numerator, denominator, _normalize=False)
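# A sketch of how the unnormalised Fractions returned above can be pooled into
# a corpus-level precision. `corpus_precision` is a hypothetical helper; it
# relies on `_normalize=False` preserving the raw numerator/denominator counts.
from fractions import Fraction

def corpus_precision(list_of_references, hypotheses, n):
    numerator, denominator = 0, 0
    for references, hypothesis in zip(list_of_references, hypotheses):
        p_i = modified_precision(references, hypothesis, n)
        numerator += p_i.numerator
        denominator += p_i.denominator
    return Fraction(numerator, max(1, denominator))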
def eval_compressions(abs_list, doc_list, doc_compressions):
    def _apply_compression(node, labels):
        _labels = copy.deepcopy(labels)
        for _i in range(node.start_index, node.end_index):
            assert 0 <= node.sent_index <= len(_labels)
            assert 0 <= _i <= len(_labels[node.sent_index])
            _labels[node.sent_index][_i] = 1  # Delete
        return _labels

    def _preprocess(tokens):
        # `_rouge_clean` is assumed to be a module-level helper that strips
        # non-alphanumeric characters (see `greedy_search` above).
        return [_rouge_clean(token.lower()) for token in tokens]

    def _score_compression(abs_list, doc_list, hyp_labels):
        abs_tokens = flatten(abs_list)
        doc_tokens = []
        assert len(doc_list) == len(hyp_labels)
        for (sent, mask) in zip(doc_list, hyp_labels):
            assert len(sent) == len(mask)
            for (token, label) in zip(sent, mask):
                if label == 0:
                    doc_tokens.append(token)
        doc_tokens = _preprocess(doc_tokens)
        doc_uni = set(ngrams(doc_tokens, 1))
        doc_bi = set(ngrams(doc_tokens, 2))
        rouge1 = compute_rouge(doc_uni, _abs_uni)
        rouge2 = compute_rouge(doc_bi, _abs_bi)
        return (rouge1, rouge2)

    _abs_tokens = _preprocess(flatten(abs_list))
    _abs_uni = set(ngrams(_abs_tokens, 1))
    _abs_bi = set(ngrams(_abs_tokens, 2))
    compressions_list = []
    doc_labels = [[0 for _ in range(len(sent))] for sent in doc_list]
    base_rouge1, base_rouge2 = _score_compression(abs_list, doc_list, doc_labels)
    base_rouge = base_rouge1 + base_rouge2
    for sent_compressions in doc_compressions:
        for i, node in enumerate(sent_compressions):
            if [node.sent_index, node.node_index] in compressions_list:
                continue
            hyp_labels = _apply_compression(node, doc_labels)
            mod_rouge1, mod_rouge2 = _score_compression(
                abs_list, doc_list, hyp_labels)
            mod_rouge = mod_rouge1 + mod_rouge2
            if mod_rouge > base_rouge:
                compressions_list.append([node.sent_index, node.node_index])
                # If a parent constituent gets deleted, then by definition, all
                # child constituents must also be deleted.
                for child_node in sent_compressions[i + 1:]:
                    if (node.start_index <= child_node.start_index
                            and child_node.end_index <= node.end_index):
                        compressions_list.append(
                            [child_node.sent_index, child_node.node_index])
    return compressions_list
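# `compute_rouge` (used in the snippets above) is not shown. A plausible,
# order-symmetric sketch (F1 over two n-gram sets), offered only as an
# assumption about its behaviour:
def compute_rouge(ngrams_a, ngrams_b):
    set_a, set_b = set(ngrams_a), set(ngrams_b)
    if not set_a or not set_b:
        return 0.0
    overlap = len(set_a & set_b)
    if overlap == 0:
        return 0.0
    precision = overlap / len(set_a)
    recall = overlap / len(set_b)
    return 2 * precision * recall / (precision + recall)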