Code example #1
File: __init__.py Project: gal-star/translationese
 def bigrams(self):
     """Return a dictionary { ("w1", "w2"): NUMBER_OF_OCCURENCES, ... }"""
     result = {}
     for i in range(len(self.tokens()) - 1):
         bigram = (self.tokens()[i], self.tokens()[i + 1])
         sparse_dict_increment(result, bigram)
     return result
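
Every example on this page funnels counts through sparse_dict_increment, a helper defined elsewhere in the translationese project and never shown here. Its behavior is easy to infer from the call sites; a minimal sketch, assuming it simply treats missing keys as zero:

def sparse_dict_increment(d, key):
    # Increment d[key], creating the entry as 0 first if the key is new.
    # Inferred from usage on this page; the project's actual implementation
    # may differ in detail.
    d[key] = d.get(key, 0) + 1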
Code example #2
    def quantify(self, analysis):
        """Quantify usage of cohesive markers."""
        result = {}

        if analysis.lang == 'en':
            tokenized_markers = [(marker, nltk.word_tokenize(marker))
                                 for marker in COHESIVE_MARKERS]
            text = analysis.tokens()

            for i, _ in enumerate(text):
                for (marker, tokenized) in tokenized_markers:
                    if (tokenized == text[i:i + len(tokenized)]):
                        sparse_dict_increment(result, marker)

        elif analysis.lang == 'zh':
            if self.k == 0:
                markers = COHESIVE_MARKERS_ZH
            elif self.k == 1:
                markers = COHESIVE_MARKERS_CHEN_2006
            elif self.k == 2:
                markers = ["即", "也就是说"]
            elif self.k == 3:  # top 5 from Chen's list, using GainRatio from weka.
                markers = ["但是", "因为", "据说", "那么", "如果"]
            elif self.k == 4:  # adversative markers 转折词
                markers = ADVERSATIVES
            else:
                raise ValueError('unknown marker set k={}'.format(self.k))

            text = analysis.tokens()

            for i, _ in enumerate(text):
                for marker in markers:  # marker = '不过'
                    # marker can be segmented to at most len(marker) parts
                    # for j in range(1, len(marker)+1):
                    # But we assume marker can be segmented to at most 3 parts
                    for j in range(1, min(len(marker) + 1, 4)):
                        if (marker == ''.join(text[i:i + j])):
                            sparse_dict_increment(result, marker)
            ''' # old
            assert len(COHESIVE_MARKERS_ZH) == len(COHESIVE_MARKERS_ZH_SEG)

            # [('不过', ['不过']), ... ]
            tokenized_markers = \
                [(COHESIVE_MARKERS_ZH[i], COHESIVE_MARKERS_ZH_SEG[i]) \
                for i in range(len(COHESIVE_MARKERS_ZH))]

            for i, _ in enumerate(text):
                for (marker, tokenized) in tokenized_markers:
                    if (tokenized == text[i:i + len(tokenized)]):
                        sparse_dict_increment(result, marker)
            '''

        else:
            raise NotImplementedError(
                'language "{}" not implemented yet for cohesive_markers'
                .format(analysis.lang))

        pairs = [(marker, float(result[marker]) / len(text))
                 for marker in result.keys()]

        return dict(pairs)
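
The Chinese branch above assumes the word segmenter may split a multi-character marker across up to three consecutive tokens, so it re-joins windows of one to three tokens before comparing. A small worked illustration (the segmentation shown is hypothetical):

# Hypothetical segmentation of a sentence containing 也就是说 ("that is to say"):
text = ['也', '就是', '说', '...']
marker = '也就是说'
# At i = 0, j = 3 joins text[0:3] into '也就是说', which equals the marker,
# so sparse_dict_increment(result, marker) fires exactly once.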
Code example #3
    def __add_token_edges(self, token):
        if len(token) < self.k:
            return
        word_start = WORD_START + token[0:self.k]
        word_end = token[-self.k:] + WORD_END

        for key in word_start, word_end:
            sparse_dict_increment(self.histogram, key)
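
This method counts the first and last k characters of each token, wrapped in start/end sentinels. A minimal sketch of the context it assumes; the sentinel values here are guesses, not the project's actual constants:

WORD_START = '^'  # assumed sentinel value
WORD_END = '$'    # assumed sentinel value

# With self.k = 2 and token 'hello', the method records the keys
# '^he' (word start) and 'lo$' (word end) in self.histogram.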
Code example #4
File: __init__.py Project: ehahn/translationese
 def histogram(self):
     """Return a histogram of tokens in the text.
     
     >>> Analysis("Hello, hello world.").histogram()
     {'hello': 2, ',': 1, 'world': 1, '.': 1}
     """
     result = {}
     for t in self.tokens():
         sparse_dict_increment(result, t)
     return result
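
Note that comparing a dict repr in a doctest is order-sensitive: the expected output above follows CPython 3.7+ insertion order ('hello' is seen first); under older interpreters, where dict ordering was arbitrary, this style of doctest is fragile.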
Code example #5
def quantify_variant(analysis, variant):
    n = variant + 1

    d = {}

    all_pos_tags = [pos for (_, pos) in analysis.pos_tags()]

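    # nltk.util.ingrams is the older (pre-NLTK 3) generator form of nltk.util.ngrams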
    for ngram in ingrams(all_pos_tags, n):
        sparse_dict_increment(d, ngram)

    return {output_filter_ngram(k): v for (k, v) in d.items()}
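
output_filter_ngram is not defined on this page; judging from its use as a dictionary-key transform, it most likely renders an n-gram tuple as a single feature-name string. A plausible sketch (the joining convention is an assumption):

def output_filter_ngram(ngram):
    # Assumed behavior: turn a tuple such as ('DT', 'NN') into the
    # feature name 'DT_NN'. The real separator may differ.
    return '_'.join(ngram)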
Code example #6
File: __init__.py Project: ehahn/translationese
 def bigrams(self):
     """Returns a histogram of bigrams in the text.
     
     >>> Analysis("Hello hello hello world").bigrams()
     {('hello', 'hello'): 2, ('hello', 'world'): 1}
     """
     result = {}
     for i in range(len(self.tokens()) - 1):
         bigram = (self.tokens()[i], self.tokens()[i + 1])
         sparse_dict_increment(result, bigram)
     return result
Code example #7
def quantify(analysis):
    d = {}

    word_stream = (function_word_or_POS(token, tag)
                   for (token, tag) in analysis.pos_tags())
    num_tokens = float(len(analysis.pos_tags()))

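    # nltk.util.itrigrams comes from pre-3.0 NLTK; nltk.trigrams is the
    # modern equivalent (compare code example #11 below)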
    for trigram in nltk.util.itrigrams(word_stream):
        if trigram_is_functional(trigram):
            sparse_dict_increment(d, trigram)

    return {output_filter_ngram(k): (v / num_tokens) for (k, v) in d.items()}
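
Neither helper used here is shown on this page. Given the feature (contextual function words), a plausible reading is that non-function words are abstracted to their POS tag and only trigrams touching at least one real function word are kept. A sketch under those assumptions, with FUNCTION_WORDS standing in for the project's actual list:

def function_word_or_POS(token, tag, function_words=FUNCTION_WORDS):
    # Keep function words verbatim; abstract everything else to its POS tag.
    return token if token in function_words else tag

def trigram_is_functional(trigram, function_words=FUNCTION_WORDS):
    # Keep only trigrams in which at least one slot is a real function word.
    return any(w in function_words for w in trigram)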
Code example #8
def quantify_variant(analysis, variant):
    """Quantify word n-grams"""
    n = variant + 1

    d = {}

    all_words = [word for (word, _) in analysis.pos_tags()]

    for ngram in ngrams(all_words, n):
        sparse_dict_increment(d, ngram)

    return {output_filter_ngram(k): v
            for (k, v) in d.items()}  # unnormalized counts
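
Here variant simply selects the n-gram order: variants 0, 1, 2 yield word unigrams, bigrams, trigrams. For instance:

# With variant = 1 and pos_tags() yielding
# [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')],
# all_words is ['the', 'cat', 'sat'] and d ends up counting
# ('the', 'cat') and ('cat', 'sat') once each.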
Code example #9
def quantify(analysis):
    assert isinstance(analysis, translationese.Analysis)

    result = {}

    for sentence in analysis.tokenized_sentences():
        if len(sentence) < 6:
            # Skip sentences with fewer than 6 tokens
            # (i.e. fewer than 5 words plus the sentence-final period)
            continue
        for position_name, position in POSITION_NAMES.items():
            key = "%s %s" % (position_name, sentence[position])
            sparse_dict_increment(result, key)

    return result
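
POSITION_NAMES is not shown here; it evidently maps a position label to a sentence index, and the length check above keeps the indices from colliding on short sentences. A hypothetical mapping consistent with that check (labels and indices are assumptions, not the project's constants):

POSITION_NAMES = {
    'first': 0,
    'second': 1,
    'penultimate': -3,
    'last': -2,  # assuming index -1 holds the sentence-final period
}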
Code example #10
def quantify(analysis):
    """Quantify usage of cohesive markers."""
    result = {}
    
    tokenized_markers = [(marker, nltk.word_tokenize(marker))
                         for marker in COHESIVE_MARKERS]
    text = analysis.tokens()
    
    for i, _ in enumerate(text):
        for (marker, tokenized) in tokenized_markers:
            if tokenized == text[i:i + len(tokenized)]:
                sparse_dict_increment(result, marker)

    pairs = [(marker, float(result[marker]) / len(text))
             for marker in result.keys()]
    
    return dict(pairs)
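
A hedged usage sketch of this function; the stub below stands in for translationese.Analysis and is not the project's real API:

COHESIVE_MARKERS = ['however', 'moreover']  # illustrative subset

class StubAnalysis(object):
    # Minimal stand-in for translationese.Analysis (illustrative only).
    def tokens(self):
        return ['however', ',', 'this', 'text', 'has', 'exactly',
                'ten', 'tokens', 'in', 'total']

# quantify(StubAnalysis()) finds 'however' once among 10 tokens
# and returns {'however': 0.1}.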
Code example #11
def quantify(analysis):
    """Quantify contextual function words."""
    if analysis.lang == 'en':
        from translationese.function_words import FUNCTION_WORDS
    elif analysis.lang == 'zh':
        from translationese.function_words import FUNCTION_WORDS_ZH as FUNCTION_WORDS
    else:
        raise NotImplementedError(
            'language "{}" not implemented yet for '
            'contextual_function_words'.format(analysis.lang))

    d = {}

    word_stream = (function_word_or_POS(token, tag, FUNCTION_WORDS) for (token, tag)
                   in analysis.pos_tags())
    num_tokens = float(len(analysis.pos_tags()))

    for trigram in nltk.trigrams(word_stream):
        if trigram_is_functional(trigram, FUNCTION_WORDS):
            sparse_dict_increment(d, trigram)

    return {output_filter_ngram(k): (v / num_tokens) for (k, v) in d.items()}
Code example #12
def quantify_variant(analysis, variant):
    """Quantify POS n-grams"""
    if variant <= 2:
        n = variant + 1

        d = {}

        all_pos_tags = [pos for (_, pos) in analysis.pos_tags()]

        for ngram in ngrams(all_pos_tags, n):
            sparse_dict_increment(d, ngram)

        return {output_filter_ngram(k): v
                for (k, v) in d.items()}  # unnormalized counts
    elif variant == 3:
        n = 3
        d = {}
        all_pos_tags = [pos for (_, pos) in analysis.pos_tags()]
        for ngram in ngrams(all_pos_tags, n):
            sparse_dict_increment(d, ngram)
        return {output_filter_ngram(k): v for (k, v) in d.items()
                if output_filter_ngram(k) in MY_POS}  # unnormalized counts
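
MY_POS is likewise not shown; from the variant-3 filter it must be a pre-selected set of POS-trigram feature names. A hypothetical stand-in, matching the output_filter_ngram sketch given under code example #5:

# Hypothetical whitelist (illustrative values only); variant 3 keeps a
# trigram only if its rendered feature name appears here.
MY_POS = {'DT_JJ_NN', 'NN_IN_NN'}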
Code example #13
 def __add_token_ngrams(self, token):
     for current_ngram in ingrams(token, self.k + 1):
         sparse_dict_increment(self.histogram, ''.join(current_ngram))
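
Because ingrams iterates over the token string itself, the unit here is the character: self.k = 1 counts character bigrams, k = 2 character trigrams, and so on. A runnable illustration using the modern NLTK name:

from nltk.util import ngrams  # current name for the old ingrams

histogram = {}
for gram in ngrams('cat', 2):  # k = 1, i.e. character bigrams
    sparse_dict_increment(histogram, ''.join(gram))  # sketch from example #1
# histogram == {'ca': 1, 'at': 1}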
Code example #14
File: __init__.py Project: gal-star/translationese
 def histogram(self):
     """Return a dictionary { "TOKEN": NUMBER_OF_OCCURENCES, ... }"""
     result = {}
     for t in self.tokens():
         sparse_dict_increment(result, t)
     return result
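
Taken together, and assuming the Analysis API used throughout this page, a hypothetical end-to-end session would look like:

# analysis = translationese.Analysis("Hello, hello world.")
# analysis.histogram()  -> {'hello': 2, ',': 1, 'world': 1, '.': 1}
# analysis.bigrams()    -> {('hello', ','): 1, (',', 'hello'): 1,
#                           ('hello', 'world'): 1, ('world', '.'): 1}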