Ejemplo n.º 1
0
def skip_1(words, index):
    '''Context features from the immediate neighbours of words[index].

    For a sequence ``w1 w w2`` with ``w`` at ``index``, the result holds the
    keys ``'before_<w1>'`` and ``'after_<w2>'``, each mapped to 1. Every
    neighbour is first passed through ``txt.removeToken`` (defined outside
    this block) and converted with ``str`` before being appended to the
    prefix.

    words -- sequence of tokens; a falsy value (empty, None) yields {}.
    index -- position of the centre word inside ``words``.

    Returns a new dict; a neighbour is only added when its position passes
    the bounds check on that side (``>= 0`` before, ``< len(words)`` after).
    '''
    features = {}
    if not words:
        return features

    left = index - 1
    right = index + 1

    # Word directly before the centre word.
    if left >= 0:
        features['before_' + str(txt.removeToken(words[left]))] = 1
    # Word directly after the centre word.
    if right < len(words):
        features['after_' + str(txt.removeToken(words[right]))] = 1
    return features
Ejemplo n.º 2
0
def skip_2(words, index):
    '''Context features from the neighbours up to two positions away.

    For ``w1 w2 w w3 w4`` with ``w`` at ``index``, the result holds
    ``'before_<w1>'``, ``'before_<w2>'``, ``'after_<w3>'`` and
    ``'after_<w4>'``, each mapped to 1. The distance-1 neighbours come from
    ``skip_1``; the distance-2 neighbours are added here after being passed
    through ``txt.removeToken`` (defined outside this block) and ``str``.

    words -- sequence of tokens; a falsy value (empty, None) yields {}.
    index -- position of the centre word inside ``words``.

    Returns a new dict. Keys are not distance-specific, so the same word at
    distance 1 and 2 on one side collapses into a single entry.
    '''
    if not words:
        return {}

    # Start from the -1 / +1 features.
    features = {}
    features.update(skip_1(words, index))

    # Then add the -2 / +2 features.
    far_left = index - 2
    far_right = index + 2
    if far_left >= 0:
        features['before_' + str(txt.removeToken(words[far_left]))] = 1
    if far_right < len(words):
        features['after_' + str(txt.removeToken(words[far_right]))] = 1
    return features
Ejemplo n.º 3
0
def gram_skip_2(words, index, n=4):
    '''Character n-grams over the joined +/-2 context of words[index].

    The neighbours at offsets -2, -1, +1 and +2 (the word at ``index``
    itself is skipped) are each passed through ``txt.removeToken`` (defined
    outside this block), concatenated in that order with no separator, and
    every contiguous substring of length ``n`` of the joined string becomes
    a key mapped to 1.

    words -- sequence of tokens; a falsy value (empty, None) yields {}.
    index -- position of the centre word inside ``words``.
    n     -- n-gram length in characters (default 4); values below 1
             yield {} because no n-gram of that length exists.

    Returns a new dict. It is empty when the joined context is shorter
    than ``n``.
    '''
    dict_ = {}
    if not words or n < 1:
        return dict_

    context = []
    for offset in (-2, -1, 1, 2):
        position = index + offset
        # Only the lower bound is checked on the left side and only the
        # upper bound on the right side.
        if offset < 0:
            in_range = position >= 0
        else:
            in_range = position < len(words)
        if in_range:
            context.append(txt.removeToken(words[position]))

    joined = ''.join(context)
    # A string of length L has L - n + 1 substrings of length n; the "+ 1"
    # keeps the final n-gram (and the only one when L == n).
    for start in range(len(joined) - n + 1):
        dict_[joined[start:start + n]] = 1
    return dict_