Python token_isAllCharacter Exemples, CRF_labeling.feature_token_crf.token_isAllCharacter Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : load_crf.py Projet : hvdthong/Transportation_NEC

def load_all_dic_token_bef_road_busstop(list_line, command):
    """Collect the words found immediately before road / bus-stop labels.

    ``list_line`` is read in records of three consecutive lines: the first
    line of a record holds tab-separated tokens and the second line holds the
    tab-separated integer labels (the third line is not read here).  For every
    run of consecutive target labels, the token just before the run is
    lower-cased and, when ``token_isAllCharacter`` accepts it, passed through
    ``connect_token`` and added to the text that is tokenized and counted.

    The ranked tokens are also printed (token with count, then token alone,
    then the number of distinct tokens), as the original implementation did.

    :param list_line: sequence of text lines laid out as described above.
    :param command: ``'road'`` (label 2) or ``'busstop'`` (label 3).
    :return: list of the distinct tokens in ``FreqDist.most_common`` order.
    :raises ValueError: if ``command`` is not a supported value.
    """
    target_by_command = {'road': 2, 'busstop': 3}
    if command not in target_by_command:
        # Previously an unknown command left the scan index untouched, so the
        # scan loop never terminated.  Fail fast instead.
        raise ValueError('unsupported command: %r' % (command,))
    target = target_by_command[command]

    def _as_int(raw):
        # A label that is not an integer can never match the target; it is
        # simply stepped over, as the original ``except ValueError`` did.
        try:
            return int(raw)
        except ValueError:
            return None

    pieces = []
    # Stop at len - 1 so that a trailing record without a label line is
    # skipped instead of raising IndexError.
    for i in range(0, len(list_line) - 1, 3):
        tokens = list_line[i].strip().split('\t')
        labels = [_as_int(raw) for raw in list_line[i + 1].strip().split('\t')]
        n_labels = len(labels)

        k = 0
        while k < n_labels:
            if labels[k] != target:
                k += 1
                continue

            # Start of a labeled run: remember the word in front of it.
            if k > 0:
                token_bef = tokens[k - 1].lower()
                if token_isAllCharacter(token_bef) is True:
                    pieces.append(connect_token(token_bef) + ' ')

            # Skip the remainder of the run so it is only counted once.
            k += 1
            while k < n_labels and labels[k] == target:
                k += 1

    text = ''.join(pieces)
    fdist = FreqDist()
    fdist.update(word_tokenize(str(text)))
    ranked = fdist.most_common(len(fdist))

    # Single-argument print calls behave the same whether ``print`` is a
    # statement or a function; the format reproduces the old output of
    # ``print word, '\t', count``.
    for word, count in ranked:
        print('%s \t%s' % (word, count))

    list_return = []
    for word, _count in ranked:
        list_return.append(word)
        print(word)
    print(len(fdist))
    return list_return

Exemple #2

0

Afficher le fichier

Fichier : load_crf.py Projet : hvdthong/Transportation_NEC

def load_all_dic_token_bef_aft_svc(list_line, command):
    """Collect the words found right before or right after bus-service labels.

    ``list_line`` is read in records of three consecutive lines: the first
    line of a record holds tab-separated tokens and the second line holds the
    tab-separated integer labels (the third line is not read here).  A run of
    consecutive labels equal to 1 marks a bus service.  Depending on
    ``command`` the token before the run (``'bef_svc'``) or the token after
    the run (``'aft_svc'``) is lower-cased and, when ``token_isAllCharacter``
    accepts it, passed through ``connect_token`` and added to the text that
    is tokenized and counted.

    The ranked tokens are also printed (token with count, then token alone,
    then the number of distinct tokens), as the original implementation did.

    :param list_line: sequence of text lines laid out as described above.
    :param command: ``'bef_svc'`` or ``'aft_svc'``.
    :return: list of the distinct tokens in ``FreqDist.most_common`` order.
    :raises ValueError: if ``command`` is not a supported value.
    """
    if command not in ('bef_svc', 'aft_svc'):
        # Previously an unknown command left the scan index untouched, so the
        # scan loop never terminated.  Fail fast instead.
        raise ValueError('unsupported command: %r' % (command,))
    svc_label = 1
    want_before = command == 'bef_svc'

    def _as_int(raw):
        # A label that is not an integer can never match the service label.
        try:
            return int(raw)
        except ValueError:
            return None

    pieces = []
    # Stop at len - 1 so that a trailing record without a label line is
    # skipped instead of raising IndexError.
    for i in range(0, len(list_line) - 1, 3):
        tokens = list_line[i].strip().split('\t')
        labels = [_as_int(raw) for raw in list_line[i + 1].strip().split('\t')]
        n_labels = len(labels)

        k = 0
        while k < n_labels:
            if labels[k] != svc_label:
                k += 1
                continue

            # labels[k] opens a service run; find where the run ends.
            run_start = k
            k += 1
            while k < n_labels and labels[k] == svc_label:
                k += 1

            if want_before:
                if run_start > 0:
                    token_bef = tokens[run_start - 1].lower()
                    if token_isAllCharacter(token_bef) is True:
                        pieces.append(connect_token(token_bef) + ' ')
            # The token after the run is skipped when its own label is not an
            # integer, matching the original's ValueError path.
            # NOTE(review): the ``n_labels - 1`` bound also excludes the case
            # where the token after the run is the last one of the record;
            # kept as in the original -- confirm whether that is intended.
            elif k < n_labels - 1 and labels[k] is not None:
                token_aft = tokens[k].lower()
                if token_isAllCharacter(token_aft) is True:
                    pieces.append(connect_token(token_aft) + ' ')

    text = ''.join(pieces)
    fdist = FreqDist()
    fdist.update(word_tokenize(str(text)))
    ranked = fdist.most_common(len(fdist))

    # Single-argument print calls behave the same whether ``print`` is a
    # statement or a function; the format reproduces the old output of
    # ``print word, '\t', count``.
    for word, count in ranked:
        print('%s \t%s' % (word, count))

    list_return = []
    for word, _count in ranked:
        list_return.append(word)
        print(word)
    print(len(fdist))
    return list_return

Exemple #3

0

Afficher le fichier

Fichier : feature_crf_all.py Projet : hvdthong/Transportation_NEC

__author__ = 'vdthoang'