def road_stop_token(list_line, command, stop_en):
    """Collect tokens from raw data lines, print them lower-cased, and return them.

    The way each line is parsed depends on ``command``:

    * ``'abbr'``    - print the first tab-separated column of every line;
      nothing is collected.
    * ``'road'``    - split each line on ``';'`` and then on whitespace; keep
      each filtered token once, unless it is in ``stop_en`` or ``is_int``
      reports it as an integer.
    * ``'busstop'`` - skip the first (header) line, then tokenise the second
      tab-separated column the same way.
    * ``'bussvc'``  - skip the first (header) line, then keep the stripped
      first tab-separated column of every line (duplicates are kept).

    Any other command collects nothing.

    :param list_line: sequence of raw text lines.
    :param command: one of the command strings listed above.
    :param stop_en: container of stop words to leave out.
    :return: the collected tokens in first-seen order.  (Earlier versions only
        printed the result; callers that ignore the return value are
        unaffected.)
    """
    list_token = []
    # Mirrors list_token so the duplicate test is O(1) instead of a list scan.
    seen = set()

    for index, line in enumerate(list_line):
        if command == 'abbr':  # no need to use
            print(line.split('\t')[0])

        elif command == 'road':
            for element in line.split(';'):
                for each in element.split():
                    each = filter_token(each)
                    if each in seen or each in stop_en:
                        continue
                    if is_int(each) is False:
                        seen.add(each)
                        list_token.append(each)

        elif command == 'busstop':
            if index == 0:
                continue  # first line is the column header
            columns = line.split('\t')
            if len(columns) < 2:
                continue  # blank or malformed line: there is no name column
            for each in columns[1].split():
                # Strip BEFORE the duplicate/empty checks so that the value
                # tested is exactly the value stored.
                token = filter_token(each.strip()).strip()
                # NOTE(review): the stop-word and is_int tests below look at
                # the raw token, whereas the 'road' branch tests the filtered
                # one - confirm which is intended.
                if not token or token in seen or each in stop_en:
                    continue
                if is_int(each) is False:
                    seen.add(token)
                    list_token.append(token)

        elif command == 'bussvc':
            if index == 0:
                continue  # first line is the column header
            service = line.split('\t')[0].strip()
            if service:  # blank lines must not turn into empty tokens
                list_token.append(service)

    # Single-argument print(...) behaves the same as a statement (Python 2)
    # and as a function (Python 3).
    for value in list_token:
        print(value.lower())
    print('Total length of list: %i' % len(list_token))
    return list_token
def token_aft(list_line, command):
    """Print how often each token appears directly after a labelled token.

    ``list_line`` is read in records of three lines: a tab-separated sentence,
    a tab-separated label line (one label per token), and a third line that is
    ignored.  Only ``command == 'svc'`` is handled: for every token whose label
    is ``1`` (bus service) the next token of the sentence is lower-cased,
    passed through ``connect_token`` and collected unless ``is_int`` reports it
    as an integer.  No stemming is applied.

    The distinct collected tokens are printed most frequent first, followed by
    the space-separated text they were counted from.

    :param list_line: sequence of raw text lines laid out as described above.
    :param command: entity type to analyse; anything but ``'svc'`` collects
        nothing.
    """
    words = []

    if command == 'svc':
        # Stop one short of the end so that a trailing sentence line without a
        # label line is ignored instead of raising IndexError.
        for i in range(0, len(list_line) - 1, 3):
            split_first = list_line[i].strip().split('\t')  # sentence tokens
            split_second = list_line[i + 1].strip().split('\t')  # one label per token
            # A "next token" exists only where both lines are long enough.
            limit = min(len(split_first), len(split_second))

            for k, raw_label in enumerate(split_second):
                try:
                    label = int(raw_label)
                except ValueError:
                    continue  # not a numeric label (e.g. a blank line)
                # A service in the last position has no token after it.
                if label != 1 or k + 1 >= limit:
                    continue
                stem_word = connect_token(split_first[k + 1].lower())
                if is_int(stem_word) is False:
                    words.append(stem_word)

    # Every collected word is followed by a single space, as before.
    text = ''.join(word + ' ' for word in words)

    fdist = FreqDist()
    fdist.update(word_tokenize(str(text)))
    # Single-argument print(...) works as a statement (Python 2) and as a
    # function (Python 3).
    for value in fdist.most_common(len(fdist)):
        print(value[0])

    print(text)
def _token_bef_labels(raw_labels):
    """Convert label strings to ints, using None where a label is not an integer."""
    labels = []
    for raw in raw_labels:
        try:
            labels.append(int(raw))
        except ValueError:
            labels.append(None)
    return labels


def token_bef(list_line, command):
    """Print how often each token appears directly before a labelled token.

    ``list_line`` is read in records of three lines: a tab-separated sentence,
    a tab-separated label line (one label per token), and a third line that is
    ignored.  ``command`` selects the label to look for:

    * ``'svc'``     - label ``1``; the token before EVERY labelled token is
      taken.
    * ``'road'``    - label ``2``; only the token before the FIRST token of
      each run of consecutive labelled tokens is taken.
    * ``'busstop'`` - label ``3``; same run handling as ``'road'``.

    Each taken token is lower-cased, passed through ``connect_token`` and
    collected unless ``is_int`` reports it as an integer.  No stemming is
    applied.  Labels that are not integers are skipped.

    The distinct collected tokens are printed with their counts, most frequent
    first, followed by the space-separated text they were counted from.

    :param list_line: sequence of raw text lines laid out as described above.
    :param command: ``'svc'``, ``'road'`` or ``'busstop'``; any other value
        collects nothing.
    """
    # Label value that marks each supported entity type.
    targets = {'svc': 1, 'road': 2, 'busstop': 3}
    target = targets.get(command)
    words = []

    if target is not None:
        # Stop one short of the end so that a trailing sentence line without a
        # label line is ignored instead of raising IndexError.
        for i in range(0, len(list_line) - 1, 3):
            split_first = list_line[i].strip().split('\t')  # sentence tokens
            labels = _token_bef_labels(list_line[i + 1].strip().split('\t'))

            for k, label in enumerate(labels):
                # k == 0: nothing precedes the first token.
                # k > len(split_first): the sentence has no token at k - 1.
                if label != target or k == 0 or k > len(split_first):
                    continue
                # Multi-token road / bus-stop names count once, at their start.
                if command != 'svc' and labels[k - 1] == target:
                    continue
                stem_word = connect_token(split_first[k - 1].lower())
                if is_int(stem_word) is False:
                    words.append(stem_word)

    # Every collected word is followed by a single space, as before.
    text = ''.join(word + ' ' for word in words)

    fdist = FreqDist()
    fdist.update(word_tokenize(str(text)))
    for value in fdist.most_common(len(fdist)):
        # Same bytes as the former statement `print a, '\t', b`, written so it
        # also runs where print is a function.
        print('%s \t%s' % (value[0], value[1]))

    print(text)
# Example #4
# 0
def _roadstop_parse_labels(raw_labels):
    """Convert label strings to ints, using None where a label is not an integer."""
    labels = []
    for raw in raw_labels:
        try:
            labels.append(int(raw))
        except ValueError:
            labels.append(None)
    return labels


def _roadstop_label_runs(labels, target):
    """Yield (start, end) for each run of consecutive ``target`` labels; end is exclusive."""
    k = 0
    count = len(labels)
    while k < count:
        if labels[k] != target:
            k += 1
            continue
        start = k
        while k < count and labels[k] == target:
            k += 1
        yield start, k


def check_bef_aft_roadBusStop(list_line, command):
    """Print how often each token appears right before or after a road / bus-stop name.

    ``list_line`` is read in records of three lines: a tab-separated sentence,
    a tab-separated label line (one label per token), and a third line that is
    ignored.  A name is a run of consecutive tokens carrying the same label
    (``2`` for road, ``3`` for bus stop).  ``command`` selects what is counted:

    * ``'bef_road'`` / ``'bef_busstop'`` - the token just before each run.
    * ``'aft_road'`` / ``'aft_busstop'`` - the token just after each run,
      unless ``is_int`` reports that token as an integer.  A following token
      whose own label is not an integer is not counted.

    Counted tokens are lower-cased and passed through ``connect_token``.  The
    distinct tokens are printed with their counts, most frequent first,
    followed by the space-separated text they were counted from.

    :param list_line: sequence of raw text lines laid out as described above.
    :param command: one of the four command strings above; any other value
        collects nothing (it used to loop forever).
    """
    # command -> (which neighbour to take, label value of the entity)
    modes = {
        'bef_road': ('bef', 2),
        'aft_road': ('aft', 2),
        'bef_busstop': ('bef', 3),
        'aft_busstop': ('aft', 3),
    }
    mode = modes.get(command)
    words = []

    if mode is not None:
        side, target = mode
        # Stop one short of the end so that a trailing sentence line without a
        # label line is ignored instead of raising IndexError.
        for i in range(0, len(list_line) - 1, 3):
            split_first = list_line[i].strip().split('\t')  # sentence tokens
            labels = _roadstop_parse_labels(list_line[i + 1].strip().split('\t'))

            for start, end in _roadstop_label_runs(labels, target):
                if side == 'bef':
                    # Needs a token at start - 1 that exists in the sentence.
                    if 0 < start <= len(split_first):
                        words.append(connect_token(split_first[start - 1].lower()))
                    continue

                # side == 'aft': `end` already indexes the token after the
                # run, so it is valid up to and including the last position.
                if end >= len(labels) or end >= len(split_first):
                    continue
                if labels[end] is None:
                    continue
                if is_int(split_first[end]) is False:
                    words.append(connect_token(split_first[end].lower()))

    # Every collected word is followed by a single space, as before.
    text = ''.join(word + ' ' for word in words)

    fdist = FreqDist()
    fdist.update(word_tokenize(str(text)))
    for value in fdist.most_common(len(fdist)):
        # Same bytes as the former statement `print a, '\t', b`, written so it
        # also runs where print is a function.
        print('%s \t%s' % (value[0], value[1]))

    print(text)