def load_all_dic_token_bef_road_busstop(list_line, command):
    # load all the word of token before and after labeling, note that we do not consider if this token is a
    # number. In fact, we only consider if token contain all characters
    # Using only for "road" and "busstop"
    """Collect the distinct words that appear immediately BEFORE a labeled
    road token (label 2) or bus-stop token (label 3).

    The input is consumed in groups of three lines: line i holds the
    tab-separated tokens, line i+1 holds the tab-separated integer labels
    for those tokens, and the third line of each group is ignored here.
    Only tokens consisting entirely of characters are kept (delegated to
    token_isAllCharacter; presumably this filters out numbers -- confirm
    against its definition, which is not visible in this file).

    :param list_line: raw input lines in repeating (tokens, labels,
                      ignored) triples -- assumes len(list_line) provides
                      a label line for every token line; TODO confirm
    :param command: 'road' to harvest words before label 2,
                    'busstop' to harvest words before label 3
    :return: list of the unique collected words, most frequent first
             (frequency table is also printed as debug output)
    """
    text = ''
    # Walk the input three lines at a time: tokens line, labels line,
    # and a third line that this function skips.
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:  # always true given the range step of 3
            split_first = list_line[i].strip().split('\t')  # token row
        j = i + 1
        if j % 3 == 1:  # always true given the range step of 3
            split_second = list_line[j].strip().split('\t')  # label row
        k = 0
        # Scan the label row; k is advanced manually so that a run of
        # identical labels (a multi-token name) is skipped as one unit.
        while True:
            if k >= len(split_second):
                break
            if command == 'road':
                # get the token before labeling for road
                try:
                    if int(split_second[k]) == 2:
                        # detect this is a road => get the token before it
                        if k > 0:
                            token_bef = split_first[k - 1].lower()
                            if token_isAllCharacter(token_bef) is True:
                                text = text + connect_token(token_bef) + ' '  # take the word before
                        # Skip to the end of this run of label-2 tokens;
                        # k ends at the first non-2 label (or at the end).
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 2:
                                    break
                    else:
                        # Current token is not a road label: move on.
                        k += 1
                except ValueError:
                    # Non-numeric label cell: just advance past it.
                    k += 1
            if command == 'busstop':
                # get the token before labeling for road
                try:
                    if int(split_second[k]) == 3:
                        # detect this is a road => get the token before it
                        if k > 0:
                            token_bef = split_first[k - 1].lower()
                            if token_isAllCharacter(token_bef) is True:
                                text = text + connect_token(token_bef) + ' '  # take the word before
                        # Skip to the end of this run of label-3 tokens.
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 3:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1
    # Frequency-count the harvested words; the prints below are debug
    # output (Python 2 print statements).
    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]
    list_return = list()
    for value in fdist.most_common(len(fdist)):
        list_return.append(value[0])
        print value[0]
    print len(fdist)
    return list_return
def load_all_dic_token_bef_aft_svc(list_line, command):
    # loading all token before and after for bus service
    # Using only for bus service, because for bus service we not only focus on the token before, but also the token
    # after labeling
    """Collect the distinct words immediately BEFORE or AFTER a labeled
    bus-service token (label 1).

    The input is consumed in groups of three lines: line i holds the
    tab-separated tokens, line i+1 holds the tab-separated integer labels,
    and the third line of each group is ignored. Only all-character tokens
    are kept (delegated to token_isAllCharacter).

    :param list_line: raw input lines in repeating (tokens, labels,
                      ignored) triples -- assumes a label line exists for
                      every token line; TODO confirm
    :param command: 'bef_svc' to harvest the word before each label-1 run,
                    'aft_svc' to harvest the word after each label-1 run
    :return: list of the unique collected words, most frequent first
             (frequency table is also printed as debug output)
    """
    text = ''
    # Walk the input three lines at a time: tokens line, labels line,
    # and a third line that this function skips.
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:  # always true given the range step of 3
            split_first = list_line[i].strip().split('\t')  # token row
        j = i + 1
        if j % 3 == 1:  # always true given the range step of 3
            split_second = list_line[j].strip().split('\t')  # label row
        k = 0
        # Scan the label row; k is advanced manually so that a run of
        # label-1 tokens (a multi-token service name) is skipped as one unit.
        while True:
            if k >= len(split_second):
                break
            if command == 'bef_svc':
                # get the token before labeling for bus svc
                try:
                    if int(split_second[k]) == 1:
                        # detect this is a svc => get the token before it
                        if k > 0:
                            token_bef = split_first[k - 1].lower()
                            if token_isAllCharacter(token_bef) is True:
                                text = text + connect_token(token_bef) + ' '  # take the word before
                        # Skip to the end of this run of label-1 tokens.
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 1:
                                    break
                    else:
                        # Current token is not a svc label: move on.
                        k += 1
                except ValueError:
                    # Non-numeric label cell: just advance past it.
                    k += 1
            if command == 'aft_svc':
                try:
                    if int(split_second[k]) == 1:
                        # take bus svc
                        # Skip past the label-1 run; k ends at the first
                        # non-1 label (or at len(split_second)).
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 1:
                                    break
                        # NOTE(review): this guard excludes the LAST column
                        # (k == len - 1), so a trailing word after a label
                        # run is never collected; looks like a possible
                        # off-by-one (k < len(split_second)) -- confirm
                        # whether the last cell is meaningful in the data.
                        if k < len(split_second) - 1:
                            # take the token after the label
                            token_aft = split_first[k].lower()
                            if token_isAllCharacter(token_aft) is True:
                                text = text + connect_token(token_aft) + ' '
                    else:
                        # Current token is not a svc label: move on.
                        k += 1
                except ValueError:
                    # Non-numeric label cell: just advance past it.
                    k += 1
    # Frequency-count the harvested words; the prints below are debug
    # output (Python 2 print statements).
    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]
    list_return = list()
    for value in fdist.most_common(len(fdist)):
        list_return.append(value[0])
        print value[0]
    print len(fdist)
    return list_return
# Module author metadata.
__author__ = 'vdthoang'