def road_stop_token(list_line, command, stop_en):
    """Print a token list extracted from raw resource lines.

    The behaviour depends on ``command``:

    * ``'abbr'``    - print the first tab-separated field of every line.
    * ``'road'``    - split every line on ``';'`` and then on whitespace, pass
      each token through ``filter_token`` and collect the distinct results
      that are neither in ``stop_en`` nor integers (per ``is_int``).
    * ``'busstop'`` - skip the first line, then collect the distinct filtered
      tokens found in the second tab-separated field of each line.
    * ``'bussvc'``  - skip the first line, then collect the stripped first
      tab-separated field of each line (duplicates are kept).

    Any other command collects nothing.  The collected tokens are printed
    lower-cased, one per line and in first-seen order, followed by a
    ``Total length of list`` summary line.

    :param list_line: iterable of text lines.
    :param command: one of ``'abbr'``, ``'road'``, ``'busstop'``, ``'bussvc'``.
    :param stop_en: container of tokens to exclude (used with ``in``).
    :return: None; the result is written to standard output.
    """
    list_token = []
    # Mirrors the content of list_token so the duplicate check is a hash
    # lookup instead of a linear scan over everything collected so far.
    seen = set()
    cnt = 0
    for line in list_line:
        if command == 'abbr':
            print(line.split('\t')[0])
        elif command == 'road':
            for element in line.split(';'):
                for each in element.split():
                    each = filter_token(each)
                    if each in seen or each in stop_en:
                        continue
                    if is_int(each) is False:
                        list_token.append(each)
                        seen.add(each)
        elif command == 'busstop':
            cnt += 1
            if cnt == 1:
                continue  # the first line is not data
            split_line = line.split('\t')
            if len(split_line) < 2:
                # Blank or malformed row: there is no second column to read.
                continue
            for each in split_line[1].split():
                filter_each = filter_token(each.strip())
                # NOTE(review): unlike the 'road' branch, the stop-word and
                # integer checks look at the raw token rather than the
                # filtered one - kept as-is, confirm this is intended.
                if (filter_each not in seen) and (len(filter_each) > 0) \
                        and (each not in stop_en):
                    if is_int(each) is False:
                        cleaned = filter_each.strip()
                        list_token.append(cleaned)
                        seen.add(cleaned)
        elif command == 'bussvc':
            cnt += 1
            if cnt > 1:  # the first line is not data
                service = line.split('\t')[0].strip()
                list_token.append(service)
                seen.add(service)

    for value in list_token:
        print(value.lower())
    print('Total length of list: %i' % len(list_token))
def token_aft(list_line, command):
    """Print the tokens that directly follow a bus-service label.

    ``list_line`` is read in groups of three lines: the first line of a group
    holds the tab-separated tokens of a sentence, the second holds one
    tab-separated integer label per token, and the third is ignored.  A
    trailing group that lacks its label line is skipped.

    Only ``command == 'svc'`` is implemented: for every token labelled ``1``
    that is not the last label of its sentence, the next token is lower-cased,
    passed through ``connect_token`` and kept unless ``is_int`` reports it as
    an integer.  Labels that cannot be parsed as integers are ignored.  Any
    other command collects nothing.

    The distinct collected tokens are printed most frequent first, one per
    line, followed by the space-joined collected text.

    :param list_line: sequence of text lines laid out as described above.
    :param command: label family to inspect; only ``'svc'`` has an effect.
    :return: None; the result is written to standard output.
    """
    following = []
    if command == 'svc':
        # Stopping at len - 1 guarantees the label line (start + 1) exists.
        for start in range(0, len(list_line) - 1, 3):
            words = list_line[start].strip().split('\t')
            labels = list_line[start + 1].strip().split('\t')
            last = len(labels) - 1
            for k, raw in enumerate(labels):
                try:
                    is_svc = int(raw) == 1
                except ValueError:
                    is_svc = False  # blank or non-numeric label
                # A label in last position has no following token to take.
                if is_svc and k < last:
                    word = connect_token(words[k + 1].lower())
                    if is_int(word) is False:
                        following.append(word)

    # Every collected word is followed by a single space.
    text = ''.join(word + ' ' for word in following)

    fdist = FreqDist()
    fdist.update(word_tokenize(str(text)))
    for value in fdist.most_common(len(fdist)):
        print(value[0])
    print(text)
def token_bef(list_line, command):
    """Print the tokens that directly precede a labelled entity.

    ``list_line`` is read in groups of three lines: the first line of a group
    holds the tab-separated tokens of a sentence, the second holds one
    tab-separated integer label per token, and the third is ignored.  A
    trailing group that lacks its label line is skipped.

    ``command`` selects the label to look for:

    * ``'svc'``     - label ``1`` (bus service); the token before *every*
      position carrying the label is taken.
    * ``'road'``    - label ``2``; only the token before the first position
      of each run of consecutive ``2`` labels is taken.
    * ``'busstop'`` - label ``3``; same run handling as ``'road'``.

    Any other command collects nothing.  Labels that cannot be parsed as
    integers never match and therefore also end a run.  A taken token is
    lower-cased, passed through ``connect_token`` and kept unless ``is_int``
    reports it as an integer.

    Each distinct collected token is printed with its frequency, most
    frequent first, followed by the space-joined collected text.

    :param list_line: sequence of text lines laid out as described above.
    :param command: ``'svc'``, ``'road'`` or ``'busstop'``.
    :return: None; the result is written to standard output.
    """
    label_for_command = {'svc': 1, 'road': 2, 'busstop': 3}
    target = label_for_command.get(command)
    every_position = command == 'svc'

    preceding = []
    if target is not None:
        # Stopping at len - 1 guarantees the label line (start + 1) exists.
        for start in range(0, len(list_line) - 1, 3):
            words = list_line[start].strip().split('\t')
            labels = []
            for raw in list_line[start + 1].strip().split('\t'):
                try:
                    labels.append(int(raw))
                except ValueError:
                    labels.append(None)  # blank or non-numeric label

            # Position 0 has no preceding token, so start at 1.
            for k in range(1, len(labels)):
                if labels[k] != target:
                    continue
                if not every_position and labels[k - 1] == target:
                    continue  # inside a run; only its first position counts
                word = connect_token(words[k - 1].lower())
                if is_int(word) is False:
                    preceding.append(word)

    # Every collected word is followed by a single space.
    text = ''.join(word + ' ' for word in preceding)

    fdist = FreqDist()
    fdist.update(word_tokenize(str(text)))
    for value in fdist.most_common(len(fdist)):
        # Same bytes as the Python 2 statement ``print a, '\t', b``, which
        # emits no separating space after an item that ends with a tab.
        print('%s \t%s' % (value[0], value[1]))
    print(text)
def check_bef_aft_roadBusStop(list_line, command):
    """Print the tokens found just before or just after road / bus-stop runs.

    ``list_line`` is read in groups of three lines: the first line of a group
    holds the tab-separated tokens of a sentence, the second holds one
    tab-separated integer label per token, and the third is ignored.  A
    trailing group that lacks its label line is skipped.

    A *run* is a maximal sequence of consecutive positions carrying the
    target label (``2`` for road, ``3`` for bus stop).  Labels that cannot be
    parsed as integers never match and therefore end a run.

    * ``'bef_road'`` / ``'bef_busstop'`` - for each run that does not start
      the sentence, take the token right before it.
    * ``'aft_road'`` / ``'aft_busstop'`` - for each run, take the token right
      after it unless ``is_int`` reports that raw token as an integer.

    Taken tokens are lower-cased and passed through ``connect_token``.  Each
    distinct collected token is printed with its frequency, most frequent
    first, followed by the space-joined collected text.

    :param list_line: sequence of text lines laid out as described above.
    :param command: ``'bef_road'``, ``'aft_road'``, ``'bef_busstop'`` or
        ``'aft_busstop'``.
    :return: None; the result is written to standard output.
    :raises ValueError: if ``command`` is not one of the four supported
        values (such a value used to make the scan loop forever).
    """
    # command -> (label to look for, True when the token AFTER the run is wanted)
    modes = {
        'bef_road': (2, False),
        'aft_road': (2, True),
        'bef_busstop': (3, False),
        'aft_busstop': (3, True),
    }
    if command not in modes:
        raise ValueError('unsupported command: %r' % (command,))
    target, take_after = modes[command]

    picked = []
    # Stopping at len - 1 guarantees the label line (start + 1) exists.
    for start in range(0, len(list_line) - 1, 3):
        words = list_line[start].strip().split('\t')
        labels = []
        for raw in list_line[start + 1].strip().split('\t'):
            try:
                labels.append(int(raw))
            except ValueError:
                labels.append(None)  # blank or non-numeric label

        total = len(labels)
        k = 0
        while k < total:
            if labels[k] != target:
                k += 1
                continue

            run_start = k
            while k < total and labels[k] == target:
                k += 1
            # k now indexes the first position after the run (or == total).

            if not take_after:
                if run_start > 0:
                    picked.append(connect_token(words[run_start - 1].lower()))
            elif k < total - 1 and labels[k] is not None:
                # NOTE(review): two quirks are kept from the previous
                # implementation - a following token in the very last
                # position is never taken (k < total - 1), and a run that
                # ends on an unparseable label contributes nothing.
                # Confirm both are intended.
                if is_int(words[k]) is False:
                    picked.append(connect_token(words[k].lower()))

    # Every collected word is followed by a single space.
    text = ''.join(word + ' ' for word in picked)

    fdist = FreqDist()
    fdist.update(word_tokenize(str(text)))
    for value in fdist.most_common(len(fdist)):
        # Same bytes as the Python 2 statement ``print a, '\t', b``, which
        # emits no separating space after an item that ends with a tab.
        print('%s \t%s' % (value[0], value[1]))
    print(text)