def reg_bussvc(list_line, n_token, command):
    """Build one '0'/'1' feature string per sentence for bus-service matches.

    Checks whether each token matches the regular expression for a bus
    service or not.  ``list_line`` is consumed in groups of three entries;
    only the first entry of each group (tab-separated tokens) is read.  The
    token line is lower-cased, and every token contributes one character to
    the group's feature string:

    * '1' if ``pattern_token_bussvc(token, command)`` returns ``True``;
    * otherwise '1' if ``pattern_tokenText_bussvc(token, text, command)`` is
      truthy, where ``text`` is the space-joined window of tokens selected by
      ``range_text_index(k, len(tokens), n_token)``;
    * otherwise '0'.

    The zero-based group index is printed after each group as progress
    output.

    Args:
        list_line: sequence of str, laid out in groups of three entries.
        n_token: window-size argument forwarded to ``range_text_index``.
        command: forwarded unchanged to the two pattern helpers.

    Returns:
        list of str: one feature string per group, with one character per
        token of that group's token line.
    """
    list_ftr = []
    for i in range(0, len(list_line), 3):
        # The second entry of the group (labels) is not needed here, so it
        # is no longer read; a trailing token line without a label line
        # therefore no longer raises IndexError.
        tokens = list_line[i].lower().strip().split('\t')
        n_tokens = len(tokens)

        flags = []
        for k, raw_token in enumerate(tokens):
            token = raw_token.strip()
            # Identity check kept on purpose: only a literal True from the
            # helper counts as a direct match.
            if pattern_token_bussvc(token, command) is True:
                flags.append('1')
                continue

            # Elements 1 and 2 of the helper's result are used as inclusive
            # start/end token indexes of the context window.
            range_k = range_text_index(k, n_tokens, n_token)
            token_text = ' '.join(
                tokens[m] for m in range(range_k[1], range_k[2] + 1)
            ).strip()
            if pattern_tokenText_bussvc(token, token_text, command):
                flags.append('1')
            else:
                flags.append('0')

        # Progress output; floor division keeps the integer group index and
        # the single-argument call form works on Python 2 and Python 3.
        print(i // 3)
        list_ftr.append(''.join(flags))
    return list_ftr
def extract_ftr_gt_road_busstop(list_line, command, n_token):
    """Print ground-truth feature rows for road or bus-stop candidates.

    ``list_line`` is consumed in groups of three entries: the first entry of
    a group holds tab-separated tokens and the second holds the
    tab-separated labels of those tokens (the third entry is not read).

    A token is a candidate when its label equals the label id of
    ``command`` (2 for 'road', 3 for 'busstop') or when its stripped,
    lower-cased text is contained in ``load_dict(command)``.  Every
    candidate produces one tab-separated row made of: a running id, 'TRUE'
    or 'FALSE' (whether the label equals the label id), the lower-cased
    token followed by one space, and the space-joined context window chosen
    by ``range_text_index(k, len(labels), n_token)``.

    All rows are printed, followed by a line with the number of rows.
    Nothing is returned.

    Args:
        list_line: sequence of str, laid out in groups of three entries.
        command: 'road' or 'busstop'.  Any other value prints
            'Give the correct command' and calls ``quit()``; this is now
            checked once up front instead of once per token.
        n_token: window-size argument forwarded to ``range_text_index``.

    Raises:
        IndexError: if a group has no label entry, or fewer tokens than
            labels.
        ValueError: if the label of a dictionary-matched token is not an
            integer.
    """
    list_dict = load_dict(command)

    # Label id used in the ground-truth line for the requested entity type.
    if command == 'road':
        label = 2
    elif command == 'busstop':
        label = 3
    else:
        # quit if we don't have the correct command
        print('Give the correct command')
        quit()
    label_text = str(label)

    list_ftr = []
    cnt = 0
    for i in range(0, len(list_line), 3):
        split_first = list_line[i].strip().split('\t')
        split_second = list_line[i + 1].strip().split('\t')
        n_labels = len(split_second)

        for k in range(n_labels):
            # Bug fix: the original tested ``int(split_second[k] == label)``,
            # which compares a str with an int and is therefore always 0, so
            # labelled tokens missing from the dictionary were never
            # emitted.  The label cell is compared as text here so that
            # blank or non-numeric cells on non-candidate tokens still do
            # not raise, exactly as before.
            has_label = split_second[k].strip() == label_text
            in_dict = split_first[k].strip().lower() in list_dict
            if not (has_label or in_dict):
                continue

            word = split_first[k].lower() + ' '
            # Elements 1 and 2 of the helper's result are used as inclusive
            # start/end token indexes of the context window.
            range_k = range_text_index(k, n_labels, n_token)
            ftr_text = ' '.join(
                split_first[m] for m in range(range_k[1], range_k[2] + 1)
            ).strip()

            tag = 'TRUE' if int(split_second[k]) == label else 'FALSE'
            list_ftr.append('\t'.join([str(cnt), tag, word, ftr_text]))
            # NOTE(review): the source lost its indentation; the counter is
            # assumed to advance once per emitted row (running row id).
            # Confirm against the original layout.
            cnt += 1

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for value in list_ftr:
        print(value)
    print('Length of list features is: %i' % len(list_ftr))
def extract_ftr_gt_svc(list_line, load_svc, n_token):
    """Print ground-truth feature rows for bus-service candidates.

    ``list_line`` is consumed in groups of three entries: the first entry of
    a group holds tab-separated tokens and the second holds the
    tab-separated integer labels of those tokens (the third entry is not
    read).

    A token is a candidate when its text, exactly as written, is contained
    in ``load_svc``.  A candidate labelled 1 produces a 'TRUE' row and one
    labelled 0 a 'FALSE' row; any other label produces no row.  A row is
    tab-separated: the running candidate id, 'TRUE' or 'FALSE', the
    lower-cased token followed by one space, and the space-joined context
    window chosen by ``range_text_index(k, len(labels), n_token)``.

    All rows are printed, followed by the number of rows and the number of
    candidates labelled 1.  Nothing is returned.

    Args:
        list_line: sequence of str, laid out in groups of three entries.
        load_svc: container of bus-service names, queried with ``in``.
        n_token: window-size argument forwarded to ``range_text_index``.
    """
    rows = []
    candidate_id = 0
    true_total = 0

    for start in range(0, len(list_line), 3):
        tokens = list_line[start].strip().split('\t')
        labels = list_line[start + 1].strip().split('\t')
        n_labels = len(labels)

        for k in range(n_labels):
            if tokens[k] not in load_svc:
                continue  # not a bus-service dictionary entry

            svc = tokens[k].lower() + ' '
            # Elements 1 and 2 of the helper's result are used as inclusive
            # start/end token indexes of the context window.
            bounds = range_text_index(k, n_labels, n_token)
            context = ' '.join(
                tokens[m] for m in range(bounds[1], bounds[2] + 1)
            ).strip()

            label_value = int(labels[k])
            if label_value == 1:
                rows.append('\t'.join((str(candidate_id), 'TRUE', svc, context)))
                true_total += 1
            elif label_value == 0:
                rows.append('\t'.join((str(candidate_id), 'FALSE', svc, context)))
            # NOTE(review): the source lost its indentation; both counters
            # are assumed to live inside the dictionary-match block, so the
            # id advances once per candidate even when no row is emitted.
            # Confirm against the original layout.
            candidate_id += 1

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for row in rows:
        print(row)
    print('Length of list features is: %i' % len(rows))
    print('Length of bus service labeling TRUE is: %i' % true_total)