def load_text_CRF_label(path, name, command):
    """Read alternating text/label lines and collect the tokens labelled '1'.

    The sequence returned by ``load_file(path, name)`` is consumed pairwise:
    entries at even positions are texts and entries at odd positions are
    their label lines. Text and label line are both split on whitespace, and
    every token whose label is ``'1'`` (the marker for bus services) is
    passed through ``filter_eachToken(token, command)``.

    Returns a three-element list ``[texts, labels, services]``. For each
    text, ``services`` holds the filtered '1' tokens joined by single spaces
    and stripped, or the string ``'None'`` when the text has no '1' token.
    """
    lines = load_file(path, name)
    texts = [entry for pos, entry in enumerate(lines) if pos % 2 == 0]
    tags = [entry for pos, entry in enumerate(lines) if pos % 2 == 1]

    services = []
    for row, sentence in enumerate(texts):
        # Indexed rather than zipped so that a missing label line (or a label
        # line shorter than its text) still raises IndexError.
        words, marks = sentence.split(), tags[row].split()
        picked = []
        for col, word in enumerate(words):
            # get the token for bus services
            if marks[col] == '1':
                picked.append(filter_eachToken(word, command))
        if picked:
            services.append(' '.join(picked).strip())
        else:
            services.append('None')

    return [texts, tags, services]
def originial_token(path, name, original_texts, filtering_texts, labels, command):
    """Project the labels of filtered tokens back onto the original tokens.

    For every original text (split on whitespace) a label line is rebuilt,
    one label per original token:

    * tokens for which ``check_token(token, command)`` is ``True`` get '0'
      and do not consume an entry of the tab-separated label line;
    * every other token consumes the next tab-separated label. A '1' label
      is kept only when ``RepresentsInt(filter_eachToken(token, command))``
      is ``True``; otherwise it is downgraded to '0'. Any other label is
      copied through unchanged.

    The rebuilt text and label lines are interleaved (text, labels, text,
    labels, ...) and handed to ``write_file(path, name + '_' + command, ...)``.
    Nothing is returned.
    """
    output_lines = []
    for row, raw_text in enumerate(original_texts):
        filtered_text, label_line = filtering_texts[row], labels[row]
        raw_tokens = raw_text.split()
        # The split of the filtered text is never read; it is still evaluated
        # so that malformed input fails in the same place as before.
        filtered_text.split('\t')
        tab_labels = label_line.split('\t')

        cursor = 0  # position of the next unread entry in tab_labels
        fixed_labels = []
        for token in raw_tokens:
            if check_token(token, command) is True:
                fixed_labels.append('0')
                continue
            tag = tab_labels[cursor]
            cursor += 1
            if tag == '1':
                is_number = RepresentsInt(filter_eachToken(token, command))
                if is_number is not True:
                    tag = '0'
            fixed_labels.append(tag)

        output_lines.append(' '.join(raw_tokens).strip())
        output_lines.append(' '.join(fixed_labels).strip())

    write_file(path, name + '_' + command, output_lines)
def check_token(token, command):
    """Tell whether ``token`` is empty once filtered for the given source.

    For 'twitter' and 'facebook' the token is filtered with
    ``filter_eachTok_rmLinks``; for 'sgforums' with ``filter_eachToken``.
    Returns ``True`` when the filtered text is empty after stripping
    whitespace, ``False`` when something remains, and ``None`` for any other
    ``command`` value.
    """
    if command in ('twitter', 'facebook'):
        cleaned = filter_eachTok_rmLinks(token, command)
    elif command == 'sgforums':
        cleaned = filter_eachToken(token, command)
    else:
        # Unrecognised source: no verdict, matching the implicit fall-through.
        return None
    return len(cleaned.strip()) == 0
def filterText_demo(list_line, command, command_data):
    """Filter every whitespace-separated token of every line.

    Parameters:
        list_line: iterable of text lines.
        command: 'removePunc' filters each token with ``filter_eachToken``;
            'removeLink' filters each token with ``filter_eachTok_rmLinks``.
        command_data: forwarded unchanged as the second argument of the
            chosen filter.

    Returns:
        A list with one string per input line: the filtered tokens that are
        non-empty, joined by tabs and stripped of surrounding whitespace.
        A line that yields no tokens produces ``''``.

    Raises:
        SystemExit: after printing a message, when ``command`` is neither of
            the two supported values. The check happens per token, so input
            without any token never triggers it.
    """
    converted = []
    for line in list_line:
        kept = []
        for token in line.strip().split():
            if command == 'removePunc':  # remove all punctuations
                cleaned = filter_eachToken(token, command_data)
            elif command == 'removeLink':  # remove all punctuations and links in token
                cleaned = filter_eachTok_rmLinks(token, command_data)
            else:
                # Parenthesised form is valid on both Python 2 and Python 3.
                print('You need to give the correct command')
                # quit() is an interactive helper installed by the site
                # module; raising SystemExit directly works without it.
                raise SystemExit
            if cleaned:
                kept.append(cleaned)
        converted.append('\t'.join(kept).strip())
    return converted