コード例 #1
0
def load_text_CRF_label(path, name, command):
    """Load alternating text/label lines and collect the tokens labelled '1'.

    The sequence returned by ``load_file(path, name)`` is read pairwise:
    entries at even positions (0, 2, ...) are texts and entries at odd
    positions are the matching whitespace-separated label sequences.

    For every text, each token whose label equals ``'1'`` is passed through
    ``filter_eachToken(token, command)`` and the results are joined with
    single spaces (the original comment describes these as the bus-service
    tokens).

    Returns:
        A three-element list ``[texts, labels, services]``.  ``services``
        holds one string per text; it is the literal string ``'None'`` when
        no token of that text is labelled ``'1'``.

    Raises:
        IndexError: if a text has no label line, or a label line has fewer
            entries than its text has tokens.
    """
    lines = load_file(path, name)
    total = len(lines)
    list_text = [lines[pos] for pos in range(0, total, 2)]
    list_label = [lines[pos] for pos in range(1, total, 2)]

    list_svc = []
    for row, sentence in enumerate(list_text):
        tokens = sentence.split()
        # Indexed lookups (rather than zip) so that mismatched text/label
        # data fails loudly instead of being silently truncated.
        tags = list_label[row].split()
        picked = [
            filter_eachToken(word, command)
            for col, word in enumerate(tokens)
            if tags[col] == '1'
        ]
        if picked:
            list_svc.append(' '.join(picked).strip())
        else:
            list_svc.append('None')

    return [list_text, list_label, list_svc]
コード例 #2
0
def originial_token(path, name, original_texts, filtering_texts, labels, command):
    """Re-align labels with the original tokens and write the result to a file.

    For each index, the original text is split on whitespace and the label
    string is split on tabs.  The tokens are then walked in order:

    * if ``check_token(token, command)`` is ``True`` the token receives the
      label ``'0'`` and no entry of the label sequence is consumed;
    * otherwise the next label is consumed.  A ``'1'`` label is kept only
      when ``RepresentsInt(filter_eachToken(token, command))`` is ``True``;
      if not, it is replaced by ``'0'``.  Any other label is copied as is.

    The output is a flat list alternating between a space-joined token line
    and its space-joined label line, handed to
    ``write_file(path, name + '_' + command, ...)``.  Nothing is returned.

    Raises:
        IndexError: if ``filtering_texts`` or ``labels`` is shorter than
            ``original_texts``, or a label sequence runs out before the
            tokens that need one.
    """
    texts_correct, labels_correct = [], []
    for index, text_org in enumerate(original_texts):
        text_fil, label = filtering_texts[index], labels[index]
        words = text_org.split()
        # The filtered text is not used below; it is still split here so
        # that a malformed entry fails at the same point as before.
        _unused_filtered = text_fil.split('\t')
        tags = label.split('\t')

        next_tag = 0  # position of the next unconsumed label
        out_tags = []
        for word in words:
            if check_token(word, command) is True:
                out_tags.append('0')
                continue

            tag = tags[next_tag]
            next_tag += 1
            if tag == '1':
                is_number = RepresentsInt(filter_eachToken(word, command))
                if is_number is not True:
                    tag = '0'
            out_tags.append(tag)

        texts_correct.append(' '.join(words).strip())
        labels_correct.append(' '.join(out_tags).strip())

    list_write = []
    for token_line, tag_line in zip(texts_correct, labels_correct):
        list_write.extend((token_line, tag_line))

    write_file(path, name + '_' + command, list_write)
コード例 #3
0
def check_token(token, command):
    """Tell whether *token* is empty after the filter selected by *command*.

    ``'twitter'`` and ``'facebook'`` use ``filter_eachTok_rmLinks``;
    ``'sgforums'`` uses ``filter_eachToken``.  Both helpers are called as
    ``helper(token, command)``.

    Returns:
        ``True`` if the filtered token is empty once stripped of surrounding
        whitespace, ``False`` if something remains, and ``None`` when
        *command* is not one of the three recognised values.
    """
    if command in ('twitter', 'facebook'):
        cleaned = filter_eachTok_rmLinks(token, command)
    elif command == 'sgforums':
        cleaned = filter_eachToken(token, command)
    else:
        # Unrecognised command: no filtering is attempted.
        return None
    return len(cleaned.strip()) == 0
コード例 #4
0
def filterText_demo(list_line, command, command_data):
    """Filter every whitespace-separated token of every line.

    Args:
        list_line: iterable of text lines.
        command: ``'removePunc'`` to filter tokens with
            ``filter_eachToken`` or ``'removeLink'`` to filter them with
            ``filter_eachTok_rmLinks``.
        command_data: passed unchanged as the second argument of the
            selected filter.

    Returns:
        A list with one string per input line: the non-empty filtered
        tokens joined by tabs.

    Raises:
        SystemExit: after printing a message, when a token is encountered
            and *command* is not one of the two recognised values.  The
            command is only checked when there is a token to process, so
            input without tokens never triggers it.
    """
    list_convert = []
    for line in list_line:
        kept = []
        for token_ in line.split():
            if command == 'removePunc':  # remove all punctuation
                token_filter = filter_eachToken(token_, command_data)
            elif command == 'removeLink':  # remove punctuation and links
                token_filter = filter_eachTok_rmLinks(token_, command_data)
            else:
                # Parenthesised single-argument form prints the same text on
                # Python 2 and is valid syntax on Python 3.
                print('You need to give the correct command')
                # Raise directly instead of calling the site-provided
                # interactive helper quit(), which is absent under -S.
                raise SystemExit()

            if token_filter:
                kept.append(token_filter)

        list_convert.append('\t'.join(kept).strip())
    return list_convert