Example 1
import re

def normalize_amazon_corpus(input_file, output_file):
    # Placeholder markers of the form 'III...III' (e.g. 'IIIurlIII') must
    # survive normalization untouched.
    rx = re.compile('III[^I]+III')
    reader = TSVReader(input_file)
    writer = TSVWriter(output_file)
    skips = 0
    for i, data in enumerate(reader):
        norm_tokens = []
        for token in data.tokens:
            m = rx.search(token)
            if m:
                # Normalize the text around the placeholder, then re-join
                # the pieces with the placeholder itself so it survives
                # unchanged. Note that m.group(0) is only the first match;
                # if a token carries several distinct placeholders, they
                # are all replaced by the first one.
                match = m.group(0)
                splits = rx.split(token)
                norm_splits = [normalize(split) for split in splits]
                norm_token = match.join(norm_splits)
                norm_tokens.append(norm_token)
            else:
                norm_tokens.append(normalize(token))

        data.tokens = norm_tokens
        # word_index = amazon_char_to_word_index(data.tokens, data.index)
        # if word_index == -1 or data.tokens[word_index] not in ['then', 'than']:
        #     skips += 1
        #     continue
        # data.index = word_index
        writer.out(data)
    # skips stays 0 as long as the filtering block above is commented out.
    print('total skips: %d' % skips)
    writer.close()
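
A quick round trip of the placeholder handling above; the marker text 'IIIurlIII' is an assumed example, and upper-casing stands in for the project's normalize():

import re

rx = re.compile('III[^I]+III')
token = 'seeIIIurlIIIhere'

m = rx.search(token)     # first placeholder: 'IIIurlIII'
parts = rx.split(token)  # surrounding text: ['see', 'here']
rebuilt = m.group(0).join(p.upper() for p in parts)
print(rebuilt)           # SEEIIIurlIIIHERE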
Example 2
def combine_parts_of_amazon_corpus(full_corpus, fixed_part_corpus, out_corpus,
                                   confusion_set):
    full_reader = TSVReader(full_corpus)
    fixed_part_reader = TSVReader(fixed_part_corpus)
    writer = TSVWriter(out_corpus)
    counter = 0
    # The per-token counters assume the confusion set is ('then', 'than').
    then_correct_counter = 0
    then_wrong_counter = 0
    than_correct_counter = 0
    than_wrong_counter = 0

    for data in full_reader:
        if data.is_correct:
            # Keep correct entries whose token and label are in the set.
            if (data.tokens[data.index] in confusion_set
                    and data.label in confusion_set):
                writer.out(data)
        else:
            # Replace each incorrect entry with the next re-annotated one
            # from the fixed part.
            try:
                data = next(fixed_part_reader)
                counter += 1
            except StopIteration:
                continue
            if not (data.tokens[data.index] in confusion_set
                    and data.label in confusion_set):
                continue
            # Flip the token to the other member of the confusion set.
            if data.label == confusion_set[0]:
                data.tokens[data.index] = confusion_set[1]
            elif data.label == confusion_set[1]:
                data.tokens[data.index] = confusion_set[0]
            else:
                continue
            data.is_correct = False
            writer.out(data)

        if data.tokens[data.index] == 'then':
            if data.is_correct:
                then_correct_counter += 1
            else:
                then_wrong_counter += 1
        else:
            if data.is_correct:
                than_correct_counter += 1
            else:
                than_wrong_counter += 1

    writer.close()
    print('entries taken from fixed part: %d' % counter)
    print('Then Correct: %d' % then_correct_counter)
    print('Then Wrong: %d' % then_wrong_counter)
    print('Than Correct: %d' % than_correct_counter)
    print('Than Wrong: %d' % than_wrong_counter)
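
A hypothetical invocation; the file names are placeholders, and the hard-coded counters above suggest the ('then', 'than') confusion set:

combine_parts_of_amazon_corpus(
    'amazon_full.tsv',      # original corpus with correct and incorrect entries
    'amazon_fixed.tsv',     # re-annotated replacements for the incorrect ones
    'amazon_combined.tsv',  # merged output
    ('then', 'than'))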
Example 3
from typing import Tuple

def split_amazon_error_corpus(input_path: str, output_intend_path: str,
                              output_error_path: str,
                              confusion_set: Tuple[str, str]):
    reader = TSVReader(input_path)
    intend_writer = TSVWriter(output_intend_path)
    error_writer = TSVWriter(output_error_path)

    for data in reader:
        # Correct uses of the first token go to the "intend" file,
        # erroneous uses of the second token to the "error" file.
        if data.label == confusion_set[0] and data.is_correct:
            intend_writer.out(data)
        elif data.label == confusion_set[1] and not data.is_correct:
            error_writer.out(data)

    intend_writer.close()
    error_writer.close()
Example 4
from typing import Tuple

def filter_corpus_to_tsv(input_path: str, output_path: str,
                         confusion_set: Tuple[str, str]):
    writer = TSVWriter(output_path)

    with open(input_path, 'r') as open_input:
        for line in open_input:
            tokens, pos = split_tokens_and_pos(line.split())
            # Emit one TSV row per occurrence of a confusion-set token.
            for i, token in enumerate(tokens):
                if token in confusion_set:
                    d = TSVData(True, token, i, tokens, pos)
                    writer.out(d)

    writer.close()
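
The TSVData constructor call above implies the record layout these helpers share. A minimal sketch of the assumed interface; the field names are guesses, only the constructor order is visible in the source:

from dataclasses import dataclass
from typing import List

@dataclass
class TSVData:
    is_correct: bool   # whether the token is used correctly
    label: str         # the token of interest (the intended word)
    index: int         # word index of that token within `tokens`
    tokens: List[str]  # the sentence as a token list
    pos: List[str]     # POS tags parallel to `tokens`

# TSVReader(path) is assumed to yield TSVData records when iterated;
# TSVWriter(path).out(record) writes one record and .close() flushes the file.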
Example 5
from typing import Tuple

def directed_error_corpus(input_path: str, output_intend_path: str,
                          output_error_path: str,
                          confusion_set: Tuple[str, str]):
    reader = TSVReader(input_path)
    intend_writer = TSVWriter(output_intend_path)
    error_writer = TSVWriter(output_error_path)

    for data in reader:
        if data.label == confusion_set[0]:
            intend_writer.out(data)
        elif data.label == confusion_set[1]:
            # Inject a directed error: write the first token where the
            # second one was intended, and mark the entry incorrect.
            data.is_correct = False
            data.tokens[data.index] = confusion_set[0]
            error_writer.out(data)

    intend_writer.close()
    error_writer.close()
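
Unlike split_amazon_error_corpus, which only routes existing entries, this function manufactures the errors itself. A hypothetical call; the paths are placeholders:

directed_error_corpus('amazon.tsv', 'than_intend.tsv', 'than_errors.tsv',
                      ('than', 'then'))
# than_intend.tsv: genuine 'than' usages, kept as-is
# than_errors.tsv: 'then' usages rewritten to 'than' and flagged incorrect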
Example 6
import os

def dynamically_split_tsv(paths, confusion_set, test=0, cv=None):
    # Five splits of 20% each, stratified by confusion-set label. `test`
    # and `cv` are split indices; cv=None means no CV split is written.
    # (A None sentinel, rather than the old cv=False, keeps split 0 usable
    # for cross-validation.)
    split_percentage = 20
    num_of_splits = 5

    token_a = confusion_set[0]
    token_b = confusion_set[1]

    frequencies = token_freq_tsv(paths['filtered_tsv'], confusion_set)
    token_a_freq = frequencies[0]
    token_b_freq = frequencies[1]

    # Per-split quota for each label (integer division drops the remainder).
    split_token_a_freq = (token_a_freq * split_percentage) // 100
    split_token_b_freq = (token_b_freq * split_percentage) // 100

    tmp_dir = paths['tmp']

    splits = []
    for i in range(num_of_splits):
        file = os.path.join(tmp_dir, 'split%d.tsv' % i)
        split = {'token_a_count': 0,
                 'token_b_count': 0,
                 'file': file,
                 'io': TSVWriter(file)}
        splits.append(split)

    reader = TSVReader(paths['filtered_tsv'])

    for d in reader:
        # Put each record into the first split whose quota for its label
        # is not yet full; records beyond all quotas are dropped.
        for split in splits:
            if d.label == token_a:
                if split['token_a_count'] < split_token_a_freq:
                    split['token_a_count'] += 1
                    split['io'].out(d)
                    break
            elif d.label == token_b:
                if split['token_b_count'] < split_token_b_freq:
                    split['token_b_count'] += 1
                    split['io'].out(d)
                    break

    # Reopen every split file for reading.
    for split in splits:
        split['io'].close()
        split['io'] = TSVReader(split['file'])

    if cv is not None:
        cv_writer = TSVWriter(paths['cv_intend_tsv'])
        for d in splits[cv]['io']:
            cv_writer.out(d)
        cv_writer.close()

    test_writer = TSVWriter(paths['test_intend_tsv'])
    for d in splits[test]['io']:
        test_writer.out(d)
    test_writer.close()

    train_writer = TSVWriter(paths['train_intend_tsv'])
    for i, split in enumerate(splits):
        # Everything that is neither the test split nor the CV split
        # becomes training data.
        if i == cv or i == test:
            continue
        for d in split['io']:
            train_writer.out(d)
    train_writer.close()

    # Delete the temporary split files.
    for split in splits:
        os.remove(split['file'])
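
A hypothetical call; the dict keys are exactly the ones the function reads, while the concrete paths are placeholders. With cv=1, split 1 becomes the CV set, split 0 the test set, and splits 2-4 the training set:

paths = {
    'filtered_tsv': 'data/filtered.tsv',        # full filtered corpus
    'tmp': 'data/tmp',                          # scratch dir for split files
    'test_intend_tsv': 'data/test_intend.tsv',
    'train_intend_tsv': 'data/train_intend.tsv',
    'cv_intend_tsv': 'data/cv_intend.tsv',      # only written when cv is set
}
dynamically_split_tsv(paths, ('then', 'than'), test=0, cv=1)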