Example #1
def remove_stop_postag(dataset, output_dir):
    utils.mkdir(output_dir)
    stack = os.listdir(dataset)
    # print('loading data in ' + dataset)
    total_doc = 0
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):  # if this entry is a directory, push its children onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as fr:
                data = unicodedata.normalize('NFKC', fr.read().strip())
                original_content = tokenizer.predict(data)
                content = map(lambda x: ViPosTagger.postagging(x),
                              spliter.split(original_content))
                clean_content = []
                for info in content:
                    sen = []
                    for i in range(len(info[0])):  # info[0] = tokens, info[1] = POS tags
                        if is_exist(info[1][i]):
                            sen.append(info[0][i])
                    clean_content.append(u' '.join(sen))
                with open(os.path.join(output_dir, os.path.basename(file_name)),
                          'w', encoding='utf-8') as fw:
                    if len(clean_content) > 0:
                        fw.write(u'\n'.join(clean_content))
                    else:
                        fw.write(original_content)
                total_doc += 1
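All five examples walk the dataset with an explicit stack instead of recursion and rely on a helper named utils.push_data_to_stack (utilities.push_data_to_stack in Example #5) that is not shown on this page. The sketch below is only a guess at how such a helper might behave, assuming entries are kept relative to the dataset root so that os.path.join(dataset, file_name) keeps resolving correctly; it is not the project's actual implementation.

import os

def push_data_to_stack(stack, dir_path, dir_name):
    # Hypothetical helper: push every child of the directory onto the stack,
    # stored relative to the dataset root (dir_name is that relative path).
    for child in os.listdir(dir_path):
        stack.append(os.path.join(dir_name, child))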
Example #2
def load_dataset_from_list(list_samples, remove_tags=False):
    result = []
    for sample in list_samples:
        if remove_tags:
            # keep only the text before the '[tags] : ' marker
            sample = sample.split(u'[tags] : ')[0]
        sample = r.run(tokenizer.predict(sample))
        result.append(sample)
    return result
Example #3
def preprocessing(data, tokenize=True):
    data = unicodedata.normalize('NFKC', data)
    if tokenize:
        data = tokenizer.predict(data)
    # strip URLs, e-mail addresses, dates and numbers
    data = my_regex.detect_url.sub(u'', data)
    data = my_regex.detect_url2.sub(u'', data)
    data = my_regex.detect_email.sub(u'', data)
    data = my_regex.detect_datetime.sub(u'', data)
    data = my_regex.detect_num.sub(u'', data)
    # pad punctuation with spaces (\g<special_mark> is a named-group backreference),
    # then drop unwanted characters and leftover marks
    data = my_regex.normalize_special_mark.sub(u' \g<special_mark> ', data)
    data = my_regex.detect_exception_chars.sub(u'', data)
    data = my_regex.detect_special_mark.sub(u'', data)
    data = my_regex.detect_special_mark2.sub(u'', data)
    data = my_regex.detect_special_mark3.sub(u'', data)
    # collapse runs of whitespace into a single space
    data = my_regex.normalize_space.sub(u' ', data)
    return data.strip()
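my_regex above is a project module whose patterns are not shown on this page. Purely for illustration, the stand-ins below guess at a few of them based only on the attribute names; the real patterns are certainly more elaborate. Note that normalize_special_mark must define a named group called special_mark for the \g<special_mark> backreference to work.

import re

# Hypothetical stand-ins for some of the my_regex patterns used above.
detect_url = re.compile(r'https?://\S+')
detect_email = re.compile(r'\S+@\S+\.\S+')
detect_num = re.compile(r'\b\d+(?:[.,]\d+)*\b')
normalize_special_mark = re.compile(r'(?P<special_mark>[.,!?;:])')
normalize_space = re.compile(r'\s+')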
Example #4
def load_dataset(dataset):
    list_samples = {k: [] for k in my_map.name2label.keys()}
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print(file_path)
            with open(file_path, 'r', encoding='utf-16') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
                content = r.run(tokenizer.predict(content))
                dir_name = utils.get_dir_name(file_path)
                list_samples[dir_name].append(content)
    return list_samples
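utils.get_dir_name is another helper that is not shown here. Judging from how its result is used as a key into my_map.name2label, it presumably returns the name of the directory that directly contains the file; the sketch below just makes that assumption explicit and is not the project's real code.

import os

def get_dir_name(file_path):
    # Hypothetical helper: use the immediate parent directory as the label name.
    return os.path.basename(os.path.dirname(file_path))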
Example #5
def load_dataset_from_disk(dataset, remove_tags=False):
    list_samples = []
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):
            utilities.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % file_path, end='')  # overwrite the same console line to show progress
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-8') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
                if remove_tags:
                    # keep only the text before the '[tags] : ' marker
                    content = content.split(u'[tags] : ')[0]
                content = r.run(tokenizer.predict(content))
                list_samples.append(content)
    print('')
    return list_samples
def load_dataset_from_list(list_samples):
    result = []
    for sample in list_samples:
        sample = r.run(tokenizer.predict(sample))
        result.append(sample)
    return result