Example #1
def tokenizing(csv_import_path, csv_export_path):
    # CSV and MeCabTokenizer are assumed to be project-local helpers
    # (a CSV reader/writer wrapper and a MeCab binding wrapper).
    csv_obj = CSV(csv_import_path)
    csv_data = csv_obj.get_data()

    # Flatten every cell of the CSV into a single list of sentences.
    sentence_arr = []
    for row in csv_data:
        for cell in row:
            sentence_arr.append(cell)

    tokenizer = MeCabTokenizer(tagger='-Ochasen')
    output_arr = []
    stop_words = ['。', '、', '・']  # Japanese punctuation to drop
    for sentence in sentence_arr:
        # Walk MeCab's linked list of morpheme nodes; the BOS/EOS nodes
        # carry an empty surface, so the truthiness check skips them.
        tokens = tokenizer.parse_to_node(sentence)
        surface = []
        while tokens:
            if tokens.surface and tokens.surface not in stop_words:
                surface.append(tokens.surface)
            tokens = tokens.next
        if surface:
            # Keep the original sentence alongside its space-joined tokens.
            output_arr.append([sentence, " ".join(surface)])

    csv_obj.export(csv_export_path, output_arr)
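
Since CSV and MeCabTokenizer are project-local helpers not shown here, the snippet below is a minimal sketch of the same node-walking pattern using the standard mecab-python3 binding (MeCab.Tagger / parseToNode), which the wrapper presumably mirrors. The function name tokenize_sentence and the sample sentence are illustrative only, and the sketch assumes mecab-python3 and a MeCab dictionary are installed.

import MeCab

tagger = MeCab.Tagger('-Ochasen')
stop_words = ['。', '、', '・']

def tokenize_sentence(sentence):
    # parseToNode returns the BOS node of a linked list of morphemes;
    # BOS/EOS have an empty surface, so the truthiness check skips them.
    node = tagger.parseToNode(sentence)
    surface = []
    while node:
        if node.surface and node.surface not in stop_words:
            surface.append(node.surface)
        node = node.next
    return surface

print(tokenize_sentence('今日はいい天気です。'))  # e.g. ['今日', 'は', 'いい', '天気', 'です']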