Example #1
	def spacing(self, inputText, writeFile):
		# pykospacing's spacing() cannot handle very long strings, so process the text in chunks
		if len(inputText) < 198:
			writeFile.write(spacing(inputText) + '\n')
		else:
			textTemp = inputText[:197]
			reText = inputText[197:]
			writeFile.write(spacing(textTemp) + '\n')
			self.spacing(reText, writeFile)
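The 197/198 constants here (and again in Examples #12 and #17 below) suggest that the function-style pykospacing spacing() used on this page cannot handle inputs much longer than about 198 characters. A minimal non-recursive sketch of the same chunking idea, assuming that limit and the module-level spacing import used throughout these examples:

from pykospacing import spacing  # older function-style API, as used in the examples on this page

def spacing_in_chunks(text, limit=197):
    # Space slices of at most `limit` characters and re-join them.
    # Like the recursive version above, a word that straddles a chunk
    # boundary may still end up mis-spaced.
    if not text:
        return ""
    return "".join(spacing(text[i:i + limit]) for i in range(0, len(text), limit))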
Example #2
def data_pre(message):
    global api
    global tokenizer
    message = message.replace(' ', '')
    message = spacing(message)
    api = KhaiiiApi()
    test_tokn = api.analyze(message)
    test_sentence = ""
    for eojeol in test_tokn:
        for morph in eojeol.morphs:
            test_sentence += str(morph) + " "
    test_sentence = "[CLS] " + test_sentence + " [SEP]"
    test_sentence = tokenizer.tokenize(test_sentence)
    while ('_' in test_sentence):
        test_sentence.remove('_')
    test_sentence_ids = tokenizer.convert_tokens_to_ids(test_sentence)
    test_sentence_ids = pad_sequences([test_sentence_ids],
                                      maxlen=MAX_LEN,
                                      dtype="long",
                                      truncating="post",
                                      padding="post")
    test_sentence_mask = [[float(i > 0) for i in test_sentence_ids[0]]]
    test_inputs = torch.tensor(test_sentence_ids)
    test_masks = torch.tensor(test_sentence_mask)
    return (test_inputs, test_masks)
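A rough usage sketch for data_pre, assuming the globals it relies on (tokenizer, MAX_LEN, the khaiii and keras imports) are initialized elsewhere in the original script, and that a fine-tuned BERT classifier named model (hypothetical here) has been loaded:

import torch

# hypothetical: model = BertForSequenceClassification.from_pretrained(...) loaded elsewhere
test_inputs, test_masks = data_pre("아빠가방에들어가신다")
model.eval()
with torch.no_grad():
    outputs = model(test_inputs, attention_mask=test_masks)
predicted_label = torch.argmax(outputs[0], dim=1).item()
print(predicted_label)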
Example #3
def split_with_enter_and_spacing(text):
    new_text_list = text.split('\n')
    new_text_list = [sent.strip() for sent in new_text_list]
    new_text_list = [spacing(sent) for sent in new_text_list]
    while '' in new_text_list:
        new_text_list.remove('')

    return new_text_list
Example #4
    def correct_spacing(self):
        try:
            from pykospacing import spacing
        except ImportError:
            raise ImportError("[-] Please install the PyKoSpacing package first!")

        len_data = len(self.data)
        for idx in tqdm(range(len_data)):
            self.data[idx]['comment'] = spacing(self.data[idx]['comment'])
Example #5
    def do_GET(self):
        print(u"[START]: Received GET for %s" % (self.path))
        if self.path.startswith("/spacing?"):
            query_string = self.path.partition('?')[2]
            body = unquote(query_string.split('=')[1])

            self.send_response(200)
            self.send_header("Content-type", "text/plain;charset=utf-8")
            self.end_headers()
            self.wfile.write(bytes(spacing(body), "utf-8"))
Example #6
def parse_entire_body(content):
    result = str(content.get_text())

    # Clean up whitespace, line breaks, and zero-width characters
    result = re.sub(r'(\s|\u180B|\u200B|\u200C|\u200D|\u2060|\uFEFF)+', '',
                    result)

    # Restore word spacing
    result = spacing(result)

    return result.strip()
Example #7
def spell_check_text(texts):
    corpus = []
    for sent in texts:
        spaced_text = spacing(sent)
        spelled_sent = spell_checker.check(spaced_text)  # spell-check the re-spaced sentence
        checked_sent = spelled_sent.checked
        normalized_sent = repeat_normalize(checked_sent)
        for lownword in lownword_map:
            normalized_sent = normalized_sent.replace(lownword, lownword_map[lownword])
        corpus.append(normalized_sent)
    return corpus
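A small usage sketch for spell_check_text; spell_checker is presumably py-hanspell and lownword_map a loanword/slang normalization dict, both defined elsewhere in the original script (the mapping below is illustrative only):

from pykospacing import spacing
from hanspell import spell_checker           # py-hanspell
from soynlp.normalizer import repeat_normalize

lownword_map = {"넘넘": "너무너무"}           # hypothetical mapping, for illustration

corpus = spell_check_text(["이런게영화라고ㅋㅋㅋㅋ"])
print(corpus)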
Example #8
    def tokenizer(self, corpus, save_name, space=False):
        tokenizer = Mecab(dicpath='C:/mecab/mecab-ko-dic')
        total_lines = sum(1 for line in open(corpus, 'r', encoding='utf-8'))
        # used only to compute the `total` value passed to tqdm

        with open(corpus, 'r', encoding='utf-8') as f1, open(save_name, 'w', encoding='utf-8') as f2:
            for _, line in tqdm(enumerate(f1), total=total_lines):
                sentence = line.replace('\n', '').strip()
                if space:
                    sentence = spacing(sentence)
                tokenized_sent = ' '.join(tokenizer.morphs(sentence))
                f2.writelines(tokenized_sent + '\n')
Example #9
def get_texts(data_path):
    with open(data_path, 'r', encoding='UTF-8') as file:
        data = file.readlines()
    data_list = []
    for sentence in data:
        list_sentence = sentence.replace('\n', '').split('.')
        for lines in list_sentence:
            line = spacing(lines)
            data_list.append(line)
    texts = list(set(data_list))
    if '' in texts:
        texts.remove('')
    return texts
Example #10
    def do_POST(self):
        content_length = int(
            self.headers['Content-Length'])  # <--- Gets the size of data
        post_data = self.rfile.read(
            content_length)  # <--- Gets the data itself
        decoded_post_data = post_data.decode('utf-8')

        self.send_response(200)
        self.send_header("Content-type", "text/plain;charset=utf-8")
        # self.send_header('Access-Control-Allow-Credentials', 'true')
        # self.send_header('Access-Control-Allow-Origin', 'http://localhost:8888')
        self.end_headers()
        self.wfile.write(bytes(spacing(decoded_post_data), "utf-8"))
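The do_GET and do_POST handlers above expose spacing as a small HTTP service. Assuming the server is bound to localhost:8080 (no port is shown in these snippets) and that the GET query parameter is named text, a client call might look like:

import requests
from urllib.parse import quote

BASE = "http://localhost:8080"  # hypothetical host/port

# GET variant (Example #5): /spacing?text=<url-encoded sentence>
r = requests.get(BASE + "/spacing?text=" + quote("아빠가방에들어가신다"))
print(r.text)

# POST variant (Example #10): raw UTF-8 body
r = requests.post(BASE, data="아빠가방에들어가신다".encode("utf-8"))
print(r.text)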
Example #11
def main(args=sys.argv[1:]):
    args = get_parser().parse_args(args)

    source = args.infile.read()
    result = '\n'.join([spacing(_) for _ in source.splitlines()])

    if args.overwrite:
        args.infile.close()
        with open(args.infile.name, 'w', encoding=args.infile.encoding) as f:
            f.write(result)
    else:
        args.outfile.write(result)

    return 0 if (source == result) else 1
Example #12
def Spacing_text(text_list):
    spacing_list = []
    for i in text_list:
        if len(i) < 197:
            spacing_list.append(spacing(i))
        else:
            iteration = int(len(i) / 197)
            mod = len(i) % 197
            start = 0
            end = 197
            check = 0
            while True:
                # chunks processed so far < quotient
                if check < iteration:
                    spacing_list.append(spacing(i[start:end]))
                    start += 197
                    end += 197
                    check += 1
                else:
                    # final chunk: slice the remaining characters (the remainder)
                    spacing_list.append(
                        spacing(i[iteration * 197:(iteration * 197) + mod]))
                    break
    return spacing_list
Example #13
def process_reviews(reviews):
    processed_reviews = []
    for review in reviews:
        review = repeat_normalize(review,
                                  num_repeats=2)  # normalize repeats by two
        review = spacing(review)  # space by words
        review = ('.').join(split_sentences(review))  # split by sentence
        try:
            review = spell_checker.check(review).as_dict()['checked']
        except:
            print('pass')
            pass
        print(review)
        processed_reviews.append(review)
        time.sleep(0.5)
    return processed_reviews
Example #14
def text_preprocessor(text):
    corpus = []

    p_text = paragraph_tokenize(text)
    pp_text = [clean_punc(text, punct, punct_mapping) for text in p_text]
    ppc_text = [sents for sents in clean_text(pp_text) if sents != '']

    for sent in ppc_text:
        spaced_text = spacing(sent)
        spelled_sent = spell_checker.check(spaced_text)  # spell-check the re-spaced sentence
        checked_sent = spelled_sent.checked
        normalized_sent = repeat_normalize(checked_sent)
        for lownword in lownword_map:
            normalized_sent = normalized_sent.replace(lownword,
                                                      lownword_map[lownword])
        corpus.append(normalized_sent)
    return corpus
Example #15
def get_add_columns(record):  # add the 사업장명_R, 소재지전체주소_R, 주소1, 주소2, 주소3 columns
    co_name = record[1]
    co_addr = record[2]
    co_road_addr = record[3]

    co_name_r = re.sub(x, '%%', co_name)
    co_name_r = re.sub(w, '', co_name_r)
    name_list = co_name_r.split()

    for j in range(len(name_list)):
        rt = re.sub(p, " ", name_list[j])
        name_list[j] = rt

    co_name_r = ' '.join(name_list).strip()

    addr1 = ''
    addr2 = ''
    addr3 = ''

    if co_addr == '':
        co_addr = CA.get_old_addr(co_road_addr)
        record[2] = co_addr

    split_addr = co_addr.split()

    if len(split_addr) >= 3:
        addr3 = split_addr[2]

    if len(split_addr) >= 2:
        addr2 = split_addr[1]

    if len(split_addr) >= 1:
        addr1 = split_addr[0]

    co_addr_r = co_addr
    if '  ' in co_addr:
        co_addr_r = spacing(co_addr)

    if len(co_addr_r.split()) > 1:
        try:
            co_addr_r = AddrNormalization(co_addr_r.split())
        except:
            pass

    return [co_name_r, co_addr_r, addr1, addr2, addr3]
Example #16
def regex_spacing_normalization(data):
    del_filter1 = re.compile(r'[!?,.ㅋㅎㅜㅠ가-힣0-9]+')
    data[SENTENCE_IDX_COLUME_NAME] = 0
    df = pd.DataFrame(
        columns=[TEXT_REVIEW_COLUMN_NAME, SENTENCE_IDX_COLUME_NAME])
    for idx, item in enumerate(data[TEXT_REVIEW_COLUMN_NAME]):
        tmp = str(item)
        if tmp == 'nan':
            continue
        tmp = ' '.join(del_filter1.findall(tmp))
        tmp = spacing(tmp)
        tmp = repeat_normalize(tmp, num_repeats=2)

        df = df.append(
            {
                TEXT_REVIEW_COLUMN_NAME: tmp,
                SENTENCE_IDX_COLUME_NAME: idx
            },
            ignore_index=True)
    return df
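DataFrame.append, used in the loop above, is deprecated and was removed in pandas 2.x. A sketch of the same logic that collects rows in a list and builds the frame once, assuming the same column-name constants and the spacing/repeat_normalize imports from the surrounding code:

import re
import pandas as pd

def regex_spacing_normalization_v2(data):
    # Same filtering, spacing, and repeat-normalization as above, without per-row append
    del_filter1 = re.compile(r'[!?,.ㅋㅎㅜㅠ가-힣0-9]+')
    rows = []
    for idx, item in enumerate(data[TEXT_REVIEW_COLUMN_NAME]):
        tmp = str(item)
        if tmp == 'nan':
            continue
        tmp = ' '.join(del_filter1.findall(tmp))
        tmp = spacing(tmp)
        tmp = repeat_normalize(tmp, num_repeats=2)
        rows.append({TEXT_REVIEW_COLUMN_NAME: tmp, SENTENCE_IDX_COLUME_NAME: idx})
    return pd.DataFrame(rows, columns=[TEXT_REVIEW_COLUMN_NAME, SENTENCE_IDX_COLUME_NAME])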
Example #17
def main(args=sys.argv[1:]):
    args = get_parser().parse_args(args)

    source = args.infile.read()
    
    limit = 198
    result = ''
    for line in source.splitlines():
        while len(line) > limit:
            result += spacing(line[0:limit])
            line = line[limit:]
        result += spacing(line)  # space the remaining tail of the line as well
        result += '\n'

    if args.overwrite:
        args.infile.close()
        with open(args.infile.name, 'w', encoding=args.infile.encoding) as f:
            f.write(result)
    else:
        args.outfile.write(result)

    return 0 if (source == result) else 1
Example #18
    def __init__(self, inputStr, permitChange=False):
        self.twitter = Twitter()
        self.inputStr = inputStr

        self.answerList = list()
        self.konlpyList = list()
        self.konlpyList2 = list()
        self.spacingList = list()

        self.spacingStr = ""
        self.changeThing = ""
        self._josa = ""

        self.koreanhandler = koreanHandler(self.inputStr)
        self.space_string, self.punctuation = self.koreanhandler.deleteJM()
        self.spacingStr = spacing(self.space_string)
        self.spacingList = self.spacingStr.split(" ")
        self.konlpyList = self.twitter.pos(self.spacingStr)
        self.answerList = self.spacingList

        self.permitChange = permitChange
Example #19
def detect_document(input_path, output_path, page_num):
    client = vision.ImageAnnotatorClient()
    with io.open(input_path, 'rb') as image_file:
        content = image_file.read()
        image = vision.types.Image(content=content)
        # response: json
        response = client.document_text_detection(image=image)

    with io.open(output_path, 'w', encoding='UTF-8') as text_file:
        # write the page-number marker
        text_file.write('@@p' + str(page_num) + '\n')
        print(response.full_text_annotation.text)
        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                for paragraph in block.paragraphs:
                    par_word = ''
                    prev_word = ''
                    is_lineWrappingBreak = False
                    for word in paragraph.words:
                        cur_word = ''
                        for symbol in word.symbols:
                            par_word += symbol.text
                            cur_word += symbol.text
                            break_type = symbol.property.detected_break.type
                            if break_type == 3:
                                is_lineWrappingBreak = True
                                prev_word = cur_word
                            elif break_type == 1:
                                if is_lineWrappingBreak == True:
                                    prev_idx = par_word.rfind(prev_word)
                                    unSpaced = par_word[prev_idx:]
                                    par_word = par_word[:prev_idx]
                                    cur_word = spacing(unSpaced)
                                    is_lineWrappingBreak = False
                                    par_word += cur_word
                                par_word += ' '
                    # the page number itself may have been recognized as text
                    if par_word != str(page_num):
                        text_file.write(par_word)
                        text_file.write('\n')
Example #20
async def make_txt_file(img_path, page_num, txt_path):
    response = await detect_document(img_path)
    print('start', img_path)
    data = ''
    page_str = '@@p' + str(page_num) + '\n'
    for page in response.full_text_annotation.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                par_word = ''
                prev_word = ''
                is_line_wrapping_break = False
                for word in paragraph.words:
                    cur_word = ''
                    for symbol in word.symbols:
                        par_word += symbol.text
                        cur_word += symbol.text
                        break_type = symbol.property.detected_break.type
                        if break_type == 3:
                            is_line_wrapping_break = True
                            prev_word = cur_word
                        elif break_type == 1:
                            if is_line_wrapping_break:
                                prev_idx = par_word.rfind(prev_word)
                                un_spaced = par_word[prev_idx:]
                                par_word = par_word[:prev_idx]
                                cur_word = spacing(un_spaced)
                                is_line_wrapping_break = False
                                par_word += cur_word
                            par_word += ' '
                # remove page number
                if par_word != str(page_num):
                    data += par_word
                    data += '\n'

    if data == '':
        data = '빈 면\n'
    data = page_str + data
    with io.open(txt_path, 'w', encoding='UTF-8') as book_file:
        book_file.write(data)
    print('end', img_path)
Example #21
print(data_train)
print(data_test)

#Remove all spaces first
data_train['document'] = data_train['document'].replace(' ', '', regex = True) #strip every space from the strings
data_test['document'] = data_test['document'].replace(' ', '', regex = True) #strip every space from the strings

#Before spacing correction
print(data_train.head(10))
print(data_test.head(10))

#Restore spacing with pykospacing
for i in range(len(data_train)):
  if(type(data_train['document'][i])==float):
    data_train['document'][i]=""
  data_train['document'].iloc[i] = spacing(data_train['document'].iloc[i])
  if(i%500==0):
    print(i)
for i in range(len(data_test)):
  if(type(data_test['document'][i])==float):
    data_test['document'][i]=""
  data_test['document'].iloc[i] = spacing(data_test['document'].iloc[i])
  if(i%500==0):
    print(i)

#After spacing correction
print(data_train)
print(len(data_train))
print(data_test)

#Save the file with corrected spacing.
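The row-by-row loop above works but triggers pandas chained-indexing warnings. A sketch of the same preprocessing with Series.apply, assuming the same 'document' column and the function-style spacing import used in this example:

from pykospacing import spacing

def respace(text):
    # Treat NaN/non-string cells as empty strings, strip existing spaces, then restore them
    if not isinstance(text, str):
        return ""
    return spacing(text.replace(" ", ""))

data_train['document'] = data_train['document'].apply(respace)
data_test['document'] = data_test['document'].apply(respace)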
Example #22
    for val in data
]

# Remove author names that appear inside square brackets.
data = [re.sub(r'\[.+\]', '', val) for val in data]

warnings.filterwarnings("error")

spacing_ls = []

print(
    "### Spacing correction is running; because of the spacing and Okt tuning steps, this can take more than 5 minutes."
)
for idx, val in enumerate(data):
    try:
        spacing_ls.append(','.join([spacing(s) for s in val.split(".")
                                    ]).replace(",", ''))
    except:
        spacing_ls.append(val)

data = spacing_ls.copy()

# # tokenization process

# - Regular expressions removed e-mail addresses and other special characters, but the sentences still look messy.
# - To use the LDA algorithm, the sentences need to be converted into bags of words.
# - This step is called tokenization.

data_words = [okt.nouns(val) for val in data]

Example #23
if __name__ == '__main__':
	while(True):
		inputStr = input("input: ")
		if inputStr == 'q':
			print("End")
			break

		print('-------------------------------')
		print('Before Jellypy : %s' % inputStr)
		# Strip stray consonants/vowels (jamo) with the koreanHandler class
		koreanhandler = koreanHandler(inputStr)
		space_string, punctuation = koreanhandler.deleteJM()

		# First pass: restore spacing with pykospacing
		spacingStr = spacing(space_string)
		print('First preprocess : %s' % spacingStr)
		spacingList = spacingStr.split(" ")

		# Morphological analysis with the konlpy Twitter tagger
		konlpyList = twitter.pos(spacingStr)
		changeThing = ""
		print('형태소 분석 결과 : %s' % konlpyList)
		konlpyList2 = list()

		answerList = list()
		answerList = spacingList

		_iter = 0	
		while _iter < len(konlpyList):
			if konlpyList[_iter][1] == 'Noun' or konlpyList[_iter][1]=='Adjective' or konlpyList[_iter][1]=='Exclamation':
Example #24
from soynlp.normalizer import *
print(repeat_normalize('휴일 마지막 깨알같이 촘촘히ㅋㅋㅋㅋ', num_repeats=1))

def clean_text(texts):
    corpus = []
    for i in range(0, len(texts)):
        review = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"\^]', '',str(texts[i])) #remove punctuation
        review = re.sub(r'\s+', ' ', review) #remove extra space
        review = re.sub(r'<[^>]+>','',review) #remove Html tags
        review = re.sub(r'\s+', ' ', review) #remove spaces
        review = re.sub(r"^\s+", '', review) #remove space from start
        review = re.sub(r'\s+$', '', review) #remove space from the end
        review = re.sub(r'[ㄱ-ㅎㅏ-ㅣ]+', '', review)
        corpus.append(review)
    return corpus

clean_text('휴일 마지막 !')

# Install the spacing corrector (PyKoSpacing)
!pip install git+https://github.com/haven-jeon/PyKoSpacing.git

# Spacing correction

from pykospacing import spacing
print('아빠가방에 들어가신다'+'\n'+spacing(str('아빠가방에 들어가신다')))
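Note: recent PyKoSpacing releases expose a class-based API instead of the module-level function; if the import above fails with an ImportError, the newer equivalent should be:

from pykospacing import Spacing

spacing = Spacing()  # callable object; the calls above stay the same
print(spacing('아빠가방에 들어가신다'.replace(' ', '')))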

time.sleep(2000)

time.sleep(30000)

Example #25
def sent_spacing(datas):
    if type(datas) != list:
        datas = [datas]
    print(datas)
    return [spacing(d) for d in datas]
Example #26
def spacing_text(sentence):
    return spacing(sentence)
Example #27
    def auto_spacing(self, content):
        return spacing(content)
Example #28
    def _decode_to_string(self, tokens):
        text = ''.join(tokens)
        text = spacing(text)
        return text.strip()
Example #29
  try :
    spelled_sent = spell_checker.check(basic_preprocessed_corpus[i])
    checked_sent = spelled_sent.checked
    spelled_data.append(checked_sent)
  except:
    print("There is an error "+str(i)+ "index")
"""

# Spacing correction

from pykospacing import spacing

spacing_preprocessed_corpus = []

for i in range(len(basic_preprocessed_corpus)):
  spacing_preprocessed_corpus.append(spacing(str(basic_preprocessed_corpus[i])))

# Check a random sample
import random

index = random.randint(0, len(basic_preprocessed_corpus) - 1)
print("띄어쓰기 교정 전 \n====================\n{}\n".format(basic_preprocessed_corpus[index]))
print("띄어쓰기 교정 후 \n====================\n{}".format(spacing_preprocessed_corpus[index]))

len(spacing_preprocessed_corpus)

raw_data.columns =['Atelier', 'Review', 'rank']

final = raw_data[['Atelier', 'Review']]

final['Review'] = spacing_preprocessed_corpus
Example #30
import urllib.request
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import MaxScoreTokenizer
from soynlp.tokenizer import LTokenizer
# -*- coding: utf-8 -*-
from ckonlpy.tag import Twitter

from pykospacing import spacing

# Automatically restores word spacing
sent = "위 인수들을 사용할 때 고려해야 될점이있습니다. audio 데이터의 어떤 시점에 하나의 단어가 언급되고 있다면 그 단어는 잘려서 이상하게 인식될 것입니다. 이 harvard 데이터는 실험 목적으로 녹음된 것이기 때문에 초 단위로 잘라도 단어가 잘리지 않은 것 입니다."
new_sent = sent.replace(" ", '')
print(new_sent)
kospacing_sent = spacing(new_sent)
print(sent)
print(kospacing_sent)

# Register specific words as nouns
twitter = Twitter()
#twitter.add_dictionary('띄어쓰기', 'Noun')
print(twitter.morphs(kospacing_sent))