# Standard-library imports used by the functions below; project-local helpers
# (util, nltkSentTokenizer, expect_nng_text, expect_noun_text, extractnoun,
# kakao_postagger_nn_finder, krwordrank_noun, ...) are assumed to be imported
# elsewhere in this module.
import gc
import itertools
import os
import re
import time
from datetime import date, datetime, timedelta


def extract_file_noun(input, output):
    input_file = open(input, mode='r', encoding='utf-8')
    # truncate any previous output, then reopen in append mode
    open(output, mode='w', encoding='utf-8').close()
    output_file = open(output, mode='a', encoding='utf-8')

    line_number = 1
    while True:
        line = input_file.readline()
        if not line:
            break
        line = line.strip()

        for line_array in line.split("\n"):
            sentences = nltkSentTokenizer(line_array)
            sentence_words = []
            for sent in sentences:
                word_list = expect_nng_text(sent)
                if len(word_list):
                    for word in word_list:
                        # skip e-mail addresses, pure numbers, and pure-alphabet tokens
                        if util.check_email(word) or util.is_int(word) or util.is_alpha(word):
                            continue
                        output_file.write(word + os.linesep)
                        sentence_words.append(word)
                        # print(line_number, word)
            print(line_number, sentence_words)
        line_number += 1

    output_file.close()
    input_file.close()
def extract_file_noun(input, output):
    input_file = open(input, mode='r', encoding='utf-8')
    # truncate any previous output, then reopen in append mode
    open(output, mode='w', encoding='utf-8').close()
    output_file = open(output, mode='a', encoding='utf-8')

    line_number = 1
    while True:
        line = input_file.readline()
        if not line:
            break
        line = line.strip()

        for line_array in line.split("\n"):
            sentences = nltkSentTokenizer(line_array)
            sentence_words = []
            for sent in sentences:
                word_list = expect_noun_text(sent)
                if len(word_list):
                    for word in word_list:
                        if util.check_email(word):
                            continue
                        add_flag = True
                        # drop words containing punctuation or stray Korean jamo
                        for char in word:
                            if char in [
                                    "‘", "`", ",", "'", "\"", "|", "!", "@", "#", "$",
                                    "%", "^", "&", "*", "(", ")", "-", "_", "=", "+",
                                    "<", ">", ".", ";", ":",
                                    "ㄱ", "ㄴ", "ㄲ", "ㅂ", "ㅃ", "ㅈ", "ㅉ", "ㄷ", "ㄸ",
                                    "ㅁ", "ㅇ", "ㄹ", "ㅎ", "ㅅ", "ㅆ", "ㅍ", "ㅊ", "ㅌ",
                                    "ㅋ", "ㅛ", "ㅕ", "ㅑ", "ㅐ", "ㅔ", "ㅗ", "ㅓ", "ㅏ",
                                    "ㅣ", "ㅠ", "ㅜ", "ㅡ"
                            ]:
                                add_flag = False
                        # drop the word '기자' (reporter) and today's day-of-month ("N일")
                        if word == '기자' or word == str(date.today().day) + '일':
                            add_flag = False
                        if add_flag:
                            output_file.write(word + os.linesep)
                            sentence_words.append(word)
                            # print(line_number, word)
            print(line_number, sentence_words)
        line_number += 1

    output_file.close()
    input_file.close()
def extract_mecab_multi_noun(text, item_counter=0):
    """Extract scored multi-word noun candidates from Korean text.

    Returns every candidate when item_counter is 0, otherwise only the top
    item_counter entries.
    """
    text = text.strip()
    multi_noun = []
    multi_noun_score = {}
    krword_rank_noun = []
    krword_rank_noun_score = {}
    krword_rank_once_noun = []
    krword_rank_once_noun_score = {}

    if text:
        sentence_list = nltkSentTokenizer(text)

        # per-sentence MeCab-based multi/single noun candidates
        for sentence in sentence_list:
            sentence = sentence.strip()
            if sentence:
                first_multi_noun_list, _ = expect_multi_noun_text_ko(sentence)
                first_single_noun_list, _ = expect_single_noun_text_ko(sentence)
                first_multi_noun_list.extend(first_single_noun_list)

                second_multi_noun_list, second_multi_noun_list_score = cleaning_multi_noun(
                    first_multi_noun_list, cleaning_count=2)

                multi_noun.extend(second_multi_noun_list)
                multi_noun_score.update(second_multi_noun_list_score)

        # document-level KR-WordRank candidates
        krword_rank_noun, krword_rank_noun_score = krwordrank_noun(
            sentence_list=sentence_list, min_count=5)
        krword_rank_once_noun, krword_rank_once_noun_score = krwordrank_noun(
            sentence_list=sentence_list, min_count=2)

        multi_noun.extend(krword_rank_noun)
        multi_noun_score.update(krword_rank_noun_score)

    # stopword and trailing-character cleanup
    multi_noun, multi_noun_score = check_stopword(multi_noun, multi_noun_score)
    krword_rank_once_noun, krword_rank_once_noun_score = check_stopword(
        krword_rank_once_noun, krword_rank_once_noun_score)
    multi_noun, multi_noun_score = remove_last_one_char(multi_noun, multi_noun_score)
    multi_noun, multi_noun_score = check_stopword(multi_noun, multi_noun_score)

    # merge in the low-frequency KR-WordRank scores, then drop stopwords again
    multi_noun, multi_noun_score = multi_noun_score_add(
        multi_noun_score, krword_rank_once_noun_score)
    multi_noun, multi_noun_score = remove_stopword(multi_noun, multi_noun_score)

    # keep only candidates that actually occur in the original text
    return_multi_noun, return_multi_noun_score = text_in_mult_noun_finder(
        multi_noun, multi_noun_score, text)

    if item_counter == 0:
        return return_multi_noun, return_multi_noun_score
    return return_multi_noun[:item_counter], dict(
        itertools.islice(return_multi_noun_score.items(), item_counter))
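# Usage sketch (illustrative only; the sample sentence is a hypothetical input
# and the MeCab/KR-WordRank helpers above are assumed to be importable here):
#
#     sample = "삼성전자가 새로운 반도체 공장을 건설한다고 발표했다."
#     nouns, scores = extract_mecab_multi_noun(sample, item_counter=10)
#     print(nouns)   # up to 10 noun candidates
#     print(scores)  # {candidate: score, ...}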
def extract_file_noun(input, output, time_interval=0):
    # truncate any previous output
    output_file = open(output, mode='w', encoding='utf-8')
    output_file.close()

    line_number = 1
    input_file = open(input, mode='r', encoding='utf-8')
    while True:
        line = input_file.readline()
        if len(line) < 2:
            break
        line = line.strip()
        line = util.remove_naver_news(line)
        line = util.remove_http_tag(line)
        line = util.normalize(line)

        for line_array in line.split("\n"):
            sentences = nltkSentTokenizer(line_array)
            sentence_words = []
            sent = line_array.replace(' ', ' ')
            if len(sent.strip()) == 0:
                continue
            word_list = kakao_postagger_nn_finder(sent)
            if len(word_list):
                for word in word_list:
                    if util.check_email(word):
                        continue
                    # strip a single leading/trailing punctuation mark
                    if word.startswith((".", ",", "!", "?")):
                        word = word[1:]
                    if word.endswith((".", ",", "!", "?")):
                        word = word[:-1]
                    # skip words containing stray Korean jamo
                    one_korea_char = [
                        'ㅂ', 'ㅈ', 'ㄷ', 'ㄱ', 'ㅅ', 'ㅛ', 'ㅕ', 'ㅑ', 'ㅐ', 'ㅔ',
                        'ㅃ', 'ㅉ', 'ㄸ', 'ㄲ', 'ㅆ', 'ㅒ', 'ㅖ',
                        'ㅁ', 'ㄴ', 'ㅇ', 'ㄹ', 'ㅎ', 'ㅗ', 'ㅓ', 'ㅏ', 'ㅣ',
                        'ㅋ', 'ㅌ', 'ㅊ', 'ㅍ', 'ㅠ', 'ㅜ', 'ㅡ'
                    ]
                    matching = [s for s in one_korea_char if s in word]
                    if len(matching) > 0:
                        continue
                    # skip words that no longer occur verbatim in the sentence
                    if str(sent).find(word) < 0:
                        continue
                    output_file = open(output, mode='a', encoding='utf-8')
                    output_file.write(word + os.linesep)
                    output_file.close()
                    sentence_words.append(word)
                    # print(line_number, word)
                del word_list
            time.sleep(time_interval)
            print(line_number, sentence_words)
            gc.enable()
            gc.collect()
        line_number += 1

    input_file.close()
def extract_file_noun(input, output):
    input_file = open(input, mode='r', encoding='utf-8')
    # truncate any previous output
    open(output, mode='w', encoding='utf-8').close()

    line_number = 1
    yesterday_day = int((datetime.now() - timedelta(days=1)).strftime('%d'))

    while True:
        line = input_file.readline()
        if not line:
            break
        line = line.strip()

        for line_array in line.split("\n"):
            sentences = nltkSentTokenizer(line_array)
            sentence_words = []
            for sent in sentences:
                word_list = extractnoun.findKoNoun(sent)
                if len(word_list[0]):
                    for word in word_list[0]:
                        word = word.strip()
                        add_flag = True
                        # drop words containing punctuation or stray Korean jamo
                        for char in word:
                            if char in [
                                    "‘", "`", ",", "'", "\"", "\\", "|", "!", "@", "#",
                                    "$", "%", "^", "&", "*", "(", ")", "※", "~", "-",
                                    "_", "=", "+", "<", ">", ".", ";", ":",
                                    "ㄱ", "ㄴ", "ㄲ", "ㅂ", "ㅃ", "ㅈ", "ㅉ", "ㄷ", "ㄸ",
                                    "ㅁ", "ㅇ", "ㄹ", "ㅎ", "ㅅ", "ㅆ", "ㅍ", "ㅊ", "ㅌ",
                                    "ㅋ", "ㅛ", "ㅕ", "ㅑ", "ㅐ", "ㅔ", "ㅗ", "ㅓ", "ㅏ",
                                    "ㅣ", "ㅠ", "ㅜ", "ㅡ"
                            ]:
                                add_flag = False
                        # keep only short words (< 4 chars) that do not end like a
                        # verb/adjective form and are not '기자' (reporter) or
                        # yesterday's day-of-month ("N일")
                        bad_endings = ('니다', '그후로', '가요', '고요', '구요', '나요',
                                       '다요', '마요', '바요', '사요', '어요', '자요',
                                       '차요', '타요', '해요', '세요', '네요', '케요',
                                       '군요', '하', '텐데', '건데', '을려', '을껄',
                                       '습니', '씁니', '좀', '처럼', '된', '나', '넣',
                                       '먹', '있', '볼라', '…', '비트코', '기자', '할',
                                       '위안삼')
                        if add_flag and len(word) < 4 \
                                and not word.endswith(bad_endings) \
                                and word != '기자' \
                                and word != str(yesterday_day) + '일':
                            sentence_words.append(word)

            output_file = open(output, mode='a', encoding='utf-8')
            for word in sentence_words:
                output_file.write(word + os.linesep)
            output_file.close()

            print(line_number, sentence_words)
        line_number += 1

    input_file.close()
line_number = 1
while True:
    text = input_file.readline()
    if not text:
        break
    text = remove_keyboard_out_chractor(text)
    text = remove_naver_news(text)
    if len(re.findall('function', text)) > 1 or len(
            re.findall('var currentDateParam', text)) > 0:
        continue

    line_sentence = []
    for text_item in text.split("\n"):
        sentences = nltkSentTokenizer(text_item)
        for sent in sentences:
            for line in sent.split(r'\n'):
                if line.strip():
                    line = ut.normalizeText(line)
                    line_sentence.append(line)

    if len(line_sentence) < 3:
        continue

    print(line_number, line_sentence[:-2])
    line_number = line_number + 1
    contents = (" ".join(line_sentence[1:-2]).strip())