import pickle

import pandas as pd
from pythainlp import word_tokenize


def create_hashtag_list(source_path, destination_path):
    data = pd.read_csv(source_path, sep=',', doublequote=True)
    result = []
    for sentence in data.message:
        # 'attacut' is a word-segmentation engine in pythainlp, so the call
        # that accepts it is word_tokenize, not sent_tokenize.
        words = word_tokenize(sentence, engine='attacut')
        hashtags = [word.strip() for word in words if word.startswith('#')]
        result.extend(hashtags)
    with open(destination_path, 'wb') as f:
        pickle.dump(result, f)

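# Usage sketch (hypothetical paths): the source CSV is assumed to contain a
# 'message' column of post texts; the collected hashtags are pickled to
# destination_path as a flat list of strings.
def _demo_create_hashtag_list():
    create_hashtag_list('posts.csv', 'hashtags.pkl')
    with open('hashtags.pkl', 'rb') as f:
        return pickle.load(f)
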
def reprocess_lines(processed_lines):
    """
    Reprocesses lines using pythainlp to cut up sentences into shorter sentences.

    Many of the lines in BEST seem to be multiple Thai sentences concatenated,
    according to native Thai speakers.

    Input: a list of lines, where each line is a list of words.
           Space characters can be included as words
    Output: a new list of lines, resplit using pythainlp
    """
    reprocessed_lines = []
    for line in processed_lines:
        text = "".join(line)
        try:
            chunks = sent_tokenize(text)
        except NameError as e:
            raise NameError(
                "Sentences cannot be reprocessed without first installing pythainlp"
            ) from e
        # Check that the total text back is the same as the text in
        if sum(len(x) for x in chunks) != len(text):
            raise ValueError("Got unexpected text length: \n{}\nvs\n{}".format(
                text, chunks))
        chunk_lengths = [len(x) for x in chunks]

        current_length = 0
        new_line = []
        for word in line:
            if len(word) + current_length < chunk_lengths[0]:
                new_line.append(word)
                current_length = current_length + len(word)
            elif len(word) + current_length == chunk_lengths[0]:
                new_line.append(word)
                reprocessed_lines.append(new_line)
                new_line = []
                chunk_lengths = chunk_lengths[1:]
                current_length = 0
            else:
                remaining_len = chunk_lengths[0] - current_length
                new_line.append(word[:remaining_len])
                reprocessed_lines.append(new_line)
                word = word[remaining_len:]
                chunk_lengths = chunk_lengths[1:]
                while len(word) > chunk_lengths[0]:
                    new_line = [word[:chunk_lengths[0]]]
                    reprocessed_lines.append(new_line)
                    word = word[chunk_lengths[0]:]
                    chunk_lengths = chunk_lengths[1:]
                new_line = [word]
                current_length = len(word)
        reprocessed_lines.append(new_line)
    return reprocessed_lines

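# Minimal usage sketch for reprocess_lines, assuming pythainlp is installed so
# that sent_tokenize above resolves to pythainlp.sent_tokenize (as the
# NameError guard implies). The sample tokens are hypothetical, and the exact
# resplitting depends on the CRFCut model; the function only guarantees that
# every character of each input line is preserved across the resplit lines.
def _demo_reprocess_lines():
    lines = [['ฉัน', 'ชอบ', 'กิน', 'ข้าว', ' ', 'เขา', 'ชอบ', 'ดู', 'หนัง']]
    resplit = reprocess_lines(lines)

    # Characters are preserved in order; only the line boundaries change.
    assert ''.join(w for line in resplit for w in line) == ''.join(lines[0])
    return resplit
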
import os
import re

import pythainlp


def break_sentences_th(f):
    text = open(f).read()
    paragraphs = re.split('\n{2,}', text)
    paragraphs = [
        re.sub('[\n]', '', x) for x in paragraphs if re.search('[ก-์]{10}', x)
    ]
    sentences = [x for p in paragraphs for x in pythainlp.sent_tokenize(p)]
    output_file = os.path.splitext(f)[0] + '.sent'
    with open(output_file, mode='w') as out:
        for s in sentences:
            #out.write(' '.join(pythainlp.tokenize.word_tokenize(s)))
            out.write(s)
            out.write('\n')

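# Usage sketch (hypothetical file name): writes one sentence per line to a
# sibling '<name>.sent' file, keeping only paragraphs that contain a run of at
# least ten Thai characters ([ก-์]{10}).
def _demo_break_sentences_th():
    break_sentences_th('article.txt')  # produces 'article.sent' next to the input
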
import cv2
import pytesseract
from pythainlp import sent_tokenize


def main():
    pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
    originalImage = cv2.imread('image2.jpg')
    img = cv2.cvtColor(originalImage, cv2.COLOR_BGR2GRAY)
    cv2.imshow('Black white image', img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    custom_config = r'-l tha+eng --psm 6'
    text = pytesseract.image_to_string(img, config=custom_config)
    text = sent_tokenize(text, engine="whitespace+newline")
    print(text)
    print("-----------------------In main--------------------------\n")

def reprocess_lines(processed_lines):
    reprocessed_lines = []
    for line in processed_lines:
        text = "".join(line)
        # Resplit the concatenated line into sentences with pythainlp
        chunks = sent_tokenize(text)
        # Check that the total text coming back is the same as the text going in
        if sum(len(x) for x in chunks) != len(text):
            raise ValueError("Got unexpected text length: \n{}\nvs\n{}".format(
                text, chunks))
        chunk_lengths = [len(x) for x in chunks]

        current_length = 0
        new_line = []
        for word in line:
            if len(word) + current_length < chunk_lengths[0]:
                # The word fits entirely inside the current chunk
                new_line.append(word)
                current_length = current_length + len(word)
            elif len(word) + current_length == chunk_lengths[0]:
                # The word exactly completes the current chunk
                new_line.append(word)
                reprocessed_lines.append(new_line)
                new_line = []
                chunk_lengths = chunk_lengths[1:]
                current_length = 0
            else:
                # The word straddles one or more chunk boundaries: split it
                remaining_len = chunk_lengths[0] - current_length
                new_line.append(word[:remaining_len])
                reprocessed_lines.append(new_line)
                word = word[remaining_len:]
                chunk_lengths = chunk_lengths[1:]
                while len(word) > chunk_lengths[0]:
                    new_line = [word[:chunk_lengths[0]]]
                    reprocessed_lines.append(new_line)
                    word = word[chunk_lengths[0]:]
                    chunk_lengths = chunk_lengths[1:]
                new_line = [word]
                current_length = len(word)
        reprocessed_lines.append(new_line)
    return reprocessed_lines

def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization']['sentence_tokenizers'][lang]

    wl_nlp_utils.init_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    # Input of SudachiPy cannot be more than 49149 BYTES
    if sentence_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4:
        # Around 300 tokens per line, 4 characters per token and 4 bytes per character (≈ 49149 / 4 / 4 / 300)
        sections = wl_nlp_utils.split_into_chunks_text(text, section_size=10)
    else:
        sections = wl_nlp_utils.split_into_chunks_text(
            text,
            section_size=main.settings_custom['files']['misc']['read_files_in_chunks'])

    for section in sections:
        # NLTK
        if sentence_tokenizer == 'nltk_punkt':
            lang_texts = {
                'ces': 'czech',
                'dan': 'danish',
                'nld': 'dutch',
                # English
                'eng_gb': 'english',
                'eng_us': 'english',
                'est': 'estonian',
                'fin': 'finnish',
                'fra': 'french',
                # German
                'deu_at': 'german',
                'deu_de': 'german',
                'deu_ch': 'german',
                'ell': 'greek',
                'ita': 'italian',
                # Norwegian
                'nob': 'norwegian',
                'nno': 'norwegian',
                'pol': 'polish',
                # Portuguese
                'por_br': 'portuguese',
                'por_pt': 'portuguese',
                'rus': 'russian',
                'slv': 'slovene',
                'spa': 'spanish',
                'swe': 'swedish',
                'tur': 'turkish',
                # Other languages
                'other': 'english'
            }

            sentences.extend(nltk.sent_tokenize(section, language=lang_texts[lang]))
        # spaCy
        elif sentence_tokenizer.startswith('spacy_'):
            # Chinese, English, German, Portuguese
            if not lang.startswith('srp_'):
                lang = wl_conversion.remove_lang_code_suffixes(main, lang)

            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = nlp(section)

            sentences.extend([sentence.text for sentence in doc.sents])
        # Chinese & Japanese
        elif sentence_tokenizer in ['wordless_zho', 'wordless_jpn']:
            for line in section.splitlines():
                sentence_start = 0

                for i, char in enumerate(line):
                    if i >= sentence_start and char in ['。', '!', '?', '!', '?']:
                        for j, char_next in enumerate(line):
                            if j > i and char_next not in ['。', '!', '?', '!', '?', '’', '”', ')', ')']:
                                sentences.append(line[sentence_start:j])
                                sentence_start = j
                                break

                if sentence_start <= len(line):
                    sentences.append(line[sentence_start:])
        # Icelandic
        elif sentence_tokenizer == 'tokenizer_isl':
            for sentence in tokenizer.split_into_sentences(section):
                sentences.append(wl_word_detokenization.wl_word_detokenize(
                    main, tokens=sentence.split(), lang='isl'))
        # Thai
        elif sentence_tokenizer == 'pythainlp_crfcut':
            sentences.extend(pythainlp.sent_tokenize(section))
        # Tibetan
        elif sentence_tokenizer == 'botok_bod':
            wl_nlp_utils.init_word_tokenizers(main, lang='bod')

            tokens = main.botok_word_tokenizer.tokenize(section)

            for sentence_tokens in botok.sentence_tokenizer(tokens):
                sentences.append(''.join([
                    sentence_token.text
                    for sentence_token in sentence_tokens['tokens']
                ]))
        # Vietnamese
        elif sentence_tokenizer == 'underthesea_vie':
            sentences.extend(underthesea.sent_tokenize(section))

    # Strip spaces and drop sentences that are empty after stripping
    sentences = [
        sentence_non_empty
        for sentence in sentences
        if (sentence_non_empty := sentence.strip())
    ]

    return sentences

def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization']['sentence_tokenizers'][lang]

    wl_text_utils.check_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    # NLTK
    if sentence_tokenizer == main.tr('NLTK - Punkt Sentence Tokenizer'):
        lang_texts = {
            'ces': 'czech',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'est': 'estonian',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            # Greek (Modern)
            'ell': 'greek',
            'ita': 'italian',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'pol': 'polish',
            'por': 'portuguese',
            'rus': 'russian',
            'slv': 'slovene',
            'spa': 'spanish',
            'swe': 'swedish',
            'tur': 'turkish',
            # Other languages
            'other': 'english'
        }

        sentences = nltk.sent_tokenize(text, language=lang_texts[lang])
    # spaCy
    elif sentence_tokenizer == main.tr('spaCy - Sentencizer'):
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        sentences = [sentence.text for sentence in doc.sents]
    # syntok
    elif sentence_tokenizer == main.tr('syntok - Sentence Segmenter'):
        for para in syntok.segmenter.analyze(text):
            for sentence in para:
                sentences.append(''.join(
                    [token.spacing + token.value for token in sentence]))
    # Chinese & Japanese
    elif sentence_tokenizer in [
        main.tr('Wordless - Chinese Sentence Tokenizer'),
        main.tr('Wordless - Japanese Sentence Tokenizer')
    ]:
        for line in text.splitlines():
            sentence_start = 0

            for i, char in enumerate(line):
                if i >= sentence_start and char in ['。', '!', '?', '!', '?']:
                    for j, char in enumerate(line):
                        if j > i and char not in ['。', '!', '?', '!', '?', '’', '”', ')', ')']:
                            sentences.append(line[sentence_start:j])
                            sentence_start = j
                            break

            if sentence_start <= len(line):
                sentences.append(line[sentence_start:])
    # Icelandic
    elif sentence_tokenizer == main.tr('Tokenizer - Icelandic Sentence Tokenizer'):
        for sentence in tokenizer.split_into_sentences(text):
            sentences.append(wl_word_detokenization.wl_word_detokenize(
                main, tokens=sentence.split(), lang='isl'))
    # Russian
    elif sentence_tokenizer == main.tr('razdel - Russian Sentenizer'):
        sentences = [sentence.text for sentence in razdel.sentenize(text)]
    # Thai
    elif sentence_tokenizer == main.tr('PyThaiNLP - CRFCut'):
        sentences = pythainlp.sent_tokenize(text)
    # Tibetan
    elif sentence_tokenizer == main.tr('botok - Tibetan Sentence Tokenizer'):
        wl_text_utils.check_word_tokenizers(main, lang='bod')

        tokens = main.botok_word_tokenizer.tokenize(text)

        for sentence_tokens in botok.sentence_tokenizer(tokens):
            sentences.append(''.join([
                sentence_token.text
                for sentence_token in sentence_tokens[1]
            ]))
    # Vietnamese
    elif sentence_tokenizer == main.tr('Underthesea - Vietnamese Sentence Tokenizer'):
        sentences = underthesea.sent_tokenize(text)

    # Strip spaces
    sentences = [sentence.strip() for sentence in sentences]

    sentences = wl_text_utils.record_boundary_sentences(sentences, text)

    return sentences