Example No. 1
import pickle

import pandas as pd
from pythainlp import sent_tokenize


def create_hashtag_list(source_path, destination_path):
    data = pd.read_csv(source_path, sep=',', doublequote=True)
    result = list()
    for sentence in data.message:
        # Note: 'attacut' is a word-segmentation engine in PyThaiNLP, so
        # word_tokenize(sentence, engine='attacut') may have been intended here.
        words = sent_tokenize(sentence, engine='attacut')
        # startswith() also handles empty tokens, unlike word[0] == '#'.
        hashtag = [word.strip() for word in words if word.startswith('#')]
        result.extend(hashtag)
    with open(destination_path, 'wb') as f:
        pickle.dump(result, f)
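A quick sketch of how the function above might be used; the file names and the 'message' column are assumptions about the input data, not part of the original example:

# Hypothetical paths: a CSV of posts with a 'message' column goes in,
# a pickled list of extracted hashtags comes out.
create_hashtag_list('posts.csv', 'hashtags.pkl')

with open('hashtags.pkl', 'rb') as f:
    hashtags = pickle.load(f)
print(hashtags[:10])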
try:
    from pythainlp import sent_tokenize
except ImportError:
    # Leave sent_tokenize undefined; reprocess_lines turns the resulting
    # NameError into a clearer error message below.
    pass


def reprocess_lines(processed_lines):
    """
    Reprocesses lines using pythainlp to cut up sentences into shorter sentences.

    Many of the lines in BEST seem to be multiple Thai sentences concatenated, according to native Thai speakers.

    Input: a list of lines, where each line is a list of words.  Space characters can be included as words
    Output: a new list of lines, resplit using pythainlp
    """
    reprocessed_lines = []
    for line in processed_lines:
        text = "".join(line)
        try:
            chunks = sent_tokenize(text)
        except NameError as e:
            raise NameError(
                "Sentences cannot be reprocessed without first installing pythainlp"
            ) from e
        # Check that the re-joined sentence chunks cover exactly the original text
        if sum(len(x) for x in chunks) != len(text):
            raise ValueError("Got unexpected text length: \n{}\nvs\n{}".format(
                text, chunks))

        chunk_lengths = [len(x) for x in chunks]

        current_length = 0
        new_line = []
        for word in line:
            if len(word) + current_length < chunk_lengths[0]:
                new_line.append(word)
                current_length = current_length + len(word)
            elif len(word) + current_length == chunk_lengths[0]:
                new_line.append(word)
                reprocessed_lines.append(new_line)
                new_line = []
                chunk_lengths = chunk_lengths[1:]
                current_length = 0
            else:
                remaining_len = chunk_lengths[0] - current_length
                new_line.append(word[:remaining_len])
                reprocessed_lines.append(new_line)
                word = word[remaining_len:]
                chunk_lengths = chunk_lengths[1:]
                while len(word) > chunk_lengths[0]:
                    new_line = [word[:chunk_lengths[0]]]
                    reprocessed_lines.append(new_line)
                    word = word[chunk_lengths[0]:]
                    chunk_lengths = chunk_lengths[1:]
                new_line = [word]
                current_length = len(word)
        reprocessed_lines.append(new_line)
    return reprocessed_lines
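A minimal usage sketch for reprocess_lines, assuming PyThaiNLP is installed; the input line is made up for illustration:

# One input line: a list of word tokens (spaces count as tokens) whose
# concatenation contains more than one Thai sentence.
lines = [['ฉัน', 'กิน', 'ข้าว', ' ', 'เธอ', 'ไป', 'ไหน']]
new_lines = reprocess_lines(lines)
# Each output line is again a list of word tokens, re-split so that every
# line covers exactly one sentence as segmented by sent_tokenize; the function
# raises ValueError if the segmentation does not cover the text exactly.
print(new_lines)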
Example No. 3
import os
import re

import pythainlp


def break_sentences_th(f):
    text = open(f).read()
    # Split on blank lines into paragraphs.
    paragraphs = re.split('\n{2,}', text)
    # Keep only paragraphs that contain a run of at least 10 Thai characters,
    # and join their lines back into single strings.
    paragraphs = [
        re.sub('[\n]', '', x) for x in paragraphs if re.search('[ก-์]{10}', x)
    ]
    sentences = [x for p in paragraphs for x in pythainlp.sent_tokenize(p)]
    output_file = os.path.splitext(f)[0] + '.sent'
    with open(output_file, mode='w') as out:
        for s in sentences:
            #out.write(' '.join(pythainlp.tokenize.word_tokenize(s)))
            out.write(s)
            out.write('\n')
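A minimal sketch of calling the function above on a plain-text file; the path is hypothetical, and the sentences are written one per line to a sibling .sent file:

break_sentences_th('article.txt')  # hypothetical path; output goes to 'article.sent'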
Example No. 4
import cv2
import pytesseract
from pythainlp import sent_tokenize


def main():
    # Path to the Tesseract executable on Windows.
    pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

    # Load the image and convert it to grayscale for OCR.
    originalImage = cv2.imread('image2.jpg')
    img = cv2.cvtColor(originalImage, cv2.COLOR_BGR2GRAY)
    cv2.imshow('Black white image', img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    # Recognize Thai and English text, treating the image as a single block (--psm 6).
    custom_config = r'-l tha+eng --psm 6'
    text = pytesseract.image_to_string(img, config=custom_config)

    text = sent_tokenize(text, engine="whitespace+newline")

    print(text)
    print("-----------------------In main--------------------------\n")
Example No. 5
from pythainlp import sent_tokenize


def reprocess_lines(processed_lines):
    # Same idea as the version above: re-split lines of word tokens so that
    # each output line corresponds to one sentence found by sent_tokenize.
    reprocessed_lines = []
    for line in processed_lines:
        text = "".join(line)
        chunks = sent_tokenize(text)
        # The sentence chunks must cover exactly the original text.
        if sum(len(x) for x in chunks) != len(text):
            raise ValueError("Got unexpected text length: \n{}\nvs\n{}".format(
                text, chunks))

        chunk_lengths = [len(x) for x in chunks]

        current_length = 0
        new_line = []
        for word in line:
            if len(word) + current_length < chunk_lengths[0]:
                new_line.append(word)
                current_length = current_length + len(word)
            elif len(word) + current_length == chunk_lengths[0]:
                new_line.append(word)
                reprocessed_lines.append(new_line)
                new_line = []
                chunk_lengths = chunk_lengths[1:]
                current_length = 0
            else:
                remaining_len = chunk_lengths[0] - current_length
                new_line.append(word[:remaining_len])
                reprocessed_lines.append(new_line)
                word = word[remaining_len:]
                chunk_lengths = chunk_lengths[1:]
                while len(word) > chunk_lengths[0]:
                    new_line = [word[:chunk_lengths[0]]]
                    reprocessed_lines.append(new_line)
                    word = word[chunk_lengths[0]:]
                    chunk_lengths = chunk_lengths[1:]
                new_line = [word]
                current_length = len(word)
        reprocessed_lines.append(new_line)
    return reprocessed_lines
Example No. 6
import botok
import nltk
import pythainlp
import tokenizer  # the Icelandic 'tokenizer' package
import underthesea

# wl_nlp_utils, wl_conversion and wl_word_detokenization are Wordless's own
# modules and are assumed to be importable from the surrounding package.


def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    sentences = []

    # Fall back to the generic settings for languages without a dedicated tokenizer.
    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization'][
            'sentence_tokenizers'][lang]

    wl_nlp_utils.init_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    # SudachiPy (used by spaCy's Japanese pipeline) cannot take more than 49149 bytes of input
    if sentence_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4:
        # Assume roughly 300 tokens per line, 4 characters per token and
        # 4 bytes per character, so 49149 / 4 / 4 / 300 ≈ 10 lines per section
        sections = wl_nlp_utils.split_into_chunks_text(text, section_size=10)
    else:
        sections = wl_nlp_utils.split_into_chunks_text(
            text,
            section_size=main.settings_custom['files']['misc']
            ['read_files_in_chunks'])

    for section in sections:
        # NLTK
        if sentence_tokenizer == 'nltk_punkt':
            lang_texts = {
                'ces': 'czech',
                'dan': 'danish',
                'nld': 'dutch',
                # English
                'eng_gb': 'english',
                'eng_us': 'english',
                'est': 'estonian',
                'fin': 'finnish',
                'fra': 'french',
                # German
                'deu_at': 'german',
                'deu_de': 'german',
                'deu_ch': 'german',
                'ell': 'greek',
                'ita': 'italian',
                # Norwegian
                'nob': 'norwegian',
                'nno': 'norwegian',
                'pol': 'polish',
                # Portuguese
                'por_br': 'portuguese',
                'por_pt': 'portuguese',
                'rus': 'russian',
                'slv': 'slovene',
                'spa': 'spanish',
                'swe': 'swedish',
                'tur': 'turkish',
                # Other languages
                'other': 'english'
            }

            sentences.extend(
                nltk.sent_tokenize(section, language=lang_texts[lang]))
        # spaCy
        elif sentence_tokenizer.startswith('spacy_'):
            # Chinese, English, German and Portuguese codes carry variant suffixes
            # (e.g. eng_us) that must be stripped; Serbian keeps its script suffix.
            if not lang.startswith('srp_'):
                lang = wl_conversion.remove_lang_code_suffixes(main, lang)

            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = nlp(section)

            sentences.extend([sentence.text for sentence in doc.sents])
        # Chinese & Japanese
        elif sentence_tokenizer in ['wordless_zho', 'wordless_jpn']:
            for line in section.splitlines():
                sentence_start = 0

                for i, char in enumerate(line):
                    if i >= sentence_start and char in [
                            '。', '!', '?', '!', '?'
                    ]:
                        for j, char_next in enumerate(line):
                            if j > i and char_next not in [
                                    '。', '!', '?', '!', '?', '’', '”', ')', ')'
                            ]:
                                sentences.append(line[sentence_start:j])

                                sentence_start = j

                                break

                if sentence_start <= len(line):
                    sentences.append(line[sentence_start:])
        # Icelandic
        elif sentence_tokenizer == 'tokenizer_isl':
            for sentence in tokenizer.split_into_sentences(section):
                sentences.append(
                    wl_word_detokenization.wl_word_detokenize(
                        main, tokens=sentence.split(), lang='isl'))
        # Thai
        elif sentence_tokenizer == 'pythainlp_crfcut':
            sentences.extend(pythainlp.sent_tokenize(section))
        # Tibetan
        elif sentence_tokenizer == 'botok_bod':
            wl_nlp_utils.init_word_tokenizers(main, lang='bod')

            tokens = main.botok_word_tokenizer.tokenize(section)

            for sentence_tokens in botok.sentence_tokenizer(tokens):
                sentences.append(''.join([
                    sentence_token.text
                    for sentence_token in sentence_tokens['tokens']
                ]))
        # Vietnamese
        elif sentence_tokenizer == 'underthesea_vie':
            sentences.extend(underthesea.sent_tokenize(section))

    # Strip spaces and drop sentences that become empty
    sentences = [
        sentence_non_empty for sentence in sentences
        if (sentence_non_empty := sentence.strip())
    ]

    return sentences
Example No. 7
import botok
import nltk
import pythainlp
import razdel
import syntok.segmenter
import tokenizer  # the Icelandic 'tokenizer' package
import underthesea

# wl_text_utils and wl_word_detokenization are Wordless's own modules and are
# assumed to be importable from the surrounding package.


def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    sentences = []

    # Fall back to the generic settings for languages without a dedicated tokenizer.
    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization'][
            'sentence_tokenizers'][lang]

    wl_text_utils.check_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    # NLTK
    if sentence_tokenizer == main.tr('NLTK - Punkt Sentence Tokenizer'):
        lang_texts = {
            'ces': 'czech',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'est': 'estonian',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            # Greek (Modern)
            'ell': 'greek',
            'ita': 'italian',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'pol': 'polish',
            'por': 'portuguese',
            'rus': 'russian',
            'slv': 'slovene',
            'spa': 'spanish',
            'swe': 'swedish',
            'tur': 'turkish',
            # Other languages
            'other': 'english'
        }

        sentences = nltk.sent_tokenize(text, language=lang_texts[lang])
    # spaCy
    elif sentence_tokenizer == main.tr('spaCy - Sentencizer'):
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        sentences = [sentence.text for sentence in doc.sents]
    # syntok
    elif sentence_tokenizer == main.tr('syntok - Sentence Segmenter'):
        for para in syntok.segmenter.analyze(text):
            for sentence in para:
                sentences.append(''.join(
                    [token.spacing + token.value for token in sentence]))
    # Chinese & Japanese
    elif sentence_tokenizer in [
            main.tr('Wordless - Chinese Sentence Tokenizer'),
            main.tr('Wordless - Japanese Sentence Tokenizer')
    ]:
        for line in text.splitlines():
            sentence_start = 0

            for i, char in enumerate(line):
                if i >= sentence_start and char in ['。', '!', '?', '!', '?']:
                    for j, char_next in enumerate(line):
                        if j > i and char_next not in [
                                '。', '!', '?', '!', '?', '’', '”', ')', ')'
                        ]:
                            sentences.append(line[sentence_start:j])

                            sentence_start = j

                            break

            if sentence_start <= len(line):
                sentences.append(line[sentence_start:])
    # Icelandic
    elif sentence_tokenizer == main.tr(
            'Tokenizer - Icelandic Sentence Tokenizer'):
        for sentence in tokenizer.split_into_sentences(text):
            sentences.append(
                wl_word_detokenization.wl_word_detokenize(
                    main, tokens=sentence.split(), lang='isl'))
    # Russian
    elif sentence_tokenizer == main.tr('razdel - Russian Sentenizer'):
        sentences = [sentence.text for sentence in razdel.sentenize(text)]
    # Thai
    elif sentence_tokenizer == main.tr('PyThaiNLP - CRFCut'):
        sentences = pythainlp.sent_tokenize(text)
    # Tibetan
    elif sentence_tokenizer == main.tr('botok - Tibetan Sentence Tokenizer'):
        wl_text_utils.check_word_tokenizers(main, lang='bod')
        tokens = main.botok_word_tokenizer.tokenize(text)

        for sentence_tokens in botok.sentence_tokenizer(tokens):
            sentences.append(''.join([
                sentence_token.text for sentence_token in sentence_tokens[1]
            ]))
    # Vietnamese
    elif sentence_tokenizer == main.tr(
            'Underthesea - Vietnamese Sentence Tokenizer'):
        sentences = underthesea.sent_tokenize(text)

    # Strip spaces
    sentences = [sentence.strip() for sentence in sentences]

    sentences = wl_text_utils.record_boundary_sentences(sentences, text)

    return sentences