Example #1
def sent_tokenize(text, senttok):
    proc_sent = ExternalTextProcessor(senttok.split(' '))
    # content = base64.b64decode(text).decode("utf-8").replace("\t", " ")
    content = text.replace("\t", " ")
    sents = proc_sent.process(content).strip()
    sents = [s.strip() for s in sents.split("\n") if s.strip()]
    return sents
def extract_encoded_text(encoded, sent_tokeniser, word_tokeniser,
                         morph_analyser):
    if not sent_tokeniser:
        return encoded

    content = base64.b64decode(encoded).decode("utf-8").replace("\t", " ")
    tokenized_segs = []
    seg = ""
    sent_tokeniser.writeline(html.escape(content.strip()) + "\n")
    while seg != "<P>":
        seg = sent_tokeniser.readline().strip()
        if seg != "" and seg != "<P>":
            tokenized_segs.append(html.unescape(seg))

    tokenized_filtered = []
    for sent in tokenized_segs:
        # Keep sentences that are not mostly punctuation/digits
        if sum(1 for m in sent
               if m in string.punctuation + string.digits) < len(sent) // 2:
            tokenized_filtered.append(sent)
    if not word_tokeniser:
        # tokenized_filtered is a list of sentences here, so join before encoding
        b64text = base64.b64encode(
            "\n".join(tokenized_filtered).lower().encode("utf-8"))
        return b64text.decode()

    tokenized_text = ""
    for sent in tokenized_filtered:
        word_tokeniser.writeline(sent)
        tokenized_text += word_tokeniser.readline().strip() + "\n"
    if morph_analyser:
        proc_morph = ExternalTextProcessor(
            morph_analyser.split())  # Apertium does line buffering
        tokenized_text = proc_morph.process(tokenized_text)

    b64text = base64.b64encode(tokenized_text.lower().encode("utf-8"))
    return b64text.decode()
Example #3
def word_tokenize(sents, wordtok):
    proc_word = ExternalTextProcessor(wordtok.split(' '))
    ret = []
    for sent in sents:
        words = proc_word.process(sent)
        ret.append(words.strip())
    return ret
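Taken together, sent_tokenize and word_tokenize above form a two-stage pipeline: split the raw text into one sentence per line, then tokenise each sentence with an external command. A minimal usage sketch, assuming both helpers and ExternalTextProcessor are importable from the surrounding project; the commands are placeholders, not real scripts:

# Hypothetical usage sketch; "./sent-split.sh" and "./word-tok.sh" are
# placeholders for whatever external tools the project actually wraps.
def tokenize_document(text,
                      senttok_cmd="./sent-split.sh",
                      wordtok_cmd="./word-tok.sh"):
    sents = sent_tokenize(text, senttok_cmd)   # one sentence per list entry
    return word_tokenize(sents, wordtok_cmd)   # each sentence tokenised
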
Example #4
def split_sentences(original_text, sentence_splitter_cmd):
    if sentence_splitter_cmd:
        proc = ExternalTextProcessor(sentence_splitter_cmd.split())
        text_split = proc.process(original_text.replace("\n\n", "\n"))
    else:
        text_split = original_text.replace("\n\n", "\n")

    output = html.unescape(text_split)

    return [n for n in output.split("\n") if filter_digits_and_punctuation(n)]
Example #5
def extract_encoded_text(encoded, sent_tokeniser, word_tokeniser,
                         morph_analyser):
    if not sent_tokeniser:
        return encoded

    proc_sent = ExternalTextProcessor(sent_tokeniser.split())
    content = base64.b64decode(encoded).decode("utf-8").replace("\t", " ")
    tokenized_segs = proc_sent.process(content).strip()
    tokenized_filtered = ""

    for sent in tokenized_segs.split("\n"):
        # Keep sentences that are not mostly punctuation/digits
        if sum(1 for m in sent
               if m in string.punctuation + string.digits) < len(sent) // 2:
            tokenized_filtered += sent + "\n"

    if not word_tokeniser:
        b64text = base64.b64encode(tokenized_filtered.lower().encode("utf-8"))
        return b64text.decode()

    proc_word = ExternalTextProcessor(word_tokeniser.split())
    tokenized_text = proc_word.process(tokenized_filtered)

    if morph_analyser:
        proc_morph = ExternalTextProcessor(morph_analyser.split())
        tokenized_text = proc_morph.process(tokenized_text)

    b64text = base64.b64encode(tokenized_text.lower().encode("utf-8"))
    return b64text.decode()
Example #6
def ngrams_from_text(n, hash_values, ignore_set, word_tokeniser_cmd, page):
    proc = ExternalTextProcessor(word_tokeniser_cmd.split(' '))
    segments = proc.process(page).split("\n")
    words = []
    for s in segments:
        words.extend(s.split(' '))
    ngrams = _ngram_helper(words, n, hash_values)

    if ignore_set:
        return [ng for ng in ngrams if ng not in ignore_set]

    return ngrams
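ngrams_from_text delegates to an _ngram_helper that is not shown in this excerpt. Purely as an assumption about its shape (a sliding window over the token list, with optional hashing), a sketch might look like the following; it is not the project's actual implementation:

# Assumed sketch of _ngram_helper, for illustration only.
def _ngram_helper(words, n, hash_values):
    # Slide a window of length n over the token list
    ngrams = [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]
    if hash_values:
        # Optionally replace each n-gram with a hash for compact comparison
        return [hash(ng) for ng in ngrams]
    return ngrams
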
Example #7
def extract_encoded_text(encodedtext, encodedtokenized, tmp_file,
                         tmp_file_origtext, sent_tokeniser):
    proc_sent = ExternalTextProcessor(sent_tokeniser.split(' '))
    content = base64.b64decode(encodedtext).decode("utf-8").replace("\t", " ")
    tokenized_segs = proc_sent.process(content).strip()
    tokenized_filtered = ""
    for sent in tokenized_segs.split("\n"):
        # Keep sentences that are not mostly punctuation/digits
        if sum(1 for m in sent
               if m in string.punctuation + string.digits) < len(sent) // 2:
            tokenized_filtered += sent + "\n"
    tmp_file_origtext.write(tokenized_filtered.encode())
    content_tokenized = base64.b64decode(encodedtokenized)
    tmp_file.write(content_tokenized)
def langsplit(uri, langsplit_exec, text):
    cmd = [langsplit_exec, "--printchunks"]
    proc = ExternalTextProcessor(cmd)
    tld = uri.split("/")[0].split(".")[-1]
    header = u"%s tld:%s uri:%s\n" % (magic_numer, tld, uri)
    output = proc.process(u"\n".join([header, text]))

    if not output.strip():
        res = langid.classify(text)
        lang = res[0]
        header = "%s\tlanguage:%s\tbytes:%d\n" % (header.rstrip(), lang,
                                                  len(text.encode("utf-8")))
        return header + text
    return output
def split_sentences(text, sentence_splitter_cmd, lang):
    if not sentence_splitter_cmd:
        return text.split('\n')

    sentences = []
    proc = ExternalTextProcessor([sentence_splitter_cmd, "-l", lang])
    output = proc.process(text.replace("\n", "\n\n"))

    for line in output.split("\n"):
        line = line.strip()
        if not line or line == "<P>":
            continue
        sentences.append(line)

    return sentences
Example #12
def langsplit(uri, text,
              langsplit_exec="/home/buck/net/build/mtma_bitext/html_convert/langsplit"):
    cmd = [langsplit_exec, "--printchunks"]
    proc = ExternalTextProcessor(cmd)
    tld = uri.split("/")[0].split(".")[-1]
    header = u"%s tld:%s uri:%s\n" % (magic_number, tld, uri)
    output = proc.process(u"\n".join([header, text]))

    if not output.strip():
        import langid
        res = langid.classify(text)
        lang = res[0]
        header = "%s\tlanguage:%s\tbytes:%d\n" % (header.rstrip(),
                                                  lang,
                                                  len(text.encode("utf-8")))
        return header + text
    return output
Example #13
def split_sentences(original_text, sentence_splitter_cmd, prune_type,
                    prune_threshold):
    # print("original_text", len(original_text))
    proc = ExternalTextProcessor(sentence_splitter_cmd.split())

    tmp1 = original_text.replace("\n\n", "\n")
    # print("tmp1", len(tmp1))

    tmp2 = proc.process(tmp1)
    # print("tmp2", len(tmp2))

    tmp3 = html.unescape(tmp2)
    # print("tmp3", len(tmp3))

    tmp4 = [n for n in tmp3.split("\n") if filter_digits_and_punctuation(n)]
    # print("tmp4", len(tmp4))

    tmp5 = []
    count = 0
    for extracted_line in tmp4:
        extracted_line = extracted_line.strip()

        if not extracted_line:
            # print("empty line")
            continue

        if prune_type == "chars":
            if len(extracted_line) > prune_threshold:
                continue
        elif prune_type == "words":
            if len(extracted_line.split()) > prune_threshold:
                continue

        tmp5.append(extracted_line)

        count += 1
    # print("tmp5", len(tmp5))

    return tmp5
Example #14
def write_sentences(html, lang, sent_tokenizer, outfile, lid=None):
    html = base64.b64decode(html).decode("utf8")
    tagchunks = parsers.parse(html, lang).split("\n")
    chunks = [
        re_space.sub(" ", tc).strip() for tc in tagchunks
        if not re_tag.match(tc.strip())
    ]
    proc_sent = ExternalTextProcessor(sent_tokenizer.split(' '))
    dedup = set()
    n_sents = 0
    for chunk in chunks:
        if chunk.strip():
            if lid:
                pred = lid.predict([chunk])[0]
                if pred[0][0][9:] != lang:
                    continue
            tokenized_segs = proc_sent.process(chunk).strip()
            for sent in tokenized_segs.split("\n"):
                if sent not in dedup:
                    print(sent, file=outfile)
                    dedup.add(sent)
                    n_sents += 1
    return n_sents
Example #15
def get_tokenizer(cmd, lang="en"):
    if cmd == "moses":
        return MosesTokenizer(lang)
    elif cmd == "mecab":
        tagger = MeCab.Tagger("-Owakati")

        def mecab(text):
            return tagger.parse(text).strip().split()
        return mecab
    else:
        proc = ExternalTextProcessor(cmd.split())

        def external(text):
            return proc.process(text).strip().split()
        return external
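Whichever branch is taken, get_tokenizer returns a callable that maps a string to a list of tokens, so callers stay agnostic about the backend. A usage sketch with a placeholder external command (not a real script):

# "./my-tokeniser.sh" is a placeholder: any command that reads text on stdin
# and writes whitespace-separated tokens to stdout would work here.
tokenize = get_tokenizer("./my-tokeniser.sh")
tokens = tokenize("This is a test.")  # list of tokens produced by the command
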
def extract_encoded_text(encodedtext, tmp_file, tmp_file_origtext, morphanal,
                         sent_tokeniser, word_tokeniser):
    proc_sent = ExternalTextProcessor(sent_tokeniser.split(' '))
    proc_word = ExternalTextProcessor(word_tokeniser.split(' '))
    content = base64.b64decode(encodedtext).decode("utf-8").replace("\t", " ")
    tokenized_segs = proc_sent.process(content).strip()
    tmp_file_origtext.write(tokenized_segs.encode())
    tokenized_text = proc_word.process(tokenized_segs)

    if morphanal is not None:
        morphanalyser = ["/bin/bash", morphanal]
        tokenized_text = run_analyse(morphanalyser, tokenized_text)
    tmp_file.write(tokenized_text.lower().encode())
def normalize(text, normalizer_cmd, lang):
    proc = ExternalTextProcessor([normalizer_cmd, lang])
    output = proc.process(text.strip())
    return output
def tokenize(text, tokenizer_cmd, lang):
    proc = ExternalTextProcessor([tokenizer_cmd, "-a", "-l", lang])
    output = proc.process(text.strip())
    return output
Example #21
def split_sentences(text, sentence_splitter_cmd, lang):
    proc = ExternalTextProcessor([sentence_splitter_cmd, "-l", lang])
    output = proc.process(text.replace("\n\n", "\n"))

    return [n for n in output.split("\n") if filter_digits_and_punctuation(n)]
Example #22
    parser.add_argument('-url2en',
                        help='url to English text',
                        type=argparse.FileType('r'))
    parser.add_argument('-url2fr',
                        help='url to French text',
                        type=argparse.FileType('r'))
    parser.add_argument('-write',
                        help='filename for pickle file',
                        type=argparse.FileType('wb'))
    args = parser.parse_args()

    source_tokenizer = None
    if args.source_tokenizer:
        if args.source_tokenizer == 'WordPunctTokenizer':
            source_tokenizer = WordPunctTokenizer()
        else:
            source_tokenizer = ExternalTextProcessor(args.source_tokenizer)
    target_tokenizer = None
    if args.target_tokenizer:
        if args.target_tokenizer == 'WordPunctTokenizer':
            target_tokenizer = WordPunctTokenizer()
        else:
            target_tokenizer = ExternalTextProcessor(args.target_tokenizer)

    # read source and target corpus
    s, t = read_lett(args.lettfile, args.slang, args.tlang, source_tokenizer,
                     target_tokenizer, False, args.url2fr, args.url2en, True)

    sys.stderr.write(
        "Read %d %s docs and %d %s docs from %s\n" %
        (len(s), args.slang, len(t), args.tlang, args.lettfile.name))
    sys.stderr.write("Source stats: ")
Example #23
            ) != 0 and options.morphanal2 is not None and lang == options.lang2:
                morphanalyser = ["/bin/bash", options.morphanal2]
                tpmorph = subprocess.Popen(morphanalyser,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE,
                                           stdin=subprocess.PIPE)
                morph_stdout, error = tpmorph.communicate(input=text)
                if len(error.strip()) == 0:
                    text = re.sub(
                        r"\^\*?", r"",
                        re.sub(r"[/<][^$]*\$", r"",
                               morph_stdout.decode("utf-8")))

            # Getting the bag of words in the document
            if lang == options.lang1:
                proc = ExternalTextProcessor(options.wordtokeniser1.split(' '))
            elif lang == options.lang2:
                proc = ExternalTextProcessor(options.wordtokeniser2.split(' '))

            sorted_uniq_wordlist = set(proc.process(text).lower().split())
            # Trimming non-alphanumerics:
            clean_sorted_uniq_wordlist = [
                _f
                for _f in [w.strip(punctuation) for w in sorted_uniq_wordlist]
                if _f
            ]
            sorted_uniq_wordlist = clean_sorted_uniq_wordlist

            for word in sorted_uniq_wordlist:
                if lang in word_map:
                    if word in word_map[lang]: