def sent_tokenize(text, senttok):
    proc_sent = ExternalTextProcessor(senttok.split(' '))
    # content = base64.b64decode(text).decode("utf-8").replace("\t", " ")
    content = text.replace("\t", " ")
    sents = proc_sent.process(content).strip()
    sents = [s.strip() for s in sents.split("\n") if s.strip()]
    return sents
def extract_encoded_text(encoded, sent_tokeniser, word_tokeniser, morph_analyser):
    if not sent_tokeniser:
        return encoded

    content = base64.b64decode(encoded).decode("utf-8").replace("\t", " ")
    tokenized_segs = []
    seg = ""
    sent_tokeniser.writeline(html.escape(content.strip()) + "\n")

    while seg != "<P>":
        seg = sent_tokeniser.readline().strip()
        if seg != "" and seg != "<P>":
            tokenized_segs.append(html.unescape(seg))

    tokenized_filtered = []
    for sent in tokenized_segs:
        if sum([1 for m in sent if m in string.punctuation + string.digits]) < len(sent) // 2:
            tokenized_filtered.append(sent)

    if not word_tokeniser:
        # tokenized_filtered is a list, so join it before lower-casing and encoding
        b64text = base64.b64encode("\n".join(tokenized_filtered).lower().encode("utf-8"))
        return b64text.decode()

    tokenized_text = ""
    for sent in tokenized_filtered:
        word_tokeniser.writeline(sent)
        tokenized_text = tokenized_text + word_tokeniser.readline().strip() + "\n"

    if morph_analyser:
        proc_morph = ExternalTextProcessor(morph_analyser.split())  # Apertium does line buffering
        tokenized_text = proc_morph.process(tokenized_text)

    b64text = base64.b64encode(tokenized_text.lower().encode("utf-8"))
    return b64text.decode()
def word_tokenize(sents, wordtok):
    proc_word = ExternalTextProcessor(wordtok.split(' '))
    ret = []
    for sent in sents:
        words = proc_word.process(sent)
        ret.append(words.strip())
    return ret
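A minimal usage sketch chaining sent_tokenize and word_tokenize; the Moses script paths below are placeholders (assumptions), not commands taken from the original code.

# Hypothetical commands; the paths and flags are assumptions for illustration.
senttok_cmd = "perl /opt/mosesdecoder/scripts/ems/support/split-sentences.perl -l en -q"
wordtok_cmd = "perl /opt/mosesdecoder/scripts/tokenizer/tokenizer.perl -l en -q"

sents = sent_tokenize("First sentence. Second sentence.", senttok_cmd)   # list of sentences
tokens = word_tokenize(sents, wordtok_cmd)                               # one tokenised line per sentence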
def split_sentences(original_text, sentence_splitter_cmd):
    if sentence_splitter_cmd:
        proc = ExternalTextProcessor(sentence_splitter_cmd.split())
        text_split = proc.process(original_text.replace("\n\n", "\n"))
    else:
        text_split = original_text.replace("\n\n", "\n")

    output = html.unescape(text_split)
    return [n for n in output.split("\n") if filter_digits_and_punctuation(n)]
def extract_encoded_text(encoded, sent_tokeniser, word_tokeniser, morph_analyser):
    if not sent_tokeniser:
        return encoded

    proc_sent = ExternalTextProcessor(sent_tokeniser.split())
    content = base64.b64decode(encoded).decode("utf-8").replace("\t", " ")
    tokenized_segs = proc_sent.process(content).strip()
    tokenized_filtered = ""

    for sent in tokenized_segs.split("\n"):
        if sum([1 for m in sent if m in string.punctuation + string.digits]) < len(sent) // 2:
            tokenized_filtered += sent + "\n"

    if not word_tokeniser:
        b64text = base64.b64encode(tokenized_filtered.lower().encode("utf-8"))
        return b64text.decode()

    proc_word = ExternalTextProcessor(word_tokeniser.split())
    tokenized_text = proc_word.process(tokenized_filtered)

    if morph_analyser:
        proc_morph = ExternalTextProcessor(morph_analyser.split())
        tokenized_text = proc_morph.process(tokenized_text)

    b64text = base64.b64encode(tokenized_text.lower().encode("utf-8"))
    return b64text.decode()
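The sketch below shows how this variant might be invoked end to end; only the base64 round-trip mirrors the function above, and the tokenizer commands are placeholder assumptions.

# Hypothetical usage; the commands passed in are assumptions, not from the original code.
import base64

raw = "A first sentence. A second one."
encoded = base64.b64encode(raw.encode("utf-8")).decode()

result = extract_encoded_text(encoded,
                              sent_tokeniser="perl split-sentences.perl -l en -q",  # assumed command
                              word_tokeniser="perl tokenizer.perl -l en -q",        # assumed command
                              morph_analyser=None)
print(base64.b64decode(result).decode("utf-8"))  # lower-cased, tokenised text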
def ngrams_from_text(n, hash_values, ignore_set, word_tokeniser_cmd, page):
    proc = ExternalTextProcessor(word_tokeniser_cmd.split(' '))
    segments = proc.process(page).split("\n")
    words = []
    for s in segments:
        words.extend(s.split(' '))

    ngrams = _ngram_helper(words, n, hash_values)

    if ignore_set:
        return [ng for ng in ngrams if ng not in ignore_set]
    return ngrams
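_ngram_helper is not defined in this snippet; the sketch below is one plausible stand-in, assuming hash_values means "return a hash per n-gram instead of the n-gram string". It is an assumption for illustration, not the original helper.

# Hypothetical helper; the hashing behaviour is an assumption for illustration.
def _ngram_helper(words, n, hash_values):
    words = [w for w in words if w]
    ngrams = [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]
    if hash_values:
        return [hash(ng) for ng in ngrams]
    return ngrams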
def extract_encoded_text(encodedtext, encodedtokenized, tmp_file, tmp_file_origtext, sent_tokeniser):
    proc_sent = ExternalTextProcessor(sent_tokeniser.split(' '))
    content = base64.b64decode(encodedtext).decode("utf-8").replace("\t", " ")
    tokenized_segs = proc_sent.process(content).strip()
    tokenized_filtered = ""

    for sent in tokenized_segs.split("\n"):
        if sum([1 for m in sent if m in string.punctuation + string.digits]) < len(sent) // 2:
            tokenized_filtered += sent + "\n"

    tmp_file_origtext.write(tokenized_filtered.encode())
    content_tokenized = base64.b64decode(encodedtokenized)
    tmp_file.write(content_tokenized)
def langsplit(uri, langsplit_exec, text):
    cmd = [langsplit_exec, "--printchunks"]
    proc = ExternalTextProcessor(cmd)
    tld = uri.split("/")[0].split(".")[-1]
    header = u"%s tld:%s uri:%s\n" % (magic_number, tld, uri)
    output = proc.process(u"\n".join([header, text]))

    if not output.strip():
        res = langid.classify(text)
        lang = res[0]
        header = "%s\tlanguage:%s\tbytes:%d\n" % (header.rstrip(), lang,
                                                  len(text.encode("utf-8")))
        return header + text

    return output
def split_sentences(text, sentence_splitter_cmd, lang):
    if not sentence_splitter_cmd:
        return text.split('\n')

    sentences = []
    proc = ExternalTextProcessor([sentence_splitter_cmd, "-l", lang])
    output = proc.process(text.replace("\n", "\n\n"))

    for line in output.split("\n"):
        line = line.strip()
        if not line or line == "<P>":
            continue
        sentences.append(line)

    return sentences
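A usage sketch, assuming sentence_splitter_cmd points at a Moses-style splitter that accepts -l <lang> and emits <P> paragraph markers (which the loop above skips); the path is a placeholder assumption.

# Hypothetical call; the splitter path is an assumption.
sentences = split_sentences("Hello world. How are you?\nSecond paragraph.",
                            "/opt/mosesdecoder/scripts/ems/support/split-sentences.perl",
                            "en")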
def langsplit(uri, text,
              langsplit_exec="/home/buck/net/build/mtma_bitext/html_convert/langsplit"):
    cmd = [langsplit_exec, "--printchunks"]
    proc = ExternalTextProcessor(cmd)
    tld = uri.split("/")[0].split(".")[-1]
    header = u"%s tld:%s uri:%s\n" % (magic_number, tld, uri)
    output = proc.process(u"\n".join([header, text]))

    if not output.strip():
        import langid
        res = langid.classify(text)
        lang = res[0]
        header = "%s\tlanguage:%s\tbytes:%d\n" % (header.rstrip(), lang,
                                                  len(text.encode("utf-8")))
        return header + text

    return output
def split_sentences(original_text, sentence_splitter_cmd, prune_type, prune_threshold):
    # print("original_text", len(original_text))
    proc = ExternalTextProcessor(sentence_splitter_cmd.split())

    tmp1 = original_text.replace("\n\n", "\n")
    # print("tmp1", len(tmp1))
    tmp2 = proc.process(tmp1)
    # print("tmp2", len(tmp2))
    tmp3 = html.unescape(tmp2)
    # print("tmp3", len(tmp3))
    tmp4 = [n for n in tmp3.split("\n") if filter_digits_and_punctuation(n)]
    # print("tmp4", len(tmp4))

    tmp5 = []
    count = 0
    for extracted_line in tmp4:
        extracted_line = extracted_line.strip()

        if not extracted_line:
            # print("empty line")
            continue

        if prune_type == "chars":
            if len(extracted_line) > prune_threshold:
                continue
        elif prune_type == "words":
            if len(extracted_line.split()) > prune_threshold:
                continue

        tmp5.append(extracted_line)
        count += 1

    # print("tmp5", len(tmp5))
    return tmp5
def write_sentences(html, lang, sent_tokenizer, outfile, lid=None):
    html = base64.b64decode(html).decode("utf8")
    tagchunks = parsers.parse(html, lang).split("\n")
    chunks = [
        re_space.sub(" ", tc).strip()
        for tc in tagchunks if not re_tag.match(tc.strip())
    ]
    proc_sent = ExternalTextProcessor(sent_tokenizer.split(' '))
    dedup = set()
    n_sents = 0

    for chunk in chunks:
        if chunk.strip():
            if lid:
                pred = lid.predict([chunk])[0]
                if pred[0][0][9:] != lang:
                    continue
            tokenized_segs = proc_sent.process(chunk).strip()
            for sent in tokenized_segs.split("\n"):
                if sent not in dedup:
                    print(sent, file=outfile)
                    dedup.add(sent)
                    n_sents += 1

    return n_sents
def get_tokenizer(cmd, lang="en"):
    if cmd == "moses":
        return MosesTokenizer(lang)
    elif cmd == "mecab":
        tagger = MeCab.Tagger("-Owakati")

        def mecab(text):
            return tagger.parse(text).strip().split()

        return mecab
    else:
        proc = ExternalTextProcessor(cmd.split())

        def external(text):
            return proc.process(text).strip().split()

        return external
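get_tokenizer returns a tokenization callable in every branch; the sketch below only exercises the external-command branch, since the exact MosesTokenizer and MeCab packages in use are not shown here. The command string is a placeholder assumption.

# Hypothetical external command; the path and flags are assumptions.
tok = get_tokenizer("perl /opt/mosesdecoder/scripts/tokenizer/tokenizer.perl -l en -q")
print(tok("Hello, world!"))  # a list of tokens, e.g. ['Hello', ',', 'world', '!']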
def extract_encoded_text(encodedtext, tmp_file, tmp_file_origtext, morphanal,
                         sent_tokeniser, word_tokeniser):
    proc_sent = ExternalTextProcessor(sent_tokeniser.split(' '))
    proc_word = ExternalTextProcessor(word_tokeniser.split(' '))
    content = base64.b64decode(encodedtext).decode("utf-8").replace("\t", " ")
    tokenized_segs = proc_sent.process(content).strip()
    tmp_file_origtext.write(tokenized_segs.encode())
    tokenized_text = proc_word.process(tokenized_segs)

    if morphanal is not None:
        morphanalyser = ["/bin/bash", morphanal]
        tokenized_text = run_analyse(morphanalyser, tokenized_text)

    tmp_file.write(tokenized_text.lower().encode())
def normalize(text, normalizer_cmd, lang):
    proc = ExternalTextProcessor([normalizer_cmd, lang])
    output = proc.process(text.strip())
    return output
def tokenize(text, tokenizer_cmd, lang):
    proc = ExternalTextProcessor([tokenizer_cmd, "-a", "-l", lang])
    output = proc.process(text.strip())
    return output
def split_sentences(text, sentence_splitter_cmd, lang):
    proc = ExternalTextProcessor([sentence_splitter_cmd, "-l", lang])
    output = proc.process(text.replace("\n\n", "\n"))
    return [n for n in output.split("\n") if filter_digits_and_punctuation(n)]
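A sketch of how normalize, split_sentences and tokenize above might be chained into a normalise-split-tokenise pipeline; every script path below is a placeholder assumption, not taken from the original code.

# Hypothetical pipeline; all command paths are assumptions.
lang = "en"
text = "Some raw text. Another sentence."

clean = normalize(text, "/opt/mosesdecoder/scripts/tokenizer/normalize-punctuation.perl", lang)
sents = split_sentences(clean, "/opt/mosesdecoder/scripts/ems/support/split-sentences.perl", lang)
tokenized = [tokenize(s, "/opt/mosesdecoder/scripts/tokenizer/tokenizer.perl", lang) for s in sents]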
                    help='url to English text', type=argparse.FileType('r'))
parser.add_argument('-url2fr',
                    help='url to French text', type=argparse.FileType('r'))
parser.add_argument('-write',
                    help='filename for pickle file', type=argparse.FileType('wb'))

args = parser.parse_args()

source_tokenizer = None
if args.source_tokenizer:
    if args.source_tokenizer == 'WordPunctTokenizer':
        source_tokenizer = WordPunctTokenizer()
    else:
        source_tokenizer = ExternalTextProcessor(args.source_tokenizer)

target_tokenizer = None
if args.target_tokenizer:
    if args.target_tokenizer == 'WordPunctTokenizer':
        target_tokenizer = WordPunctTokenizer()
    else:
        target_tokenizer = ExternalTextProcessor(args.target_tokenizer)

# read source and target corpus
s, t = read_lett(args.lettfile, args.slang, args.tlang,
                 source_tokenizer, target_tokenizer, False,
                 args.url2fr, args.url2en, True)

sys.stderr.write("Read %d %s docs and %d %s docs from %s\n" %
                 (len(s), args.slang, len(t), args.tlang, args.lettfile.name))
sys.stderr.write("Source stats: ")
) != 0 and options.morphanal2 is not None and lang == options.lang2:
    morphanalyser = ["/bin/bash", options.morphanal2]
    tpmorph = subprocess.Popen(morphanalyser,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               stdin=subprocess.PIPE)
    morph_stdout, error = tpmorph.communicate(input=text)
    if len(error.strip()) == 0:
        text = re.sub(r"\^\*?", r"",
                      re.sub(r"[/<][^$]*\$", r"", morph_stdout.decode("utf-8")))

# Getting the bag of words in the document
if lang == options.lang1:
    proc = ExternalTextProcessor(options.wordtokeniser1.split(' '))
elif lang == options.lang2:
    proc = ExternalTextProcessor(options.wordtokeniser2.split(' '))

sorted_uniq_wordlist = set(proc.process(text).lower().split())

# Trimming non-alphanumerics:
clean_sorted_uniq_wordlist = [
    _f for _f in [w.strip(punctuation) for w in sorted_uniq_wordlist] if _f
]
sorted_uniq_wordlist = clean_sorted_uniq_wordlist

for word in sorted_uniq_wordlist:
    if lang in word_map:
        if word in word_map[lang]: