Example #1
    parser.add_argument('--tokenized1', required=True,
                        help='Path to the file with the tokenized text of the documents crawled in LANG1')
    parser.add_argument('--text2', help='Path to the file with the plain text of the documents in LANG2', required=True)
    parser.add_argument('--tokenized2', required=True,
                        help='Path to the file with the tokenized text of the documents crawled in LANG2')
    parser.add_argument('--column1', help='Column that contains the first document of the document pairs',
                        default=0, type=int)
    parser.add_argument('--column2', help='Column that contains the second document of the document pairs',
                        default=1, type=int)

    args = parser.parse_args()

    lang2_docs = set()
    lang2_read_docs = {}

    with open_xz_or_gzip_or_plain(args.indices, 'rt') as reader, \
            open_xz_or_gzip_or_plain(args.tokenized1) as tok_reader1, \
            open_xz_or_gzip_or_plain(args.tokenized2) as tok_reader2, \
            open_xz_or_gzip_or_plain(args.text1) as text_reader1, \
            open_xz_or_gzip_or_plain(args.text2) as text_reader2:

        for line in reader:
            fields = line.split('\t')
            lang2_docs.add(int(fields[args.column2]))

        reader.seek(0)

        doc1_current_line = 1
        doc2_current_line = 1
        doc2_last_written = 0
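
Every example on this page goes through open_xz_or_gzip_or_plain, whose implementation is not shown here. A minimal sketch of what such a helper can look like, assuming it simply dispatches on the file extension and defaults to text mode (this is an assumption, not the project's actual code):

import gzip
import lzma
from contextlib import contextmanager


@contextmanager
def open_xz_or_gzip_or_plain(file_path, mode='rt'):
    # Pick an opener based on the extension and fall back to plain open();
    # the default 'rt' mode matches how the examples iterate over text lines.
    if file_path.endswith('.xz'):
        opener = lzma.open
    elif file_path.endswith('.gz'):
        opener = gzip.open
    else:
        opener = open
    f = opener(file_path, mode)
    try:
        yield f
    finally:
        f.close()

With a helper like this, the nested with blocks above read transparently from .xz, .gz or uncompressed files.
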
Example #2
    lang = os.fsdecode(langfolder)
    if not os.path.isdir(options.folder + "/" + lang) or len(lang) > 2:
        continue
    fullname = os.path.join(options.folder, lang + "/plain_text.xz")
    if os.path.isfile(fullname) and (not langs or lang in langs):
        if lang not in lang_files:
            lang_files[lang] = lzma.open(
                os.path.join(options.folder, lang + "/plain_tokenized.xz"),
                "wb")
        senttok_command = get_lang_or_default(options.splitters, lang)
        senttok = None
        if senttok_command:
            senttok = ToolWrapper(senttok_command.split())
        wordtok_command = get_lang_or_default(options.tokenizers, lang)
        wordtok = None
        if wordtok_command:
            wordtok = ToolWrapper(wordtok_command.split())
        morphtok_command = get_lang_or_default(options.lemmatizers, lang)
        morphtok = None
        if morphtok_command:
            morphtok = ToolWrapper(morphtok_command.split())
        with open_xz_or_gzip_or_plain(fullname) as text_reader:
            for line in text_reader:
                encodedtext = line.strip()
                tokenized = extract_encoded_text(encodedtext, senttok, wordtok,
                                                 morphtok)
                lang_files[lang].write(
                    "{}\n".format(tokenized).encode("utf-8"))
for lang in lang_files:
    lang_files[lang].close()
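
get_lang_or_default is also only referenced in this excerpt. Judging from the calls above, it looks up a per-language external command (splitter, tokenizer or lemmatizer) and may return nothing. A rough sketch, assuming the options hold a dict keyed by language code with an optional "default" entry:

def get_lang_or_default(commands, lang):
    # Sketch only: return the command configured for this language, or the
    # "default" entry if present, or None so the caller skips that step.
    if not commands:
        return None
    return commands.get(lang, commands.get("default"))
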
Example #3
oparser.add_argument('--morph-analyser',
                     dest='lemmatizer',
                     help="Morphological analyser command")
oparser.add_argument(
    '--sentences-output',
    default="plain_sentences.xz",
    dest='sent_output',
    help="Path of the output file that will contain sentence splitted text")
oparser.add_argument(
    '--tokenized-output',
    default="plain_tokenized.xz",
    dest='tok_output',
    help=
    "Path of the output file that will contain sentence splitted and tokenized text"
)

options = oparser.parse_args()

with open_xz_or_gzip_or_plain(options.text) as reader, lzma.open(
        options.sent_output,
        "w") as sent_writer, lzma.open(options.tok_output, "w") as tok_writer:
    for line in reader:
        encoded_text = line.strip()
        sentences, tokenized = extract_encoded_text(encoded_text,
                                                    options.splitter,
                                                    options.tokenizer,
                                                    options.lemmatizer)
        if sentences and tokenized:
            sent_writer.write(sentences + b"\n")
            tok_writer.write(tokenized + b"\n")
    dest="no_delete_seg",
    action='store_true')
oparser.add_argument("-f",
                     "--text-file-deduped",
                     help="Filename to write the deduped input file",
                     dest="text_file_deduped")
oparser.add_argument(
    "--dedup",
    dest="dedup",
    help=
    "Dedup entries and group urls using given columns. Like 'bifixerhash', 'seg1,seg2' , 'checksum1,checksum2'"
)

options = oparser.parse_args()

with open_xz_or_gzip_or_plain(options.clean_alignments, 'rt') if options.clean_alignments else sys.stdin as reader,\
        open_xz_or_gzip_or_plain(options.text_file_deduped, 'wt') if options.text_file_deduped and options.dedup else dummy_open() as text_writer:

    print("<?xml version=\"1.0\"?>")
    print("<tmx version=\"1.4\">")
    print(" <header")
    print("   adminlang=\"" +
          locale.setlocale(locale.LC_ALL, '').split(".")[0].split("_")[0] +
          "\"")
    print("   srclang=\"" + options.lang1 + "\"")
    print("   o-tmf=\"PlainText\"")
    print("   creationtool=\"bitextor\"")
    print("   creationtoolversion=\"4.0\"")
    print("   datatype=\"PlainText\"")
    print("   segtype=\"sentence\"")
    print("   creationdate=\"" + time.strftime("%Y%m%dT%H%M%S") + "\"")
Example #5
oparser.add_argument(
    "--lang2",
    help="Two-characters-code for language 2 in the pair of languages",
    dest="lang2",
    required=True)

options = oparser.parse_args()

docnumber = 0
word_map = {}

punctuation = get_unicode_punct()

for file_path, lang in [(options.text1, options.lang1),
                        (options.text2, options.lang2)]:
    with open_xz_or_gzip_or_plain(file_path) as text_reader:

        for line in text_reader:
            ##################
            # Parsing the text:
            ##################
            tokenized_text = base64.b64decode(line.strip()).decode("utf-8")

            sorted_uniq_wordlist = set(tokenized_text.split())

            # Trimming non-alphanumerics:
            clean_sorted_uniq_wordlist = [
                _f
                for _f in [w.strip(punctuation) for w in sorted_uniq_wordlist]
                if _f
            ]
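
get_unicode_punct() is assumed here to return a single string of punctuation characters, since its result is passed straight to str.strip(). One way to build such a string, as a sketch:

import sys
import unicodedata


def get_unicode_punct():
    # Collect every code point whose Unicode category starts with 'P'
    # (all punctuation classes) into one string usable with str.strip().
    return ''.join(chr(cp) for cp in range(sys.maxunicode + 1)
                   if unicodedata.category(chr(cp)).startswith('P'))
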
Example #6
                        required=True)
    parser.add_argument('--columns2',
                        dest='lang2_column_filename',
                        nargs='+',
                        required=True)

    args = parser.parse_args()

    lang2_docs = set()
    lang2_read_docs = {}
    indices = list()

    if not args.indices:
        args.indices = '-'

    with open_xz_or_gzip_or_plain(
            args.indices) if args.indices != '-' else sys.stdin as reader:
        for line in reader:
            fields = line.strip().split('\t')
            lang2_docs.add(int(fields[1]))
            indices.append((int(fields[0]), int(fields[1])))

    readers1 = [
        open_xz_or_gzip(filename, 'rt')
        for filename in args.lang1_column_filename
    ]
    readers2 = [
        open_xz_or_gzip(filename, 'rt')
        for filename in args.lang2_column_filename
    ]

    doc1_current_line = 1
                     help="Word tokeniser script for language 1",
                     dest="wordtokeniser1",
                     required=True)
oparser.add_argument("--wordtokeniser2",
                     help="Word tokeniser script for language 2",
                     dest="wordtokeniser2",
                     required=True)

options = oparser.parse_args()

docnumber = 0
word_map = {}

punctuation = get_unicode_punct()

with open_xz_or_gzip_or_plain(options.text) as text_reader:
    with open_xz_or_gzip_or_plain(options.lang) as lang_reader:

        for line in text_reader:
            ##################
            # Parsing the text:
            ##################
            text = base64.b64decode(line.strip()).decode("utf-8")
            lang = next(lang_reader, None).strip()
            proc = None

            if len(text.strip()) != 0 and options.morphanal1 is not None \
                    and lang == options.lang1:
                morphanalyser = ["/bin/bash", options.morphanal1]
                spmorph = subprocess.Popen(morphanalyser,
Example #8
    proc_word = ExternalTextProcessor(word_tokeniser.split())
    tokenized_text = proc_word.process(tokenized_filtered)

    if morph_analyser:
        proc_morph = ExternalTextProcessor(morph_analyser.split())
        tokenized_text = proc_morph.process(tokenized_text)

    b64sentences = base64.b64encode(tokenized_filtered.encode("utf-8"))
    b64tokenized = base64.b64encode(tokenized_text.lower().encode("utf-8"))
    return b64sentences, b64tokenized


oparser = argparse.ArgumentParser(
    description="Tool that tokenizes (sentences, tokens and morphemes) plain text")
oparser.add_argument('--text', dest='text', help='Plain text file', required=True)
oparser.add_argument('--sentence-splitter', dest='splitter', required=True, help="Sentence splitter commands")
oparser.add_argument('--word-tokenizer', dest='tokenizer', required=True, help="Word tokenisation command")
oparser.add_argument('--morph-analyser', dest='lemmatizer', help="Morphological analyser command")
oparser.add_argument('--sentences-output', default="plain_sentences.xz", dest='sent_output', help="Path of the output file that will contain sentence splitted text")
oparser.add_argument('--tokenized-output', default="plain_tokenized.xz", dest='tok_output', help="Path of the output file that will contain sentence splitted and tokenized text")

options = oparser.parse_args()

with open_xz_or_gzip_or_plain(options.text) as reader, open_xz_or_gzip_or_plain(options.sent_output, "w") as sent_writer, open_xz_or_gzip_or_plain(options.tok_output, "w") as tok_writer:
    for line in reader:
        encoded_text = line.strip()
        sentences, tokenized = extract_encoded_text(encoded_text, options.splitter, options.tokenizer, options.lemmatizer)
        if sentences and tokenized:
            sent_writer.write(sentences + b"\n")
            tok_writer.write(tokenized + b"\n")
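
ExternalTextProcessor (and the similar ToolWrapper in Example #2) wraps an external command given as a token list. Its real implementation is not part of these excerpts; a minimal stand-in that pipes the whole text through the command once and returns its stdout could look like this:

import subprocess


class ExternalTextProcessor:
    # Stand-in sketch: run the wrapped command once per call, feed the text
    # on stdin and return whatever the tool prints on stdout.
    def __init__(self, cmd):
        self.cmd = cmd

    def process(self, text):
        result = subprocess.run(self.cmd, input=text, capture_output=True,
                                text=True, check=True)
        return result.stdout

Note that this sketch spawns one process per call, whereas a ToolWrapper-style class would more likely keep a long-running process and stream lines to it.
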
Example #9
    help="Write document alignments even if they are not backwards aligned ("
    "option incompatible with 'iterations' option)",
    dest="nonsymmetric",
    action="store_true")

options = oparser.parse_args()

indices = {}
indicesProb = {}
documents = set(range(1, options.ndoc1 + options.ndoc2 + 1))
documentsFile2 = set()
file2_start_counter = options.ndoc1

if options.ridx2 is None:
    # Reading the .ridx file with the preliminary alignment
    with open_xz_or_gzip_or_plain(options.ridx1) if options.ridx1 else sys.stdin as reader, \
            open_xz_or_gzip_or_plain(options.oridx, 'wt') if options.oridx else dummy_open() as oridx_writer:
        for i in reader:
            fields = i.split("\t")
            if len(fields) >= 2:
                if oridx_writer:
                    oridx_writer.write(i)
                try:
                    indices[int(fields[0])] = int(
                        fields[1].strip().split(":")[0])
                except:
                    pass
else:
    with open_xz_or_gzip_or_plain(options.ridx1, 'rt') as reader1, \
            open_xz_or_gzip_or_plain(options.ridx2, 'rt') as reader2, \
            open_xz_or_gzip_or_plain(options.oridx, 'wt') if options.oridx else dummy_open() as oridx_writer:
Example #10
splitter = options.splitter
splitter_func = None
# no sentence splitter command provided, use moses:
if not splitter:
    splitter_func = split_moses
    try:
        if options.customnbp:
            splitter = SentenceSplitter(
                language=options.langcode,
                non_breaking_prefix_file=options.customnbp)
        else:
            splitter = SentenceSplitter(language=options.langcode)
    except SentenceSplitterException as e:
        sys.stderr.write(str(e) + "\n")
        splitter = SentenceSplitter(language='en')

# use custom sentence splitter via ExternalTextProcessor (inefficient):
else:
    splitter_func = split_external
    splitter = ExternalTextProcessor(os.path.expanduser(splitter).split())

with open_xz_or_gzip_or_plain(
        options.text) if options.text != "-" else sys.stdin as reader:
    for doc in reader:
        content = base64.b64decode(doc.strip()).decode("utf-8").replace(
            "\t", " ")
        sentences = splitter_func(content, splitter, options.prune_type,
                                  options.prune_threshold)
        print(base64.b64encode(sentences.encode("utf-8")).decode("utf-8"))
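
split_moses and split_external are defined elsewhere in this script. For the Moses branch, a sketch of what the splitter function might do, assuming the sentence_splitter package's SentenceSplitter.split() API and treating the pruning options as assumptions (threshold by words or by characters):

def split_moses(content, splitter, prune_type="words", prune_threshold=0):
    # Sketch: split the document into sentences, optionally drop sentences
    # over the threshold, and return them one per line; the caller
    # base64-encodes the joined result.
    sentences = splitter.split(content)
    if prune_threshold:
        if prune_type == "words":
            sentences = [s for s in sentences
                         if len(s.split()) <= prune_threshold]
        else:  # assumed: prune by character length
            sentences = [s for s in sentences if len(s) <= prune_threshold]
    return "\n".join(sentences)
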
Example #11
        reader1 = lzma.open(options.ridx1, "rt")
    else:
        reader1 = open(options.ridx1, "r")
    if options.ridx2[-3:] == ".xz":
        reader2 = lzma.open(options.ridx2, "rt")
    else:
        reader2 = open(options.ridx2, "r")

indices = {}
indicesProb = {}
documents = {}
documentsFile2 = set()

# Files containing base64 encoded text and url are read and stored in a document map
counter = 1
with open_xz_or_gzip_or_plain(options.text) as text_reader:
    with open_xz_or_gzip_or_plain(options.url) as url_reader:
        for text in text_reader:
            url = next(url_reader, None).strip()
            documents[counter] = (url, text.strip())  # URL parsed_text_base64
            counter += 1

if not combine:
    # Reading the .ridx file with the preliminary alignment
    for i in reader:
        fields = i.split("\t")
        if len(fields) >= 2:
            if options.oridx is not None:
                options.oridx.write(i)
            try:
                indices[int(fields[0])] = int(fields[1].strip().split(":")[0])