import numpy as np


def __init__(self, name, document, vocab, index):
    self.index = index
    self.name = name
    # Tokenize the document into sentences of word tokens, keep only
    # in-vocabulary words, and store their integer ids. A membership
    # test is used so that a word mapped to id 0 is not dropped
    # (0 is falsy, so a bare truthiness check would discard it).
    self.words = np.array([
        vocab.get(word)
        for sentence in to_raw_text_markupless(document)
        for word in sentence
        if word in vocab
    ], dtype='int32')
    self.size = len(self.words)
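# Usage sketch (an assumption, not from the original source): the
# constructor above presumably belongs to a document-wrapper class;
# `Document` is a hypothetical name for it, and `to_raw_text_markupless`
# is assumed to yield a list of sentences, each a list of token strings.
vocab = {"the": 1, "cat": 2, "sat": 3}
doc = Document(name="example",
               document="the cat sat on the mat",
               vocab=vocab,
               index=0)
print(doc.size)   # number of in-vocabulary tokens kept
print(doc.words)  # e.g. array([1, 2, 3, 1], dtype=int32); exact splits depend on the tokenizer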
def to_token_string(text):
    tokens = to_raw_text_markupless(text)
    tokens = [' '.join(sentence_tokens) for sentence_tokens in tokens]
    tokens = ' '.join(tokens)
    return tokens
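# Quick illustration (hedged: exact token boundaries depend on the
# tokenizer behind to_raw_text_markupless):
flat = to_token_string("Hello there. How are you?")
print(flat)  # e.g. "Hello there . How are you ?"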
def tokenize_and_write(file, text, token):
    for sentence in to_raw_text_markupless(text):
        file.write(" ".join(sentence))
        file.write(token)
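# Because the function only needs a file-like object, an in-memory
# buffer works for a quick test (a sketch, not from the original source):
import io

buffer = io.StringIO()
tokenize_and_write(buffer, "First sentence. Second sentence.", "\n")
print(buffer.getvalue())  # each tokenized sentence followed by the "\n" separator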
from collections import Counter


def collect_counts(documents):
    # Count word occurrences across the tokenized sentences of every document.
    vocab = Counter()
    for value in documents.values():
        vocab.update(
            word
            for sentence in to_raw_text_markupless(value)
            for word in sentence
        )
    return vocab
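# Example (a sketch; counts depend on the tokenizer's splitting):
documents = {
    "doc1": "the cat sat",
    "doc2": "the cat ran",
}
counts = collect_counts(documents)
print(counts.most_common(2))  # e.g. [('the', 2), ('cat', 2)]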
def tokenize_sentences(text):
    # Tokenize each tab-separated field, then re-join: the first two
    # fields stay tab-separated, and any further sentences are appended
    # space-separated (with a leading space so tokens do not run together).
    sentences = text.strip().split("\t")
    gen_sentences = [
        " ".join(tsentence)
        for sentence in sentences
        for tsentence in to_raw_text_markupless(sentence)
    ]
    head = "\t".join(gen_sentences[0:2])
    tail = gen_sentences[2:]
    return (head + " " + " ".join(tail)) if tail else head
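# Example on a question-answer pair (hypothetical input; the exact
# tokenization depends on to_raw_text_markupless): the question and the
# first answer sentence stay tab-separated, later answer sentences are
# appended with spaces.
line = "What is a cat?\tA cat is a feline. It purrs."
print(tokenize_sentences(line))
# e.g. "What is a cat ?\tA cat is a feline . It purrs ."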
print("Generated %d question answer pairs" % (len(output_content) )) print("Skipped %d pairs because of answer shorter than %d words" % (num_too_short, MIN_ANSWER_LENGTH)) print("Skipped %d because of encoding issues." % (num_nonascii,)) num_valid = 0 num_train = 0 with open(VALIDATE_FILE, 'wt') as fvalid: with open(TRAIN_FILE, 'wt') as ftrain: for i, qa in enumerate(output_content): question, answer = qa print_progress(i, len(output_content)) question_tokens = [] answer_tokens = [] for line in to_raw_text_markupless(question): question_tokens.extend(line) for line in to_raw_text_markupless(answer): answer_tokens.extend(line) output_line = '%s\t%s\n' % (' '.join(question_tokens), ' '.join(answer_tokens)) if random.random() < VALIDATION_SIZE: fvalid.write(output_line) num_valid += 1 else: ftrain.write(output_line) num_train += 1 print("Saved %d pairs in %s" % (num_train, TRAIN_FILE))