Ejemplo n.º 1
0
def corpus_prep(corpus_path):
    """
    Build the set of part-of-speech grams found in a corpus file.

    The file is read one line at a time. For every line, each
    ``/<word chars>`` run (plus any whitespace right after it) is removed,
    then all ``[`` and ``]`` characters are dropped. The cleaned text is
    echoed to stdout and wrapped in a ``Sentence`` with ``language="ch"``;
    after ``get_part_of_speech()`` runs, the keys of its ``speeches_gram``
    mapping are merged into the result.

    Args:
        corpus_path: Path of the corpus text file to read.

    Return:
        A set of part-of-speech 5~grams of corpus (i.e. every key seen in
        ``speeches_gram`` across all lines).

    """
    # Presumably strips inline annotations such as "/n " that follow each
    # token in the corpus -- TODO confirm against the corpus format.
    tag_pattern = re.compile(r"/\w+\s*")
    # Translation table that deletes square brackets in a single pass.
    drop_brackets = str.maketrans("", "", "[]")

    grams = set()
    with open(corpus_path, "r") as corpus_file:
        for line in corpus_file:
            cleaned = tag_pattern.sub("", line).translate(drop_brackets)
            print(cleaned)
            sentence = Sentence(text=cleaned, language="ch")
            # Must run before speeches_gram is read; presumably it is what
            # populates that attribute -- verify in Sentence.
            sentence.get_part_of_speech()
            grams.update(sentence.speeches_gram.keys())
    return grams