def raw_to_corpus(sample, output):
    """Convert the raw VLSP 2013 POS folders into train/test corpus files.

    Every file found in ``Trainset-POS-full`` and ``Testset-POS`` (under
    ``data/vlsp2013/raw`` in this file's parent directory) is passed to
    ``preprocess``; the collected sentences are saved as ``train.txt`` and
    ``test.txt`` respectively.

    Args:
        sample: Maximum number of sentences to keep per output file, or
            ``None`` to keep all of them.
        output: Folder to write into. When falsy, ``tmp/vlsp2013`` in this
            file's parent directory is used.

    Raises:
        OSError: If the output folder does not exist and cannot be created.
    """
    from os.path import isdir

    root = dirname(dirname(__file__))
    output_folder = output if output else join(root, "tmp", "vlsp2013")
    try:
        makedirs(output_folder)
    except OSError:
        # An already-existing folder is expected; any other failure is
        # reported here instead of being silently ignored.
        if not isdir(output_folder):
            raise

    data_folder = join(root, "data", "vlsp2013", "raw")
    splits = [("Trainset-POS-full", "train.txt"), ("Testset-POS", "test.txt")]
    for raw_folder, output_name in splits:
        sentences = []
        # NOTE: listdir() returns names in arbitrary order, so the sentences
        # that end up in a sample depend on the platform's listing order.
        for name in listdir(join(data_folder, raw_folder)):
            sentences += preprocess(join(data_folder, raw_folder, name))
            # Stop as soon as the cap is reached; reading further files
            # could not change the truncated result.
            if sample is not None and len(sentences) >= sample:
                sentences = sentences[:sample]
                break
        tagged_corpus = TaggedCorpus()
        tagged_corpus.sentences = sentences
        output_file = join(output_folder, output_name)
        tagged_corpus.save(output_file)
        print("{} sentences is saved to file {}".format(len(sentences), output_file))
def raw_to_corpus(sample, output):
    """Build ``train.txt`` and ``test.txt`` from the raw VLSP 2013 POS data.

    The files of ``Trainset-POS-full`` and ``Testset-POS`` (located in
    ``data/vlsp2013/raw`` under this file's parent directory) are each run
    through ``preprocess`` and their sentences concatenated into one corpus
    per folder.

    Args:
        sample: Upper bound on the number of sentences written per output
            file; ``None`` disables the bound.
        output: Destination folder. A falsy value selects ``tmp/vlsp2013``
            under this file's parent directory.

    Raises:
        OSError: If the destination folder is missing and cannot be created.
    """
    from os.path import isdir

    base_dir = dirname(dirname(__file__))
    if output:
        output_folder = output
    else:
        output_folder = join(base_dir, "tmp", "vlsp2013")
    try:
        makedirs(output_folder)
    except OSError:
        # Only "folder already there" is tolerated; real failures propagate.
        if not isdir(output_folder):
            raise

    data_folder = join(base_dir, "data", "vlsp2013", "raw")
    folder_to_output = (
        ("Trainset-POS-full", "train.txt"),
        ("Testset-POS", "test.txt"),
    )
    for raw_folder, output_name in folder_to_output:
        source_dir = join(data_folder, raw_folder)
        sentences = []
        # NOTE: listdir() order is arbitrary, so a sampled corpus may contain
        # different sentences on different platforms.
        for file_name in listdir(source_dir):
            sentences += preprocess(join(source_dir, file_name))
            # Once enough sentences are collected the remaining files cannot
            # affect the truncated result, so skip them.
            if sample is not None and len(sentences) >= sample:
                sentences = sentences[:sample]
                break
        tagged_corpus = TaggedCorpus()
        tagged_corpus.sentences = sentences
        output_file = join(output_folder, output_name)
        tagged_corpus.save(output_file)
        print("{} sentences is saved to file {}".format(
            len(sentences), output_file))
def load_data(file):
    """Load a tagged corpus and reduce every row to a two-field tuple.

    For each row of each sentence, the row's first element is split on
    whitespace and the 1st and 4th fields are kept.
    (Presumably the token and its label column -- confirm against the
    corpus format.)

    Args:
        file: Path handed to ``TaggedCorpus.load``.

    Returns:
        A list of sentences, each a list of ``(field_0, field_3)`` tuples.
    """
    corpus = TaggedCorpus()
    corpus.load(file)
    result = []
    for raw_sentence in corpus.sentences:
        # Split every row first, then project, mirroring a two-stage pipeline.
        split_rows = []
        for row in raw_sentence:
            split_rows.append(row[0].split())
        pairs = []
        for fields in split_rows:
            pairs.append((fields[0], fields[3]))
        result.append(pairs)
    return result
def sample_data(n=200):
    """Save the first ``n`` sentences of the vlsp_chunk training corpus.

    Reads ``corpus/vlsp_chunk/train.txt`` next to this file and writes the
    truncated corpus to ``corpus/vlsp_chunk_sample/train.txt``.

    Args:
        n: Number of leading sentences to keep.
    """
    corpus_root = join(dirname(__file__), "corpus")

    full_corpus = TaggedCorpus()
    full_corpus.load(join(corpus_root, "vlsp_chunk", "train.txt"))

    subset = TaggedCorpus(full_corpus.sentences[:n])
    subset.save(join(corpus_root, "vlsp_chunk_sample", "train.txt"))
def raw_to_corpus():
    """Preprocess each raw split and save it under ``corpus``.

    For ``train.txt``, ``dev.txt`` and ``test.txt`` the file is loaded from
    the ``raw`` folder beside this module, its sentences are passed through
    ``preprocess``, and the result is saved under the same name in the
    sibling ``corpus`` folder.
    """
    here = dirname(__file__)
    for split_name in ("train.txt", "dev.txt", "test.txt"):
        corpus = TaggedCorpus()
        corpus.load(join(here, "raw", split_name))
        corpus.sentences = preprocess(corpus.sentences)
        corpus.save(join(here, "corpus", split_name))
def raw_to_sample_corpus():
    """Write a 100-sentence sample of every VLSP 2016 split.

    Each of ``train.txt``, ``dev.txt`` and ``test.txt`` is loaded from
    ``raw/vlsp2016`` under this file's parent directory, preprocessed, cut to
    the first 100 preprocessed sentences and saved to
    ``corpus/sample_vlsp_2016`` under the same parent directory.
    """
    parent_dir = dirname(dirname(__file__))
    for split_name in ("train.txt", "dev.txt", "test.txt"):
        corpus = TaggedCorpus()
        corpus.load(join(parent_dir, "raw", "vlsp2016", split_name))
        processed = preprocess(corpus.sentences)
        # Truncation happens after preprocessing, not before.
        corpus.sentences = processed[:100]
        corpus.save(join(parent_dir, "corpus", "sample_vlsp_2016", split_name))
def raw_to_corpus():
    """Preprocess the raw splits and save them into the ``corpus`` folder.

    ``train.txt``, ``dev.txt`` and ``test.txt`` are loaded from the ``raw``
    folder beside this module, run through ``preprocess`` and saved under the
    same names in the sibling ``corpus`` folder, which is created if missing.

    Raises:
        OSError: If the ``corpus`` folder does not exist and cannot be
            created.
    """
    from os.path import isdir

    here = dirname(__file__)
    corpus_folder = join(here, "corpus")
    # Create the destination once up front instead of on every iteration.
    try:
        mkdir(corpus_folder)
    except OSError:
        # An existing folder is fine; anything else (permissions, missing
        # parent, ...) is surfaced rather than silently swallowed.
        if not isdir(corpus_folder):
            raise

    for f in ["train.txt", "dev.txt", "test.txt"]:
        tagged_corpus = TaggedCorpus()
        tagged_corpus.load(join(here, "raw", f))
        tagged_corpus.sentences = preprocess(tagged_corpus.sentences)
        tagged_corpus.save(join(corpus_folder, f))
def raw_to_corpus(sample, output):
    """Convert the raw VLSP 2016 splits into preprocessed corpus files.

    ``train.txt``, ``dev.txt`` and ``test.txt`` are loaded from
    ``data/vlsp2016/raw`` under this file's parent directory, optionally
    truncated, passed through ``preprocess`` and saved under the same names
    in the output folder.

    Args:
        sample: Number of leading raw sentences to keep before
            preprocessing. A falsy value (``None`` or ``0``) keeps all of
            them.
        output: Folder to write into. When falsy, ``tmp/vlsp2016`` under
            this file's parent directory is used.

    Raises:
        OSError: If the output folder does not exist and cannot be created.
    """
    from os.path import isdir

    root = dirname(dirname(__file__))
    output_folder = output if output else join(root, "tmp", "vlsp2016")
    # The destination does not depend on the split, so create it once.
    try:
        makedirs(output_folder)
    except OSError:
        # Tolerate only "already exists"; report every other failure.
        if not isdir(output_folder):
            raise

    for f in ["train.txt", "dev.txt", "test.txt"]:
        tagged_corpus = TaggedCorpus()
        tagged_corpus.load(join(root, "data", "vlsp2016", "raw", f))
        sentences = tagged_corpus.sentences
        if sample:
            sentences = sentences[:sample]
        tagged_corpus.sentences = preprocess(sentences)
        tagged_corpus.save(join(output_folder, f))
from os.path import join

from languageflow.reader.tagged_corpus import TaggedCorpus

if __name__ == "__main__":
    # Analyze each split on its own, and collect all sentences so a combined
    # report can be written to eda/total afterwards.
    merged = TaggedCorpus()
    for split in ("train", "test", "dev"):
        split_corpus = TaggedCorpus()
        split_corpus.load(join("corpus", split + ".txt"))
        merged.sentences += split_corpus.sentences
        split_corpus.analyze(output_folder=join("eda", split), auto_remove=True)
    merged.analyze(output_folder=join("eda", "total"))
def load_data(file):
    """Return the sentences of the tagged corpus stored at ``file``.

    Args:
        file: Path handed to ``TaggedCorpus.load``.

    Returns:
        The ``sentences`` attribute of the loaded corpus.
    """
    corpus = TaggedCorpus()
    corpus.load(file)
    return corpus.sentences
def test_load(self):
    """Smoke test: loading ``data/vi-chunk.train`` must not raise."""
    parent_dir = dirname(dirname(__file__))
    corpus = TaggedCorpus()
    corpus.load(join(parent_dir, "data", "vi-chunk.train"))
def load_dataset(file):
    """Read a tagged corpus file and hand back its sentences.

    Args:
        file: Path handed to ``TaggedCorpus.load``.

    Returns:
        The ``sentences`` attribute of the loaded corpus.
    """
    loaded = TaggedCorpus()
    loaded.load(file)
    return loaded.sentences