def __init__(self, training="aspect"):
    # Fetch the corpus into DATASETS_FOLDER if it is not cached yet
    DataFetcher.download_data(UITABSARestaurant.NAME, None)
    train_file = join(DATASETS_FOLDER, UITABSARestaurant.NAME, "Train.txt")
    dev_file = join(DATASETS_FOLDER, UITABSARestaurant.NAME, "Dev.txt")
    test_file = join(DATASETS_FOLDER, UITABSARestaurant.NAME, "Test.txt")
    print("Currently training: %s (aspect or polarity)" % training)
    self.training = training  # "aspect" or "polarity"
    self.label_encoder = LabelEncoder()
    self.train = self._extract_sentences(train_file)
    self.dev = self._extract_sentences(dev_file)
    self.test = self._extract_sentences(test_file)
    self.num_labels = self.label_encoder.vocab_size
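A minimal usage sketch of this constructor. The class name `UITABSARestaurant`, its two `training` modes, and the attribute names are all taken from the code above; treat this as illustrative, not a documented API:

# Hypothetical usage of the loader defined above
aspect_data = UITABSARestaurant(training="aspect")      # aspect-category labels
polarity_data = UITABSARestaurant(training="polarity")  # sentiment-polarity labels

print(len(aspect_data.train), "train /", len(aspect_data.dev), "dev /",
      len(aspect_data.test), "test sentences")
print(aspect_data.num_labels, "distinct labels after encoding")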
import os
import shutil
import time
from os.path import join
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from underthesea.corpus.categorized_corpus import CategorizedCorpus
from underthesea.data_fetcher import DataFetcher
from underthesea.file_utils import DATASETS_FOLDER
from underthesea.models.text_classifier import TextClassifier, TEXT_CLASSIFIER_ESTIMATOR
from underthesea.trainers.classifier_trainer import ClassifierTrainer

# Start from a clean model folder
model_folder = "tmp/sentiment_svm_ubs"
shutil.rmtree(model_folder, ignore_errors=True)
os.makedirs(model_folder)

start = time.time()
print(">>> Train UBS model")
data_folder = Path(join(DATASETS_FOLDER, "SE_Vietnamese-UBS-1"))
corpus: CategorizedCorpus = DataFetcher.load_classification_corpus(data_folder)

print("\n\n>>> Sample sentences")
for s in corpus.train[:10]:
    print(s)

# TF-IDF unigrams and bigrams feeding a one-vs-rest linear SVM
pipeline = Pipeline(
    steps=[
        ('features', TfidfVectorizer(ngram_range=(1, 2), max_df=0.5)),
        ('estimator', OneVsRestClassifier(LinearSVC()))
    ]
)

print("\n\n>>> Start training")
classifier = TextClassifier(
    estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
    pipeline=pipeline,
    multilabel=True
)
model_trainer = ClassifierTrainer(classifier, corpus)


def micro_f1_score(y_true, y_pred):
    # Micro-averaged F1 pools decisions across all labels (multilabel-friendly)
    return f1_score(y_true, y_pred, average="micro")
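For intuition on the metric above, here is a self-contained sketch of micro-averaged F1 on multilabel data. Only the sklearn names are real; the label sets are made up for illustration:

from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

# Made-up gold and predicted label sets for three sentences
y_true = [{"FOOD", "PRICE"}, {"SERVICE"}, {"FOOD"}]
y_pred = [{"FOOD"}, {"SERVICE"}, {"FOOD", "PRICE"}]

mlb = MultiLabelBinarizer()
true_matrix = mlb.fit_transform(y_true)  # rows: sentences, columns: labels
pred_matrix = mlb.transform(y_pred)

# Micro averaging pools TP/FP/FN across all labels before computing F1:
# here TP=3, FP=1, FN=1, so precision = recall = 0.75 and F1 = 0.75
print(f1_score(true_matrix, pred_matrix, average="micro"))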
def download_file():
    # Nothing to do if the sample data is already present
    if isdir(VLSP2020_DP_SAMPLE.folder):
        return
    # Clear any partial leftovers, then fetch and unpack the archive
    shutil.rmtree(VLSP2020_DP_SAMPLE.folder, ignore_errors=True)
    DataFetcher.download_zip_file_to_cache(VLSP2020_DP_SAMPLE.REPO_DATA)
def remove_data(data):
    # Delete a downloaded dataset from the local cache
    DataFetcher.remove(data)


def download_data(dataset, url):
    # Fetch a dataset, optionally from an explicit URL
    DataFetcher.download_data(dataset, url)


def list_data(all):
    # Pass-through to DataFetcher.list; note `all` shadows the built-in
    DataFetcher.list(all)
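These three one-line helpers read like CLI command handlers. A hedged sketch of how they could be wired up with click (the group name, command names, and flags below are assumptions for illustration, not underthesea's documented CLI):

import click

from underthesea.data_fetcher import DataFetcher


@click.group()
def cli():
    """Dataset management commands."""


@cli.command("download-data")
@click.argument("dataset")
@click.option("--url", default=None, help="Optional source URL for the dataset")
def download_data(dataset, url):
    DataFetcher.download_data(dataset, url)


@cli.command("list-data")
@click.option("--all", "all_", is_flag=True, help="Also list datasets that are not downloaded yet")
def list_data(all_):
    DataFetcher.list(all_)


@cli.command("remove-data")
@click.argument("data")
def remove_data(data):
    DataFetcher.remove(data)


if __name__ == "__main__":
    cli()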