Example 1
    def __init__(self, training="aspect"):
        # Download the UITABSARestaurant dataset (no explicit URL is passed)
        DataFetcher.download_data(UITABSARestaurant.NAME, None)
        train_file = join(DATASETS_FOLDER, UITABSARestaurant.NAME, "Train.txt")
        dev_file = join(DATASETS_FOLDER, UITABSARestaurant.NAME, "Dev.txt")
        test_file = join(DATASETS_FOLDER, UITABSARestaurant.NAME, "Test.txt")
        print("Currently training: %s (aspect or polarity)" % training)

        self.training = training  # aspect or polarity
        self.label_encoder = LabelEncoder()
        # Load the three splits; vocab_size is read afterwards, so _extract_sentences
        # presumably populates the label encoder along the way
        self.train = self._extract_sentences(train_file)
        self.dev = self._extract_sentences(dev_file)
        self.test = self._extract_sentences(test_file)
        self.num_labels = self.label_encoder.vocab_size
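The excerpt above omits the enclosing class and its imports; based on Example 2, the names it uses would presumably be imported along these lines (UITABSARestaurant and LabelEncoder would come from the surrounding module, which is not shown):

from os.path import join

from underthesea.data_fetcher import DataFetcher
from underthesea.file_utils import DATASETS_FOLDER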
Example 2
import os
import shutil
import time
from os.path import join
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from underthesea.corpus.categorized_corpus import CategorizedCorpus
from underthesea.data_fetcher import DataFetcher
from underthesea.file_utils import DATASETS_FOLDER
from underthesea.models.text_classifier import TextClassifier, TEXT_CLASSIFIER_ESTIMATOR
from underthesea.trainers.classifier_trainer import ClassifierTrainer

model_folder = "tmp/sentiment_svm_ubs"
shutil.rmtree(model_folder, ignore_errors=True)
os.makedirs(model_folder)

start = time.time()
print(">>> Train UBS model")
data_folder = Path(join(DATASETS_FOLDER, "SE_Vietnamese-UBS-1"))
corpus: CategorizedCorpus = DataFetcher.load_classification_corpus(data_folder)
print("\n\n>>> Sample sentences")
for s in corpus.train[:10]:
    print(s)

pipeline = Pipeline(steps=[
    ('features', TfidfVectorizer(ngram_range=(1, 2), max_df=0.5)),
    ('estimator', OneVsRestClassifier(LinearSVC())),
])
print("\n\n>>> Start training")
classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                            pipeline=pipeline,
                            multilabel=True)
model_trainer = ClassifierTrainer(classifier, corpus)


def micro_f1_score(y_true, y_pred):
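The example is truncated at this point; the body of micro_f1_score is not shown. A minimal, self-contained sketch of what such a helper could look like, assuming it wraps scikit-learn's f1_score (a guess, not the original implementation):

from sklearn.metrics import f1_score

def micro_f1_score(y_true, y_pred):
    # Micro-averaged F1 aggregates true/false positives over all labels,
    # which matches the multilabel setup used above
    return f1_score(y_true, y_pred, average="micro")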
Example 3
def download_file():
    # Nothing to do if the sample data is already on disk
    if isdir(VLSP2020_DP_SAMPLE.folder):
        return
    shutil.rmtree(VLSP2020_DP_SAMPLE.folder, ignore_errors=True)
    DataFetcher.download_zip_file_to_cache(VLSP2020_DP_SAMPLE.REPO_DATA)
Example 4
def remove_data(data):
    # Delete a previously downloaded dataset by name
    DataFetcher.remove(data)
Example 5
def download_data(dataset, url):
    # Fetch a dataset by name; url may be None, as in Example 1
    DataFetcher.download_data(dataset, url)
Example 6
def list_data(all):
    # List datasets; the `all` flag is forwarded to DataFetcher.list
    DataFetcher.list(all)
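The last three wrappers simply forward to DataFetcher. A hypothetical usage sketch, reusing the dataset name from Example 2 (whether remove and list accept exactly these arguments is an assumption):

list_data(all=False)                        # show available datasets
download_data("SE_Vietnamese-UBS-1", None)  # no explicit URL, as in Example 1
remove_data("SE_Vietnamese-UBS-1")          # drop the local copy again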