Example no. 1
    def score(self, X, y=None):
        # Evaluates the trained module on X and returns the accuracy
        eval_loss = 0
        criterion = nn.CrossEntropyLoss()
        iter_data = DataLoader(X, batch_size=self.module__batch_size, shuffle=True)
        log_exp_run = make_logger(name="experiment_" + self.mode)

        predictions = []
        labels = []
        self.module_.to(self.device)
        self.module_.eval()

        with torch.no_grad():
            for batch in iter_data:
                x_test = batch['features'].type(torch.LongTensor)
                y_test = batch['labels'].type(torch.LongTensor)
                x_test = x_test.to(self.device)
                y_test = y_test.to(self.device)
                prob = self.module_(x_test)
                loss = criterion(prob, y_test)
                eval_loss += loss.item()
                _, predicted = torch.max(prob.data, 1)
                predictions.extend(predicted.cpu().numpy())
                labels.extend(y_test.cpu().numpy())

        accuracy = accuracy_score(labels, predictions)

        log_exp_run.experiments("Cross-entropy loss for each fold: " + str(train_loss))
        log_exp_run.experiments("Accuracy for each fold: " + str(accuracy))
        log_exp_run.experiments("\n"+classification_report(labels, predictions))
        return accuracy
Example no. 2
 def __init__(self, *args, mode="Adam", **kwargs):
     super().__init__(*args, **kwargs)
     #self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
     self.mode=mode
     log_exp_run = make_logger(name="experiment_" + self.mode)
     log_exp_run.experiments("Running on device: "+str(self.device))
     log_exp_run.experiments("Training model by Back-propagation with optimizer: "+mode)
Example no. 3
def build_spanish_glove_from_pretrained(url_pretrained_model,url_dictionary):
    from gensim.models.keyedvectors import KeyedVectors
    wordvectors_file_vec = url_pretrained_model + '/glove-sbwc.i25.vec'
    cantidad = 100000  # load only the 100,000 most frequent vectors
    wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file_vec, limit=cantidad)
    embedding_dict = {}
    log_exp_run = make_logger()
    start_time = time.time()
    for word in wordvectors.vocab:  # gensim < 4.0 API; in gensim >= 4.0 use wordvectors.key_to_index
        embedding_dict[word] = np.asarray(wordvectors.get_vector(word), dtype='float32')

    log_exp_run.experiments("Loaded spanish word embedding model with GloVe:")
    log_exp_run.experiments("EMBEDDING_SIZE: " + str(len(embedding_dict["the"])))
    log_exp_run.experiments("DICTIONARY LENGTH: " + str(len(embedding_dict)))
    log_exp_run.experiments("Time elapsed for loading embedding vectors from file: " + str(time.time() - start_time))

    word_index = torch.load(url_dictionary)
    embedding_matrix = np.random.random((len(word_index) + 1, 300))

    log_exp_run.experiments("Length of dictionary of dataset: " + str(len(word_index)))

    for word, i in word_index.items():
        embedding_vector = embedding_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # print(wordvectors.similarity("celular","computadora"))
    # print(wordvectors.most_similar_cosmul(positive=['cantante','grabación'],negative=['concierto']))
    return torch.FloatTensor(embedding_matrix)
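
The tensor returned here (and by the English variant in the next example) is index-aligned with the saved dictionary, so it can seed an embedding layer directly. A minimal usage sketch; the paths are placeholders and freeze=False is just one possible choice:

import torch.nn as nn

# Hypothetical paths; any dictionary saved by the dataset builders would do.
weights = build_spanish_glove_from_pretrained("utils/pretrained_models",
                                              "datasets/dictionary_ecured_nosw")
embedding_layer = nn.Embedding.from_pretrained(weights, freeze=False)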
Example no. 4
def build_glove_from_pretrained(url_pretrained_model,url_dictionary):
    embedding_dict={}
    log_exp_run = make_logger()
    file_pretrained = open(url_pretrained_model + "/glove.6B.100d.txt", "r", encoding='utf-8')  # GloVe 6B files are UTF-8; 'ANSI' is not a valid Python codec
    start_time = time.time()
    lines = file_pretrained.readlines()
    for line in lines:
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:],dtype='float32')
        embedding_dict[word] = coefs

    file_pretrained.close()

    log_exp_run.experiments("Loaded word embedding model with GloVe:")
    log_exp_run.experiments("EMBEDDING_SIZE: " + str(len(embedding_dict["the"])))
    log_exp_run.experiments("DICTIONARY LENGTH: " + str(len(embedding_dict)))
    log_exp_run.experiments("Time elapsed for loading embedding vectors from file: " + str(time.time() - start_time))

    word_index = torch.load(url_dictionary)
    embedding_matrix = np.random.random((len(word_index) + 1, 100))  # words without a pre-trained vector keep a random row

    log_exp_run.experiments("Length of dictionary of dataset: "+str(len(word_index)))

    for word,i in word_index.items():
        embedding_vector=embedding_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return torch.FloatTensor(embedding_matrix)
Example no. 5
    def on_epoch_end(self, net, **kwargs):
        loss = net.history[-1, 'train_loss']
        self.loss_all.append(loss)
        self.cont_epoch += 1

        # early stopping: count consecutive epochs where the loss change is below min_diference
        if len(self.loss_all) > 1:
            if abs(self.loss_all[self.cont_epoch - 1] -
                   self.loss_all[self.cont_epoch - 2]) < self.min_diference:
                self.cont -= 1
            else:
                self.cont = 10  # reset the patience counter

        if self.cont == 0:
            log_exp_run = make_logger()
            log_exp_run.experiments(self.loss_all)
            raise Overfit_Exception()
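
The hook above relies on state (loss_all, cont_epoch, cont, min_diference) that the callback must set up elsewhere, and the Overfit_Exception it raises has to be caught around fit. A minimal sketch of a plausible constructor and registration for a skorch-style callback; the project's actual class may differ:

from skorch.callbacks import Callback

class LossPlateauStopping(Callback):
    """Hypothetical container for the state read by on_epoch_end above."""
    def __init__(self, patience=10, min_diference=1e-4):
        self.loss_all = []
        self.cont_epoch = 0
        self.cont = patience          # remaining epochs of tolerance
        self.min_diference = min_diference

# Sketch of registration and use:
# net = SkorchEstimator(..., callbacks=[LossPlateauStopping()])
# try:
#     net.fit(train_data)
# except Overfit_Exception:
#     pass  # training was stopped; the loss series is already logged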
Example no. 6
def build_dataset_and_dict():
    categories = ['comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware','comp.sys.mac.hardware','comp.windows.x','sci.electronics']
    newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True,categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True,categories=categories)
    texts = newsgroups_train['data']
    labels = newsgroups_train['target']

    log_exp_run = make_logger()
    log_exp_run.experiments("Categories-labels: ")
    log_exp_run.experiments(list(newsgroups_train.target_names))
    log_exp_run.experiments("Dictionary scheme: ")
    log_exp_run.experiments(list(newsgroups_train.keys()))
    log_exp_run.experiments("Number of instances for training: ")
    log_exp_run.experiments(len(newsgroups_train['data']))
    log_exp_run.experiments("Number of instances for testing: ")
    log_exp_run.experiments(len(newsgroups_test['data']))

    removing_stop_words(texts)
    dataset_train = {'features': [], 'labels': []}
    max_sequence_length = 1000
    max_nb_words = 2000
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    log_exp_run.experiments("Found unique tokens: "+str(len(word_index)))

    wdir = os.getcwd()
    if not os.path.exists(wdir + '/datasets/dataset_train_20ng_nosw_six_labels'):
        dataset_train['features']=pad_sequences(sequences_train, maxlen=max_sequence_length)#[0:5]
        dataset_train['labels']=labels#[0:5]
        torch.save(dataset_train, wdir + "/datasets/dataset_train_20ng_nosw_six_labels")

    dataset_test = {'features': [], 'labels': []}
    texts = newsgroups_test['data']
    labels = newsgroups_test['target']
    removing_stop_words(texts)
    sequences_test = tokenizer.texts_to_sequences(texts)

    if not os.path.exists(wdir + '/datasets/dataset_test_20ng_nosw_six_labels'):
        dataset_test['features']=pad_sequences(sequences_test, maxlen=max_sequence_length)#[0:5]
        dataset_test['labels']=labels#[0:5]
        torch.save(dataset_test, wdir + "/datasets/dataset_test_20ng_nosw_six_labels")

    if not os.path.exists(wdir + '/datasets/dictionary_20ng_nosw_six_labels'):
        torch.save(word_index, wdir + "/datasets/dictionary_20ng_nosw_six_labels")
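
The builder persists plain Python dicts and the tokenizer's word_index with torch.save, so the artifacts can be inspected directly. A small sketch, assuming the files were created in the current working directory:

import torch

dataset_train = torch.load("datasets/dataset_train_20ng_nosw_six_labels")
word_index = torch.load("datasets/dictionary_20ng_nosw_six_labels")
print(dataset_train['features'].shape)   # (n_train, 1000) padded token-id matrix
print(len(dataset_train['labels']), "labels,", len(word_index), "dictionary entries")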
Example no. 7
def build_word_embedding(url_pretrained_model):
    model=None
    log_exp_run = make_logger()
    if not os.path.exists(url_pretrained_model + "/word2vec" + "_" + str(EMBEDDING_SIZE) + ".model"):
        # gensim < 4.0 API: 'size' and 'iter' were renamed to 'vector_size' and 'epochs' in gensim >= 4.0
        model = gensim.models.Word2Vec(brown.sents(), size=EMBEDDING_SIZE, window=WINDOW, min_count=MIN_COUNT,
                                       negative=NEGATIVE_SAMPLING, iter=EPOCHS, workers=multiprocessing.cpu_count())
        model.save(url_pretrained_model+"/word2vec"+"_"+str(EMBEDDING_SIZE)+".model")
        log_exp_run.experiments("Created and saved word embedding model with:")
        log_exp_run.experiments("EMBEDDING_SIZE: "+ str(EMBEDDING_SIZE))
        log_exp_run.experiments("DICTIONARY LENGTH: " + str(len(model.wv.vocab)))
    else:
        model=gensim.models.Word2Vec.load(url_pretrained_model+"/word2vec"+"_"+str(EMBEDDING_SIZE)+".model")
        log_exp_run.experiments("Loaded word embedding model with:")
        log_exp_run.experiments("EMBEDDING_SIZE: " + str(EMBEDDING_SIZE))
        log_exp_run.experiments("DICTIONARY LENGTH: " + str(len(model.wv.vocab)))

    return model
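
Unlike the GloVe helpers, this function returns the gensim model itself; to feed an embedding layer it still has to be converted into an index-aligned matrix. A minimal sketch under the same gensim < 4.0 assumption (word_index is a dictionary saved by the dataset builders):

import numpy as np
import torch

def build_embedding_matrix_from_word2vec(model, word_index, embedding_size):
    # Rows of unseen words stay randomly initialized, as in the GloVe helpers
    embedding_matrix = np.random.random((len(word_index) + 1, embedding_size))
    for word, i in word_index.items():
        if word in model.wv.vocab:  # gensim < 4.0; use 'word in model.wv' in gensim >= 4.0
            embedding_matrix[i] = model.wv.get_vector(word)
    return torch.FloatTensor(embedding_matrix)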
Example no. 8
def build_dataset_and_dict():
    os.chdir('../')
    path_dataset = "C:\\Users\\Laptop\\Desktop\\youtube"

    file_train = path_dataset + "/Youtube01-Psy.csv"

    f = open(file_train, "r", encoding="utf8")
    lines = f.readlines()
    lines = lines[1:]
    y = []
    X = []

    for line in lines:
        text, label = extract_text_label(line)
        X.append(text)
        y.append(label)

    file_train = path_dataset + "/Youtube02-KatyPerry.csv"

    f = open(file_train, "r", encoding="utf8")
    lines = f.readlines()
    lines = lines[1:]

    for line in lines:
        text, label = extract_text_label(line)
        X.append(text)
        y.append(label)

    file_train = path_dataset + "/Youtube03-LMFAO.csv"

    f = open(file_train, "r", encoding="utf8")
    lines = f.readlines()
    lines = lines[1:]

    for line in lines:
        text, label = extract_text_label(line)
        X.append(text)
        y.append(label)

    file_train = path_dataset + "/Youtube04-Eminem.csv"

    f = open(file_train, "r", encoding="utf8")
    lines = f.readlines()
    lines = lines[1:]

    for line in lines:
        text, label = extract_text_label(line)
        X.append(text)
        y.append(label)

    file_train = path_dataset + "/Youtube05-Shakira.csv"

    f = open(file_train, "r", encoding="utf8")
    lines = f.readlines()
    lines = lines[1:]

    for line in lines:
        text, label = extract_text_label(line)
        X.append(text)
        y.append(label)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=142)
    bbc_train = {'data': X_train, 'target': y_train}
    bbc_test = {'data': X_test, 'target': y_test}
    texts = bbc_train['data']
    labels_target = bbc_train['target']

    log_exp_run = make_logger()

    log_exp_run.experiments("Number of instances for training: ")
    log_exp_run.experiments(len(bbc_train['data']))
    log_exp_run.experiments("Number of instances for testing: ")
    log_exp_run.experiments(len(bbc_test['data']))

    removing_stop_words(texts)
    dataset_train = {'features': [], 'labels': []}
    max_sequence_length = 1000
    max_nb_words = 2000
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)

    wdir = os.getcwd()
    if not os.path.exists(wdir + '/datasets/dataset_train_youtube_nosw'):
        dataset_train['features'] = pad_sequences(
            sequences_train, maxlen=max_sequence_length)  #[0:5]
        dataset_train['labels'] = labels_target  #[0:5]
        torch.save(dataset_train,
                   wdir + "/datasets/dataset_train_youtube_nosw")

    dataset_test = {'features': [], 'labels': []}
    texts = bbc_test['data']
    labels_target = bbc_test['target']
    removing_stop_words(texts)

    sequences_test = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    log_exp_run.experiments("Found unique tokens: " + str(len(word_index)))

    if not os.path.exists(wdir + '/datasets/dataset_test_youtube_nosw'):
        dataset_test['features'] = pad_sequences(
            sequences_test, maxlen=max_sequence_length)  #[0:5]
        dataset_test['labels'] = labels_target  #[0:5]
        torch.save(dataset_test, wdir + "/datasets/dataset_test_youtube_nosw")

    if not os.path.exists(wdir + '/datasets/dictionary_youtube_nosw'):
        torch.save(word_index, wdir + "/datasets/dictionary_youtube_nosw")
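
extract_text_label is used above but not shown. Judging from the call sites it splits one CSV row of the YouTube Spam Collection into the comment text and its spam label; the following is only a rough, hypothetical sketch that ignores proper CSV quoting:

def extract_text_label(line):
    # Hypothetical sketch: rows are COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS,
    # so the last field is the label and CONTENT sits in between.
    # A naive split like this ignores quoting; the real helper may use csv.reader.
    fields = line.rstrip("\n").split(",")
    label = int(fields[-1])
    text = ",".join(fields[3:-1])  # CONTENT may itself contain commas
    return text, label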
Example no. 9
    def fit(self, X, y=None, **fit_params):
        log_exp_run = make_logger(name="experiment_" + self.mode)

        if not self.warm_start or not self.initialized_:
            self.initialize()

        self.X_ = X

        train_loss_acc=[]
        self.module_.to(self.device)
        optimizer = self.optimizer_
        criterion = self.criterion_
        iter_data = DataLoader(X, batch_size=self.module__batch_size, shuffle=True)

        # Early-stopping settings can be passed directly or nested under 'fit_param'
        patientia = fit_params["patientia"] if fit_params.get('fit_param') is None else fit_params["fit_param"]["patientia"]
        cont_early_stopping = fit_params["patientia"] if fit_params.get('fit_param') is None else fit_params["fit_param"]["patientia"]
        min_diference = fit_params["min_diference"] if fit_params.get('fit_param') is None else fit_params["fit_param"]["min_diference"]
        self.notify('on_train_begin', X=X, y=y)

        log_exp_run.experiments("Run using {} as optimizer".format("Adam" if isinstance(optimizer, Adam) else "SGD"))

        if isinstance(optimizer, Adam):
            log_exp_run.experiments("lr: {}".format(self.lr))
        else:
            log_exp_run.experiments("lr: {} and momentum: {}".format(self.lr, self.optimizer__momentum))

        on_epoch_kwargs = {
            'dataset_train': X,
            'dataset_valid': None,
        }

        for epoch in range(self.max_epochs):
            train_loss = 0
            self.notify('on_epoch_begin',**on_epoch_kwargs)
            for batch in iter_data:
                self.module_.zero_grad()
                x_train = batch['features'].type(torch.LongTensor)
                y_train = batch['labels'].type(torch.LongTensor)
                x_train = x_train.to(self.device)
                y_train = y_train.to(self.device)
                self.notify("on_batch_begin", X=x_train, y=y_train, training=True)
                prob = self.module_(x_train)
                loss = criterion(prob, y_train)
                loss.backward()
                train_loss += loss.item()
                optimizer.step()
                self.notify("on_batch_end", X=x_train, y=y_train, training=True)

            log_exp_run.experiments("Epoch ran: " + str(epoch) + " loss: " + str(train_loss))
            train_loss_acc.append(train_loss)

            self.notify('on_epoch_end',**on_epoch_kwargs)
            # Early stopping: compare the losses of the current and previous epochs
            if len(train_loss_acc) > 1:
                if abs(train_loss_acc[epoch] - train_loss_acc[epoch - 1]) < min_diference:
                    cont_early_stopping -= 1
                else:
                    cont_early_stopping = patientia

            if cont_early_stopping == 0:
                break

        log_exp_run.experiments("Train loss series:")
        log_exp_run.experiments(train_loss_acc)
        self.notify('on_train_end', X=X, y=y)
        return self
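
fit() reads its early-stopping settings from fit_params either at the top level or nested under 'fit_param'. A small, self-contained sketch of that resolution logic (the helper name here is invented for illustration):

def resolve_early_stopping_params(fit_params):
    # Mirrors the lookup performed at the top of fit()
    if fit_params.get('fit_param') is None:
        return fit_params["patientia"], fit_params["min_diference"]
    return fit_params["fit_param"]["patientia"], fit_params["fit_param"]["min_diference"]

print(resolve_early_stopping_params({"patientia": 10, "min_diference": 1e-4}))
print(resolve_early_stopping_params({"fit_param": {"patientia": 10, "min_diference": 1e-4}}))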
Example no. 10
 def initialize_module(self, *args, **kwargs):
     super().initialize_module(*args, **kwargs)
     param_length = sum([p.numel() for p in self.module_.parameters() if p.requires_grad])
     log_exp_run = make_logger(name="experiment_" + self.mode)
     log_exp_run.experiments("Amount of parameters: " + str(param_length))
     return self
Example no. 11
import torch
import os
from utils.custom_dataloader import CustomDataLoader
from utils.logging_custom import make_logger
from utils.file_arguments_reader import load_param_from_file
from scripts.main_gradient_based import train_model_sgd
# NOTE: build_glove_from_pretrained and build_spanish_glove_from_pretrained (used below)
# must also be imported from the module where they are defined

if __name__ == "__main__":
    # Load train arguments from file
    os.chdir("../")
    wdir = os.getcwd() + "/" if not os.path.exists(
        "/home/CLUSTER/uclv_ogtoledano/doctorado/Text_Cat_Based_EDA/"
    ) else "/home/CLUSTER/uclv_ogtoledano/doctorado/Text_Cat_Based_EDA/"  # only slurm cluster
    dic_param = load_param_from_file(wdir + "scripts/arguments.txt")
    log_exp_run = make_logger(name="" +
                              dic_param['name_log_experiments_result'])
    device = "cuda:" + str(
        dic_param['cuda_device_id']) if torch.cuda.is_available() else "cpu"

    # Load pre-trained word embedding model with specific language: Spanish or English
    tensor_embedding = build_spanish_glove_from_pretrained(wdir + 'utils/pretrained_models',
                                                           wdir + 'datasets/' + dic_param['dataset_dictionary']) if \
    dic_param['word_embedding_pretrained_glove_language'] == 'Spanish' \
        else build_glove_from_pretrained(wdir + 'utils/pretrained_models',
                                         wdir + 'datasets/' + dic_param['dataset_dictionary'])

    # Create lazy Dataloader from Tensor dataset
    train_data = CustomDataLoader(wdir + 'datasets/' +
                                  dic_param['dataset_train'])
    test_data = CustomDataLoader(wdir + 'datasets/' +
                                 dic_param['dataset_test'])
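
CustomDataLoader is imported but not shown; given the {'features', 'labels'} batches that fit() and score() iterate over, it presumably wraps one of the saved dataset dicts as a map-style Dataset. A rough, hypothetical sketch:

import torch
from torch.utils.data import Dataset

class CustomDataLoaderSketch(Dataset):
    """Hypothetical stand-in for utils.custom_dataloader.CustomDataLoader:
    wraps a dict saved by the dataset builders and yields the
    {'features', 'labels'} items that fit() and score() consume."""
    def __init__(self, path):
        data = torch.load(path)
        self.features = data['features']
        self.labels = data['labels']

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {'features': torch.as_tensor(self.features[idx]),
                'labels': torch.as_tensor(self.labels[idx])}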
Example no. 12
def build_dataset_and_dict():
    path_dataset = "C:\\Users\\Laptop\\Desktop\\ecured_five_tags"

    labels = ['ciencia', 'cultura', 'deporte', 'historia', 'salud']

    y = []
    X = []

    for i, label in enumerate(labels):
        folder = path_dataset + "/" + label
        total_text = []  # collects every pattern of this label for the token/stop-word statistics
        for file in os.listdir(folder):
            f = open(folder + "/" + file, "r", encoding='UTF-8')
            text = f.read()
            f.close()
            parse = parse_documents_from_html_format(text)
            for pattern in parse.data:
                X.append(pattern)
                total_text.append(pattern)
                y.append(i)
        total_length, count_sw = stop_words_count_and_length(total_text)
        print("Found tokens: {} for label: {}, and count stop-words {}".format(
            total_length, label, count_sw))

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=142)
    bbc_train = {'data': X_train, 'target': y_train}
    bbc_test = {'data': X_test, 'target': y_test}
    texts = bbc_train['data']
    labels_target = bbc_train['target']
    os.chdir("../")
    log_exp_run = make_logger()

    log_exp_run.experiments("Number of instances for training: ")
    log_exp_run.experiments(len(bbc_train['data']))
    log_exp_run.experiments("Number of instances for testing: ")
    log_exp_run.experiments(len(bbc_test['data']))

    removing_stop_words(texts)
    dataset_train = {'features': [], 'labels': []}
    max_sequence_length = 1000
    max_nb_words = 2000
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index

    log_exp_run.experiments("Found unique tokens: " + str(len(word_index)))

    wdir = os.getcwd()
    if not os.path.exists(wdir + '/datasets/dataset_train_ecured_nosw'):
        dataset_train['features'] = pad_sequences(
            sequences_train, maxlen=max_sequence_length)  #[0:5]
        dataset_train['labels'] = labels_target  #[0:5]
        torch.save(dataset_train, wdir + "/datasets/dataset_train_ecured_nosw")

    dataset_test = {'features': [], 'labels': []}
    texts = bbc_test['data']
    labels_target = bbc_test['target']
    removing_stop_words(texts)
    sequences_test = tokenizer.texts_to_sequences(texts)

    if not os.path.exists(wdir + '/datasets/dataset_test_ecured_nosw'):
        dataset_test['features'] = pad_sequences(
            sequences_test, maxlen=max_sequence_length)  #[0:5]
        dataset_test['labels'] = labels_target  #[0:5]
        torch.save(dataset_test, wdir + "/datasets/dataset_test_ecured_nosw")

    if not os.path.exists(wdir + '/datasets/dictionary_ecured_nosw'):
        torch.save(word_index, wdir + "/datasets/dictionary_ecured_nosw")
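
stop_words_count_and_length is used above but not shown; it apparently returns the token count of a list of texts together with how many of those tokens are stop-words. A hypothetical sketch based on NLTK (the real helper may tokenize differently or use another stop-word list):

from nltk.corpus import stopwords

def stop_words_count_and_length(texts, language='spanish'):
    # Hypothetical: count whitespace tokens and how many of them are stop-words
    sw = set(stopwords.words(language))
    tokens = [token.lower() for text in texts for token in text.split()]
    count_sw = sum(1 for token in tokens if token in sw)
    return len(tokens), count_sw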
Example no. 13
def build_dataset_and_dict():
    path_dataset = "C:\\Users\\StarWar\\Desktop\\Cora_enrich"

    file_train = path_dataset + "/texts.txt"
    file_labels = path_dataset + "/labels.txt"

    with open(file_train, "r") as f:
        X = f.readlines()
    with open(file_labels, "r") as f:
        y = f.readlines()

    vocab = {}
    index = 0
    for i in range(len(y)):
        word = y[i].split("\n")[0].lower()
        if word not in vocab:
            vocab[word] = index
            index += 1

    for i in range(len(y)):
        word = y[i].split("\n")[0].lower()
        if word in vocab:
            y[i] = vocab[word]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=142)
    bbc_train = {'data': X_train, 'target': y_train}
    bbc_test = {'data': X_test, 'target': y_test}
    texts = bbc_train['data']
    labels_target = bbc_train['target']

    log_exp_run = make_logger()
    log_exp_run.experiments("Labels: ")
    log_exp_run.experiments(vocab)
    log_exp_run.experiments("Number of instances for training: ")
    log_exp_run.experiments(len(bbc_train['data']))
    log_exp_run.experiments("Number of instances for testing: ")
    log_exp_run.experiments(len(bbc_test['data']))

    removing_stop_words(texts)
    dataset_train = {'features': [], 'labels': []}
    max_sequence_length = 1000
    max_nb_words = 2000
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    log_exp_run.experiments("Found unique tokens: " + str(len(word_index)))

    wdir = os.getcwd()
    if not os.path.exists(wdir + '/datasets/dataset_train_cora_enrich_nosw'):
        dataset_train['features'] = pad_sequences(
            sequences_train, maxlen=max_sequence_length)  #[0:5]
        dataset_train['labels'] = labels_target  #[0:5]
        torch.save(dataset_train,
                   wdir + "/datasets/dataset_train_cora_enrich_nosw")

    dataset_test = {'features': [], 'labels': []}
    texts = bbc_test['data']
    labels_target = bbc_test['target']
    removing_stop_words(texts)
    sequences_test = tokenizer.texts_to_sequences(texts)

    if not os.path.exists(wdir + '/datasets/dataset_test_cora_enrich_nosw'):
        dataset_test['features'] = pad_sequences(
            sequences_test, maxlen=max_sequence_length)  #[0:5]
        dataset_test['labels'] = labels_target  #[0:5]
        torch.save(dataset_test,
                   wdir + "/datasets/dataset_test_cora_enrich_nosw")

    if not os.path.exists(wdir + '/datasets/dictionary_cora_enrich_nosw'):
        torch.save(word_index, wdir + "/datasets/dictionary_cora_enrich_nosw")
Example no. 14
def build_dataset_and_dict():
    path_dataset = "C:\\Users\\StarWar\\Desktop\\AGnews"
    X_train = []
    y_train = []

    X_test = []
    y_test = []
    labels = ['World', 'Sports', 'Business', 'Sci/Tech']

    file_train = path_dataset + "/train.txt"
    file_test = path_dataset + "/test.txt"

    f = open(file_train, "r")
    lines = f.readlines()
    f.close()
    for line in lines:
        text = line.split(',')
        X_train.append(text[1] + " " + text[2])  # title and description fields
        y_train.append(text[0][1])  # text[0] is the quoted class id, e.g. '"3"'; character 1 is the digit

    f = open(file_test, "r")
    lines = f.readlines()
    f.close()
    for line in lines:
        text = line.split(',')
        X_test.append(text[1] + " " + text[2])
        y_test.append(text[0][1])

    bbc_train = {'data': X_train, 'target': y_train}
    bbc_test = {'data': X_test, 'target': y_test}

    texts = bbc_train['data']
    labels_target = bbc_train['target']

    print(texts[0])

    log_exp_run = make_logger()
    log_exp_run.experiments("Categories-labels: ")
    log_exp_run.experiments(labels)
    log_exp_run.experiments("Number of instances for training: ")
    log_exp_run.experiments(len(bbc_train['data']))
    log_exp_run.experiments("Number of instances for testing: ")
    log_exp_run.experiments(len(bbc_test['data']))

    removing_stop_words(texts)
    dataset_train = {'features': [], 'labels': []}
    max_sequence_length = 1000
    max_nb_words = 2000
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    log_exp_run.experiments("Found unique tokens: " + str(len(word_index)))

    wdir = os.getcwd()
    if not os.path.exists(wdir + '/datasets/dataset_train_ag_news_nosw'):
        dataset_train['features'] = pad_sequences(
            sequences_train, maxlen=max_sequence_length)  #[0:5]
        dataset_train['labels'] = labels_target  #[0:5]
        torch.save(dataset_train,
                   wdir + "/datasets/dataset_train_ag_news_nosw")

    dataset_test = {'features': [], 'labels': []}
    texts = bbc_test['data']
    labels_target = bbc_test['target']
    removing_stop_words(texts)
    sequences_test = tokenizer.texts_to_sequences(texts)

    if not os.path.exists(wdir + '/datasets/dataset_test_ag_news_nosw'):
        dataset_test['features'] = pad_sequences(
            sequences_test, maxlen=max_sequence_length)  #[0:5]
        dataset_test['labels'] = labels_target  #[0:5]
        torch.save(dataset_test, wdir + "/datasets/dataset_test_ag_news_nosw")

    if not os.path.exists(wdir + '/datasets/dictionary_ag_news_nosw'):
        torch.save(word_index, wdir + "/datasets/dictionary_ag_news_nosw")
Example no. 15
def build_dataset_and_dict():
    os.chdir('../')
    path_dataset = "C:\\Users\\Laptop\\Desktop\\bbc"
    X = []
    y = []
    labels = ['business', 'entertainment', 'politics', 'sport', 'tech']

    for i, label in enumerate(labels):
        folder = path_dataset + "/" + label
        for file in os.listdir(folder):
            f = open(folder + "/" + file, "r")
            text = f.read()
            f.close()
            X.append(text)
            y.append(i)

    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=12345)
    bbc_train = {'data': x_train, 'target': y_train}
    bbc_test = {'data': x_test, 'target': y_test}

    texts = bbc_train['data']
    labels_target = bbc_train['target']

    print(texts[0])

    log_exp_run = make_logger()
    log_exp_run.experiments("Categories-labels: ")
    log_exp_run.experiments(labels)
    log_exp_run.experiments("Number of instances for training: ")
    log_exp_run.experiments(len(bbc_train['data']))
    log_exp_run.experiments("Number of instances for testing: ")
    log_exp_run.experiments(len(bbc_test['data']))

    removing_stop_words(texts)
    dataset_train = {'features': [], 'labels': []}
    max_sequence_length = 1000
    max_nb_words = 2000
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)

    wdir = os.getcwd()
    if not os.path.exists(wdir + '/datasets/dataset_train_bbc_news_nosw'):
        dataset_train['features'] = pad_sequences(
            sequences_train, maxlen=max_sequence_length)  #[0:5]
        dataset_train['labels'] = labels_target  #[0:5]
        torch.save(dataset_train,
                   wdir + "/datasets/dataset_train_bbc_news_nosw")

    dataset_test = {'features': [], 'labels': []}
    texts = bbc_test['data']
    labels_target = bbc_test['target']
    removing_stop_words(texts)

    sequences_test = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    log_exp_run.experiments("Found unique tokens: " + str(len(word_index)))

    if not os.path.exists(wdir + '/datasets/dataset_test_bbc_news_nosw'):
        dataset_test['features'] = pad_sequences(
            sequences_test, maxlen=max_sequence_length)  #[0:5]
        dataset_test['labels'] = labels_target  #[0:5]
        torch.save(dataset_test, wdir + "/datasets/dataset_test_bbc_news_nosw")

    if not os.path.exists(wdir + '/datasets/dictionary_bbc_news_nosw'):
        torch.save(word_index, wdir + "/datasets/dictionary_bbc_news_nosw")
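
removing_stop_words is called throughout these builders without using a return value, so it presumably rewrites the documents in place. A hypothetical sketch based on NLTK stop-word lists (the real helper may differ):

from nltk.corpus import stopwords

def removing_stop_words(texts, language='english'):
    # Hypothetical: rewrite each document in place with its stop-words removed
    sw = set(stopwords.words(language))
    for i, text in enumerate(texts):
        texts[i] = " ".join(token for token in text.split() if token.lower() not in sw)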