Esempi in Python per read_file, esempi in Python per preprocesing.read_file

Esempio n. 1

0

Mostra file

File: models.py Progetto: costisst/TextMiningCW01

def bilstm_testing(path_test, path_model, path_eval_result, lowercase, pre_train, D):
    """Load a saved model and build the word/tag index maps for the test data.

    Args:
        path_test: path handed to ``read_file`` to load the test examples.
        path_model: path handed to ``torch.load`` to restore the model.
        path_eval_result: not used in the visible part of this function.
        lowercase: forwarded to ``create_questions_tag_dict``.
        pre_train: the string ``'False'`` selects the corpus-built
            dictionary; any other value loads the pretrained vectors.
        D: not used in the visible part of this function.
    """
    data = read_file(path_test)
    # Remove duplicated examples
    data = set(data)

    model = torch.load(path_model)
    model.eval()

    # Create dictionary based on data
    testing_questions, testing_tags, my_dictionary = create_questions_tag_dict(data, lowercase)

    if pre_train == 'False':
        # Preprocess the dictionary
        my_dictionary = build_dictionary(my_dictionary)

    else:
        # Create dictionary with pretrained vectors
        glove_dict, glove_dict_words = pre_trained_dictionary()

        # Pretrained vectors, in the iteration order of glove_dict
        pretrained_vectors = list(glove_dict.values())

        weights = torch.FloatTensor(pretrained_vectors)

    data = prepare_data(testing_questions, testing_tags)

    # Tag dict. This used to read ``tags``, which is only bound by the loop
    # further down and therefore raised UnboundLocalError at this point.
    unique_tags = set(testing_tags)

    # Assign each distinct word the next free index, in first-seen order.
    # The per-sentence tags are not needed here, so they are not bound.
    word_to_ix = {}
    for sent, _sentence_tags in data:
        for word in sent:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    tag_to_ix = prepare_tags_to_ix(unique_tags)

Esempio n. 2

0

Mostra file

File: models.py Progetto: costisst/TextMiningCW01

def bag_of_words(question,my_dictionary):
    """Convert a question into a list of vocabulary indices.

    The question is split on single spaces, empty tokens are dropped, each
    token is passed through ``clean_numbers``, stop words are removed and
    every token missing from ``my_dictionary`` is replaced by ``'#UNK#'``.

    Args:
        question: the question text.
        my_dictionary: mapping from word to an index (anything ``int()``
            accepts).

    Returns:
        A list of ``int`` indices, one per surviving token.

    Raises:
        KeyError: if an unknown token occurs and ``'#UNK#'`` is not a key of
            ``my_dictionary``.
    """
    # A set gives O(1) membership tests instead of scanning a list per token.
    stopwords = set(read_file('../data/stopwords.txt'))

    # split(' ') can produce '' (leading/trailing/repeated spaces) but never
    # ' ' itself, so only the empty string has to be filtered out.
    words = [clean_numbers(w) for w in question.split(' ') if w != '']
    words = [w for w in words if w not in stopwords]
    words = [w if w in my_dictionary else '#UNK#' for w in words]

    return [int(my_dictionary[w]) for w in words]

Esempio n. 3

0

Mostra file

File: models.py Progetto: costisst/TextMiningCW01

def create_train_dev_set(config):
    """Split ``data/data.txt`` into ``data/train.txt`` and ``data/dev.txt``.

    Duplicate examples are removed, then the first 90% of the remaining
    examples go to the training file and the rest to the dev file, one
    example per line. ``config`` is accepted but not read.
    """
    # Passing through a set drops repeated examples
    unique_examples = list(set(read_file("data/data.txt")))

    # Index where the training part ends and the dev part begins
    cut = math.floor(0.9*len(unique_examples))

    outputs = (
        ('data/train.txt', unique_examples[:cut]),
        ('data/dev.txt', unique_examples[cut:]),
    )
    for target_path, examples in outputs:
        with open(target_path, 'w') as out:
            out.writelines("%s\n" % example for example in examples)

Esempio n. 4

0

Mostra file

File: test_qc2_batch.py Progetto: costisst/TextMiningCW01

"""
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np
from preprocesing import read_file
from models import create_questions_tag_dict, bag_of_words, pre_trained_dictionary
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable

if __name__ == '__main__':
    # Load data
    data = read_file("data/data.txt")
    # Remove duplicates first (dict.fromkeys keeps first-seen order) and only
    # then apply the seeded shuffle. Shuffling before converting to a set, as
    # was done previously, discarded the shuffled order again.
    data = list(dict.fromkeys(data))
    random.Random(5).shuffle(data)
    D = 300
    torch.manual_seed(5)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #    device = torch.device("cpu")

    # Create question and tag lists
    questions, tags, _ = create_questions_tag_dict(data, True)

    # Create dictionary with pretrained vectors
    glove_dict, glove_dict_words = pre_trained_dictionary()

    # Get number of words in dict
    N = len(glove_dict_words)

Esempio n. 5

0

Mostra file

def _read_examples(path):
    """Read a data file with ``read_file`` and drop one blank entry if present."""
    examples = read_file(path)
    # list.remove raises ValueError when the value is absent, so guard it.
    if '' in examples:
        examples.remove('')
    return examples


def _encode_examples(questions, tags, vocabulary, tag_dict):
    """Return (word-index lists, tag indices) for paired questions and tags."""
    bow_vectors = []
    targets = []
    for question, tag in zip(questions, tags):
        # Bag of words on the question
        bow_vectors.append(bag_of_words(question, vocabulary))
        # Get target index value from target dictionary
        targets.append(tag_dict[tag])
    return bow_vectors, targets


def bow(config, train_or_test):
    """Train or evaluate the bag-of-words feed-forward classifier.

    Args:
        config: object providing ``path_train``, ``path_dev``, ``path_test``,
            ``lowercase``, ``word_embedding_dim``, ``pre_train`` (the string
            ``'True'`` or ``'False'``) and ``batch_size``; it is also passed
            on to the model and to the train/test routines.
        train_or_test: ``'train'`` runs ``train_model``; ``'test'`` runs
            ``test_model`` and prints its result. Any other value only
            builds the data and the model.

    Raises:
        KeyError: if a dev or test tag does not occur in the training tags.
    """
    # Only call this once to create your train/dev dataset:
    #create_train_dev_set(config)

    # Read train, dev and test data from files
    train_data = _read_examples(config.path_train)
    dev_data = _read_examples(config.path_dev)
    test_data = _read_examples(config.path_test)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create questions and tags list and the vocabulary
    questions_train, tags_train, vocabulary = create_questions_tag_dict(
        train_data, config.lowercase)
    # The dev split was previously built from train_data, so validation ran
    # on the training examples and dev_data was never used.
    questions_dev, tags_dev, _ = create_questions_tag_dict(
        dev_data, config.lowercase)
    questions_test, tags_test, _ = create_questions_tag_dict(
        test_data, config.lowercase)

    EMBEDDING_DIM = config.word_embedding_dim
    pretrained_vectors = []
    torch.manual_seed(5)

    if config.pre_train == 'False':
        # Preprocess the dictionary
        vocabulary = build_dictionary(vocabulary)
        vocabulary['Padding'] = len(vocabulary)
    elif config.pre_train == 'True':
        EMBEDDING_DIM = 300
        # Create dictionary with pretrained vectors
        glove_dict, vocabulary = pre_trained_dictionary()
        # Pretrained vectors, in the iteration order of glove_dict
        pretrained_vectors = list(glove_dict.values())
        # Add vector of zeroes to represent padding in vocabulary
        pretrained_vectors.append(np.zeros([EMBEDDING_DIM]))

    # Get number of words in dict
    N = len(vocabulary)

    # Target dictionary, each target is assigned to a different number.
    # Sorting makes the numbering identical on every run; enumerating a bare
    # set can order string tags differently between a 'train' run and a
    # later 'test' run. (Assumes the tags are mutually comparable, e.g. all
    # strings - TODO confirm.)
    unique_tags = set(tags_train)
    tag_dict = {tag: i for i, tag in enumerate(sorted(unique_tags))}

    # Prepare data for Neural Network
    bow_list, target_list = _encode_examples(
        questions_train, tags_train, vocabulary, tag_dict)
    bow_list_validation, target_list_validation = _encode_examples(
        questions_dev, tags_dev, vocabulary, tag_dict)
    bow_list_test, target_list_test = _encode_examples(
        questions_test, tags_test, vocabulary, tag_dict)

    # Create model
    model = FeedForwardNN(pretrained_vectors, config, N).to(device)

    batch_size = config.batch_size
    number_of_batches = math.ceil(len(bow_list) / batch_size)

    if train_or_test == 'train':
        # Training
        train_model(number_of_batches, bow_list, target_list,
                    bow_list_validation, target_list_validation, model, device,
                    N, config)
    elif train_or_test == 'test':
        # Testing
        # NOTE(review): the freshly built model is not passed to test_model;
        # presumably test_model loads a saved one via config - confirm.
        metrics = test_model(bow_list_test, target_list_test, device, tag_dict,
                             config)
        print(metrics)

Esempio n. 6

0

Mostra file

File: models.py Progetto: costisst/TextMiningCW01

def bilstm_training(config):
    """Train a bidirectional LSTM tagger and save it to ``kostis.bow``.

    The examples read from ``config.path_train`` are split 90/10 into a
    training part and a held-out part. After every epoch the accuracy on
    both parts is printed; a sample counts as correct when the prediction
    for its last position equals the last target.

    Args:
        config: object providing ``path_train``, ``path_dev``, ``lowercase``,
            ``word_embedding_dim``, ``pre_train`` (the string ``'False'``
            trains embeddings from scratch, any other value uses pretrained
            vectors with dimension 300), ``lr_param`` and ``epoch``.
    """
    # Only call this once to create your train/dev files:
    #create_train_dev_set(config)

    # Read train and dev data from files. list.remove raises ValueError when
    # the value is absent, so the blank entry is only removed if present.
    training_data = read_file(config.path_train)
    if '' in training_data:
        training_data.remove('')
    # NOTE(review): dev_data is loaded but never used below; the held-out
    # split is carved out of the training examples instead - confirm intent.
    dev_data = read_file(config.path_dev)
    if '' in dev_data:
        dev_data.remove('')

    # Create dictionary based on data
    questions, tags, my_dictionary = create_questions_tag_dict(training_data, config.lowercase)

    use_pretrained = config.pre_train != 'False'

    EMBEDDING_DIM = config.word_embedding_dim
    HIDDEN_DIM = config.word_embedding_dim
    weights = None

    if not use_pretrained:
        # Preprocess the dictionary
        my_dictionary = build_dictionary(my_dictionary)
    else:
        EMBEDDING_DIM = 300
        HIDDEN_DIM = 300

        # Create dictionary with pretrained vectors
        glove_dict, glove_dict_words = pre_trained_dictionary()

        # Pretrained vectors, in the iteration order of glove_dict
        pretrained_vectors = list(glove_dict.values())
        weights = torch.FloatTensor(pretrained_vectors)
        # NOTE(review): word_to_ix below is built from the training data, not
        # from glove_dict_words, so its indices are not visibly tied to the
        # rows of ``weights`` - confirm that they line up.

    data = prepare_data(questions, tags)

    # Tag dict
    unique_tags = set(tags)

    # Assign each distinct word the next free index, in first-seen order
    word_to_ix = {}
    for sent, _sentence_tags in data:
        for word in sent:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    tag_to_ix = prepare_tags_to_ix(unique_tags)

    split_at = math.floor(0.9*len(data))
    training_data = data[:split_at]
    testing_data = data[split_at:]

    class LSTMTagger(nn.Module):
        # When pretrained vectors are used, ``vocab_size`` carries the
        # weight tensor instead of an integer vocabulary size.
        def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
            '''Returns an biLSTM model.'''
            super(LSTMTagger, self).__init__()
            self.hidden_dim = hidden_dim
            if config.pre_train == 'False':
                self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
            else:
                self.word_embeddings = nn.Embedding.from_pretrained(vocab_size)

            # The LSTM takes word embeddings as inputs, and outputs hidden states
            # with dimensionality hidden_dim.
            self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

            # The linear layer that maps from hidden state space to tag space;
            # the input is doubled because both directions are concatenated.
            self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)

        def forward(self, sentence):
            '''Passes the questions through the model'''
            embeds = self.word_embeddings(sentence)
            lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
            tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
            tag_scores = F.log_softmax(tag_space, dim=1)
            return tag_scores

        def save_model(self):
            # NOTE(review): this pickles the whole module; instances of a
            # class defined inside a function are typically not picklable -
            # confirm that this save succeeds.
            torch.save(self, "kostis.bow")

    if not use_pretrained:
        model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
    else:
        model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, weights, len(tag_to_ix))
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), config.lr_param)

    test_score = []
    train_score = []
    for epoch in range(config.epoch):
        count_samples = 0
        count_correct = 0
        count_samples_test = 0
        count_correct_test = 0
        for sentence, tags in training_data:
            count_samples += 1
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = prepare_sequence(tags, tag_to_ix)

            # Step 3. Run our forward pass.
            tag_scores = model(sentence_in)

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)

            # exp() is monotonic, so the argmax of the log-probabilities is
            # the predicted class.
            if torch.eq(targets[-1], tag_scores[-1].argmax()):
                count_correct += 1

            loss.backward()
            optimizer.step()

        # Held-out evaluation: no gradients are needed, so skip graph building
        with torch.no_grad():
            for sentence, tags in testing_data:
                count_samples_test += 1

                sentence_in = prepare_sequence(sentence, word_to_ix)
                targets = prepare_sequence(tags, tag_to_ix)

                tag_scores = model(sentence_in)

                if torch.eq(targets[-1], tag_scores[-1].argmax()):
                    count_correct_test += 1

        # Guard against an empty split instead of dividing by zero
        train_accuracy = count_correct/count_samples if count_samples else 0.0
        test_accuracy = count_correct_test/count_samples_test if count_samples_test else 0.0

        test_score.append([test_accuracy])
        train_score.append([train_accuracy])
        print("Epochs: {0}".format(epoch))
        # These values are accuracies; they were previously labelled "loss".
        print("Training accuracy: {0}".format(train_accuracy))
        print("Testing accuracy: {0}".format(test_accuracy))

    model.save_model()

Esempio n. 7

0

Mostra file

import sys
from preprocesing import read_file
from models import bilstm_training,bilstm_testing
from config import Config
from bow_model import *


if __name__ == '__main__':
    arguments = sys.argv[1:]
    
    if len(arguments) < 2:
        print('Please provide train/test -config and the config file path')
        exit(1)
        
    config_path = arguments[2]
    conf = read_file(config_path)
    config = Config(conf)
        
    if arguments[0] == 'train':
        # Training
        if config.model == 'bow':
            # Bow training
            print('\nBow_Training\n------------')
            bow(config,'train')
        if config.model == 'bilstm':
            # BiLSTM training
            print('\nbiLSTM_Training\n------------')
            bilstm_training(config)        
        
    elif arguments[0] == 'test':
        # Testing