Example #1
    def setup(self, stage=None):
        """
        Downloads the data, parse it and split the data into train, test, validation data

        :param stage: Stage - training or testing
        """
        # download the AG_NEWS train/test CSVs into ./data
        td.AG_NEWS(root="data", split=("train", "test"))
        extracted_files = os.listdir("data")

        train_csv_path = None
        for fname in extracted_files:
            if fname.endswith("train.csv"):
                train_csv_path = os.path.join(os.getcwd(), "data", fname)
                break

        # the CSV has no header row, so pass the column names explicitly
        df = pd.read_csv(
            train_csv_path,
            header=None,
            names=["label", "title", "description"],
        )
        # shuffle, then keep the first NUM_SAMPLES_COUNT rows
        df = df.sample(frac=1).reset_index(drop=True)
        df = df.iloc[:self.NUM_SAMPLES_COUNT]

        df["label"] = df.label.apply(self.process_label)

        if not os.path.isfile(self.VOCAB_FILE):
            # fetch the BERT vocab file if it is not cached locally
            response = requests.get(self.VOCAB_FILE_URL, allow_redirects=True)
            if response.ok:
                with open(self.VOCAB_FILE, "wb") as f:
                    f.write(response.content)
            else:
                raise RuntimeError("Error in fetching the vocab file")

        self.tokenizer = BertTokenizer(self.VOCAB_FILE)

        RANDOM_SEED = 42
        seed_everything(RANDOM_SEED)

        df_train, df_test = train_test_split(
            df,
            test_size=0.2,
            random_state=RANDOM_SEED,
            stratify=df["label"],
        )
        df_train, df_val = train_test_split(
            df_train,
            test_size=0.25,
            random_state=RANDOM_SEED,
            stratify=df_train["label"],
        )

        self.df_train = df_train
        self.df_test = df_test
        self.df_val = df_val
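The two stratified splits yield a 60/20/20 train/validation/test partition (0.8 × 0.75 = 0.6 of the rows stay in train). A minimal usage sketch follows; the class name NewsDataModule is a placeholder for whatever LightningDataModule subclass this method belongs to, not part of the original example:

    # Hypothetical usage: NewsDataModule is an assumed wrapper class.
    dm = NewsDataModule()
    dm.setup()
    print(len(dm.df_train), len(dm.df_val), len(dm.df_test))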
Example #2
def get_ag_news(num_samples):
    # download the AG_NEWS train/test CSVs into ./data
    td.AG_NEWS(root="data", split=("train", "test"))
    train_csv_path = "data/AG_NEWS/train.csv"
    return (
        pd.read_csv(train_csv_path, usecols=[0, 2],
                    names=["label", "description"])
        .assign(label=lambda df: df["label"] - 1)  # make labels zero-based
        .sample(n=num_samples)
    )
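A quick sanity check of the returned frame, as a sketch (assumes the download above has already populated ./data):

    df = get_ag_news(num_samples=1000)
    print(df["label"].value_counts())  # four roughly balanced classes, labels 0-3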
Example #3
    def prepare_data(self):
        """
        Creates train, valid and test dataloaders from the csv data
        """
        td.AG_NEWS(root="data", split=("train", "test"))
        extracted_files = os.listdir("data/AG_NEWS")

        train_csv_path = None
        for fname in extracted_files:
            if fname.endswith("train.csv"):
                train_csv_path = os.path.join(os.getcwd(), "data/AG_NEWS",
                                              fname)
                break

        # the CSV has no header row, so pass the column names explicitly
        self.df = pd.read_csv(train_csv_path,
                              header=None,
                              names=["label", "title", "description"])
        # shuffle, then keep the first NUM_SAMPLES_COUNT rows
        self.df = self.df.sample(frac=1).reset_index(drop=True)
        self.df = self.df.iloc[:self.NUM_SAMPLES_COUNT]

        self.df["label"] = self.df.label.apply(self.process_label)

        if not os.path.isfile(self.VOCAB_FILE):
            # fetch the BERT vocab file if it is not cached locally
            response = requests.get(self.VOCAB_FILE_URL,
                                    allow_redirects=True)
            if response.ok:
                with open(self.VOCAB_FILE, "wb") as f:
                    f.write(response.content)
            else:
                raise RuntimeError("Error in fetching the vocab file")

        self.tokenizer = BertTokenizer(self.VOCAB_FILE)

        RANDOM_SEED = 42
        np.random.seed(RANDOM_SEED)
        torch.manual_seed(RANDOM_SEED)

        self.df_train, self.df_test = train_test_split(
            self.df,
            test_size=0.1,
            random_state=RANDOM_SEED,
            stratify=self.df["label"])
        self.df_val, self.df_test = train_test_split(
            self.df_test,
            test_size=0.5,
            random_state=RANDOM_SEED,
            stratify=self.df_test["label"])

        self.train_data_loader = self.create_data_loader(
            self.df_train, self.tokenizer, self.MAX_LEN, self.BATCH_SIZE)
        self.val_data_loader = self.create_data_loader(self.df_val,
                                                       self.tokenizer,
                                                       self.MAX_LEN,
                                                       self.BATCH_SIZE)
        self.test_data_loader = self.create_data_loader(
            self.df_test, self.tokenizer, self.MAX_LEN, self.BATCH_SIZE)
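These two splits give a 90/5/5 train/validation/test partition: 10% is held out first, then divided evenly between validation and test. A hedged sketch of pulling one batch from the resulting loader; `module` stands for an instance of the surrounding class and is an assumption, not part of the original snippet:

    # Sketch: `module` is a hypothetical instance of the surrounding class.
    module.prepare_data()
    batch = next(iter(module.train_data_loader))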
Example #4
import json
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchtext import data, datasets
from torchtext.vocab import Vocab

from sentiment_classification.model import TextClassificationModel

SEED = 0
PATH = './model_path/sentiment_model.pth'
model_metadata = './model_path/metadata.json'
vocab_data = './model_path/vocab.pk'

torch.manual_seed(SEED)

tokenizer = data.utils.get_tokenizer('basic_english')
train_iter, test_iter = datasets.AG_NEWS(split=('train', 'test'))
counter = Counter()
label_set = set()
for (label_, line) in train_iter:
    counter.update(tokenizer(line))
    label_set.add(label_)
vocab = Vocab(counter, min_freq=1)
vocab_size = len(vocab)
EMBEDDING_SIZE = 64
label_size = len(label_set)

with open(model_metadata, 'w') as fp:
    json.dump(
        {
            'embedding_size': EMBEDDING_SIZE,
            'label_size': label_size,
            'vocab_size': vocab_size,
        },
        fp,
    )
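To show how this vocabulary is used downstream, here is a small numericalization sketch in the same legacy torchtext style as the snippet above; the sample headline is arbitrary:

    # Sketch: map a raw headline to token ids with the vocab built above.
    sample = "Wall St. Bears Claw Back Into the Black"
    token_ids = [vocab[token] for token in tokenizer(sample)]
    print(token_ids)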