def get_imdb_data(embedding_size=50):
    """Return everything needed for training:
    - word-to-ID dictionary
    - embeddings (GloVe)
    - datasets (FolderText)
    """
    WORDS = re.compile(r"\S+")

    logging.info("Loading embeddings")
    words, embeddings = prepare_dataset(
        'edu.stanford.glove.6b.%d' % embedding_size).load()
    OOVID = len(words)
    words.append("__OOV__")
    word2id = {word: ix for ix, word in enumerate(words)}
    # Null embedding for out-of-vocabulary words
    embeddings = np.vstack((embeddings, np.zeros(embedding_size)))

    def tokenizer(t):
        return [word2id.get(x, OOVID) for x in re.findall(WORDS, t.lower())]

    logging.info("Get the IMDB dataset")
    ds = prepare_dataset("edu.stanford.aclimdb")
    return word2id, embeddings, \
        FolderText(ds.train.classes, ds.train.path, tokenizer, load=False), \
        FolderText(ds.test.classes, ds.test.path, tokenizer, load=False)
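# Usage sketch (illustrative, not from the original source): load the
# vocabulary, the GloVe matrix and the train/test FolderText datasets.
word2id, embeddings, train_data, test_data = get_imdb_data(embedding_size=50)
print(len(word2id), embeddings.shape, len(train_data), len(test_data))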
def _msmarco(part: str):
    return Adhoc(
        documents=_msmarco_docs(),
        topics=prepare_dataset(
            f"com.microsoft.msmarco.passage.{part}.queries"),
        assessments=prepare_dataset(
            f"com.microsoft.msmarco.passage.{part}.qrels"),
    )
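# Usage sketch (illustrative): "dev" is one of the standard MS MARCO passage
# subsets; the returned Adhoc object bundles documents, topics and qrels.
msmarco_dev = _msmarco("dev")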
def __init__(self):
    self.index_stem = prepare_dataset(
        "ca.uwaterloo.jimmylin.anserini.robust04")
    self.topics = prepare_dataset("gov.nist.trec.adhoc.robust.2004.topics")
    self.qrels = prepare_dataset("gov.nist.trec.adhoc.robust.2004.qrels")

    # FIXME: parallelize (when experimaestro supports this)
    self.docstore = BuildDocStore(index=self.index_stem).submit()
    self.index = Reindex(index=self.index_stem).submit()
def get(subset: str):
    topics = prepare_dataset(
        f"com.microsoft.msmarco.passage.{subset}.queries")
    qrels = prepare_dataset(
        f"com.microsoft.msmarco.passage.{subset}.qrels")
    assessed_topics = TrecAssessedTopics(topics=topics, assessments=qrels)
    return MsmarcoDataset(docstore=docstore, index=index,
                          index_stem=index_stem,
                          assessed_topics=assessed_topics)
def fold(name: str):
    """Return topics and assessments for a given fold

    Folds are trf1 to trf5 (train), vaf1 to vaf5 (validation)
    and f1 to f5 (test)
    """
    topics = prepare_dataset("gov.nist.trec.adhoc.robust.2004.topics")
    qrels = prepare_dataset("gov.nist.trec.adhoc.robust.2004.qrels")
    fold_topics = AdhocTopicFold(topics=topics, ids=sorted(FOLDS[name]))
    fold_qrels = AdhocAssessmentFold(qrels=qrels, ids=sorted(FOLDS[name]))
    return fold_topics, fold_qrels
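# Usage sketch (illustrative), following the fold naming in the docstring:
# train on trf1, validate on vaf1, evaluate on f1.
train_topics, train_qrels = fold("trf1")
val_topics, val_qrels = fold("vaf1")
test_topics, test_qrels = fold("f1")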
def get_dataloaders_and_vocabs(batch_size):
    ds = prepare_dataset('org.universaldependencies.french.gsd')
    words = VocabularyTagging(True)
    tags = VocabularyTagging(False)
    train_dataset = TaggingDataset(ds.files['train'], words, tags, True)
    val_dataset = TaggingDataset(ds.files['dev'], words, tags, False)
    test_dataset = TaggingDataset(ds.files['test'], words, tags, False)

    kwargs = dict(collate_fn=TaggingDataset.collate,
                  pin_memory=torch.cuda.is_available(),
                  num_workers=torch.multiprocessing.cpu_count())
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              shuffle=True, **kwargs)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,
                            shuffle=True, **kwargs)
    test_loader = DataLoader(test_dataset, batch_size=batch_size,
                             shuffle=True, **kwargs)
    return train_loader, val_loader, test_loader, words, tags
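# Usage sketch (illustrative): build the loaders and pull one batch;
# the batch layout is whatever TaggingDataset.collate produces.
train_loader, val_loader, test_loader, words, tags = \
    get_dataloaders_and_vocabs(batch_size=32)
batch = next(iter(train_loader))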
def cli(port, workdir, dataset, debug):
    """Runs an experiment"""
    logging.getLogger().setLevel(logging.DEBUG if debug else logging.INFO)
    bm25 = BM25()

    # Sets the working directory and the name of the xp
    with experiment(workdir, "bm25", port=port) as xp:
        # Index the collection
        xp.setenv("JAVA_HOME", os.environ["JAVA_HOME"])
        ds = prepare_dataset(dataset)
        documents = ds.documents
        index = IndexCollection(
            documents=documents,
            storePositions=True,
            storeDocvectors=True,
            storeContents=True,
            threads=CPU_COUNT,
        ).submit()

        # Search with BM25
        bm25_retriever = AnseriniRetriever(
            k=1500, index=index, model=bm25).tag("model", "bm25")
        bm25_eval = Evaluate(dataset=ds, retriever=bm25_retriever).submit()

        print("BM25 results on TREC 1")
        print(bm25_eval.results.read_text())
def get_dataloaders(batch_size, word2id):
    # Load the IMDB dataset
    ds = prepare_dataset("edu.stanford.aclimdb")

    # ds.train.classes and ds.test.classes are dicts ('class', 'path-to-files')
    dev_ds = FolderText(ds.train.classes, word2id, load=False)
    test_ds = FolderText(ds.test.classes, word2id, load=False)

    # Partition the development set into train and validation sets
    train_len = int(len(dev_ds) * 0.9)
    val_len = len(dev_ds) - train_len
    train_ds, val_ds = torch.utils.data.random_split(dev_ds,
                                                     [train_len, val_len])

    kwargs = dict(collate_fn=FolderText.collate,
                  pin_memory=torch.cuda.is_available(),
                  num_workers=torch.multiprocessing.cpu_count())
    train_loader = DataLoader(train_ds, batch_size=batch_size,
                              shuffle=True, **kwargs)
    val_loader = DataLoader(val_ds, batch_size=batch_size,
                            shuffle=True, **kwargs)
    test_loader = DataLoader(test_ds, batch_size=batch_size,
                             shuffle=True, **kwargs)
    return train_loader, val_loader, test_loader
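# Usage sketch (illustrative): word2id would come from an embedding loader
# such as get_glove_embeddings below.
train_loader, val_loader, test_loader = get_dataloaders(batch_size=64,
                                                        word2id=word2id)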
def msmarco_train_triplets(info: Information):
    """Use MS-Marco triplets"""
    info.train_sampler = TripletBasedSampler(
        source=prepare_dataset(
            "com.microsoft.msmarco.passage.train.idtriples"),
        index=info.index(_msmarco_docs()),
    )
def get_glove_embeddings():
    word2id, embeddings = prepare_dataset('edu.stanford.glove.6b.50').load()

    # Add a null embedding for padding and give it id 0 so that zero-padding
    # works. Since id 0 is already taken, shift all existing word ids by one.
    for word in word2id:
        word2id[word] += 1
    word2id['<pad>'] = 0
    embeddings = np.insert(embeddings, 0, values=0, axis=0)

    # Add an OOV embedding: use the mean of all embeddings
    OOV_ID = len(word2id)
    word2id['<oov>'] = OOV_ID
    embeddings = np.insert(embeddings, OOV_ID, embeddings.mean(0), axis=0)
    return word2id, embeddings
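# Usage sketch (illustrative): ids 0 and len(word2id) - 1 are reserved for
# padding and out-of-vocabulary words respectively.
word2id, embeddings = get_glove_embeddings()
assert word2id['<pad>'] == 0
print(embeddings.shape)  # (400k GloVe vectors + <pad> + <oov>, 50)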
def __init__(self, train=True) -> None:
    super().__init__()
    self.train = train
    ds = prepare_dataset("com.lecun.mnist")
    if self.train:
        train_x = ds.files["train/images"].data()
        train_y = ds.files["train/labels"].data()
    else:
        train_x = ds.files["test/images"].data()
        train_y = ds.files["test/labels"].data()

    # Flatten the 28x28 images into vectors and normalize pixels to [0, 1]
    train_x = np.reshape(
        train_x, (train_x.shape[0], train_x.shape[1] * train_x.shape[2])) / 255
    self.train_x = train_x
    self.train_y = train_y
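# Usage sketch (illustrative; the class name is not visible in this snippet,
# `MnistDataset` below is a hypothetical stand-in):
# train_ds = MnistDataset(train=True)
# train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)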
def robust(info: Information, top_k: int):
    """Use the TREC Robust dataset"""
    from xpmir.datasets.robust import fold

    documents = prepare_dataset("gov.nist.trec.adhoc.robust.2004").documents

    # Returns a topics/qrels pair wrapped in an Adhoc dataset
    def get(p: str):
        topics, qrels = fold(p)
        return Adhoc(topics=topics, assessments=qrels, documents=documents)

    info.train_sampler = ModelBasedSampler(
        retriever=AnseriniRetriever(k=top_k, index=info.index(documents),
                                    model=info.basemodel),
        dataset=get("trf1"),
    )
    info.dev = get("trf1")
    info.test = get("f1")
def cli(vocab_size: int):
    # Create the dataset and the model
    ds = prepare_dataset("com.sentiment140")

    # Create the vocabulary
    wpmodel = Path("wp{}.model".format(vocab_size))
    if not wpmodel.is_file():
        logging.info("Did not find the wordpiece model %s", wpmodel)
        TRAINPATH = Path("/tmp/sentiment140-train.txt")
        cleanup(ds.files["train"], TRAINPATH)
        program = (f"import sentencepiece as spm; "
                   f"spm.SentencePieceTrainer.Train('--model_prefix=wp{vocab_size} "
                   f"--vocab_size={vocab_size} --input={TRAINPATH}')")
        subprocess.run([sys.executable, "-c", program])
        TRAINPATH.unlink()

    # Create the datasets
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(f"wp{vocab_size}.model")
    CLASSMAP = {0: 0, 4: 1}
    test = generatedata("test", tokenizer, vocab_size, ds, CLASSMAP)
    train = generatedata("train", tokenizer, vocab_size, ds, CLASSMAP)
def prepare():
    """Index the MS-Marco collection"""
    # Get the collection and index it
    collection = prepare_dataset(
        "com.microsoft.msmarco.passage.collection")
    docstore = BuildDocStore(collection=collection).submit()
    index = Reindex(collection=collection, stemmer="none").submit()
    index_stem = Reindex(collection=collection, stemmer="porter").submit()

    def get(subset: str):
        topics = prepare_dataset(
            f"com.microsoft.msmarco.passage.{subset}.queries")
        qrels = prepare_dataset(
            f"com.microsoft.msmarco.passage.{subset}.qrels")
        assessed_topics = TrecAssessedTopics(topics=topics, assessments=qrels)
        return MsmarcoDataset(docstore=docstore, index=index,
                              index_stem=index_stem,
                              assessed_topics=assessed_topics)

    return get
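# Usage sketch (illustrative): prepare() runs the indexing tasks once and
# returns a closure; individual subsets are then cheap to build.
get = prepare()
train = get("train")
dev = get("dev")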
def cli(vocab_size: int):
    # Create the dataset and the model
    ds = prepare_dataset("com.sentiment140.english")

    # Create the vocabulary
    wpmodel = Path("wp{}.model".format(vocab_size))
    if not wpmodel.is_file():
        logging.info("Did not find the wordpiece model %s", wpmodel)
        TRAINPATH = Path("sentiment140-train.txt")
        cleanup(ds.train.path, TRAINPATH)
        logging.info("Building the vocabulary with sentencepiece")
        spm.SentencePieceTrainer.train(input=str(TRAINPATH),
                                       model_prefix=f"wp{vocab_size}",
                                       vocab_size=vocab_size)
        TRAINPATH.unlink()

    # Create the datasets
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(f"wp{vocab_size}.model")
    CLASSMAP = {0: 0, 4: 1}
    logging.info("Processing train/test (Sentiment 140)")
    generatedata("test", tokenizer, vocab_size, ds.test, CLASSMAP)
    generatedata("train", tokenizer, vocab_size, ds.train, CLASSMAP)
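# Usage sketch (illustrative): once trained, the wordpiece model tokenizes
# raw text into subword ids (standard sentencepiece API).
sp = spm.SentencePieceProcessor()
sp.Load("wp1000.model")  # assuming vocab_size=1000 was used above
ids = sp.EncodeAsIds("this movie was great")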
import re
from pathlib import Path
from torch.utils.data import Dataset
from datamaestro import prepare_dataset

EMBEDDING_SIZE = 50

ds = prepare_dataset("edu.stanford.aclimdb")
word2id, embeddings = prepare_dataset(
    'edu.stanford.glove.6b.%d' % EMBEDDING_SIZE).load()


class FolderText(Dataset):
    """A dataset reading labelled text files grouped in class folders"""

    def __init__(self, classes, tokenizer, load=False):
        self.tokenizer = tokenizer
        self.files = []
        self.filelabels = []
        self.labels = list(classes.keys())
        for label, folder in classes.items():
            for file in folder.glob("*.txt"):
                self.files.append(file)
                self.filelabels.append(label)

    def __len__(self):
        return len(self.filelabels)

    def __getitem__(self, ix):
        return self.tokenizer(self.files[ix].read_text()), self.filelabels[ix]


WORDS = re.compile(r"\S+")


def tokenizer(t):
    return re.findall(WORDS, t.lower())
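# Usage sketch (illustrative): with the whitespace tokenizer above,
# FolderText yields (token list, label) pairs.
train_data = FolderText(ds.train.classes, tokenizer, load=False)
tokens, label = train_data[0]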
from datamaestro import prepare_dataset

ds = prepare_dataset("org.universaldependencies.french.gsd")
print(ds)
    def reparametrization(self, mu, sigma):
        # Reparametrization trick: z = mu + eps * sigma with eps ~ N(0, I)
        eps = torch.randn(mu.size())
        z = mu + eps * sigma
        return z

    def forward(self, x):
        mu, sigma = self.encode(x)
        z = self.reparametrization(mu, sigma)
        y = self.decode(z)
        return y, mu, sigma


if __name__ == '__main__':
    ds = prepare_dataset("com.lecun.mnist")
    train_images, train_labels = ds.train.images.data(), ds.train.labels.data()
    test_images, test_labels = ds.test.images.data(), ds.test.labels.data()
    dataset_train = Mnist_dataset(train_images, train_labels)
    dataset_test = Mnist_dataset(test_images, test_labels)

    batch_size = 65
    train_loader = DataLoader(dataset_train, shuffle=True, batch_size=batch_size)
    test_loader = DataLoader(dataset_test, shuffle=True, batch_size=batch_size)

    writer = SummaryWriter()
    savepath = "save_net/auto_encoder.model"
def _msmarco_docs():
    return prepare_dataset("com.microsoft.msmarco.passage.collection")
def glove(info):
    from xpmir.vocab.wordvec_vocab import WordvecUnkVocab

    wordembs = prepare_dataset("edu.stanford.glove.6b.50")
    return WordvecUnkVocab(data=wordembs, random=info.random)
class MSE(torch.autograd.Function):
    @staticmethod
    def forward(ctx, y, target):
        ctx.save_for_backward(y, target)
        return torch.sum(torch.pow(y - target, 2))

    @staticmethod
    def backward(ctx, grad_outputs):
        y, target = ctx.saved_tensors
        y_grad = (2 * y - 2 * target) * grad_outputs
        return y_grad, None


## To use the function
## Download the Boston dataset
ds = prepare_dataset("edu.uci.boston")
fields, data = ds.files.data()
n = data.shape[0]

regling = linear1()
mse = MSE()

learning_rate = 0.01

# Parameters
w = torch.rand(13, requires_grad=True, dtype=torch.double)
b = torch.rand(1, requires_grad=True, dtype=torch.double)

writer = SummaryWriter()
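# Usage sketch (illustrative): custom autograd Functions are applied through
# .apply rather than by calling an instance.
y = torch.randn(10, dtype=torch.double, requires_grad=True)
target = torch.randn(10, dtype=torch.double)
loss = MSE.apply(y, target)
loss.backward()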
                            **kwargs)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=True,
                             **kwargs)
    return train_loader, val_loader, test_loader


if __name__ == '__main__':
    EMBEDDING_SIZE = 50  # size of GloVe vectors

    # Load the IMDB dataset
    print('Loading IMDB dataset..')
    ds = prepare_dataset("edu.stanford.aclimdb")

    # Load pretrained GloVe word embeddings (400k trained vectors)
    print('Loading GloVe embeddings..')
    word2id, embeddings = get_glove_embeddings()
    print('Vocab from GloVe: size {}, head: {}'.format(
        len(word2id), [(i, w) for w, i in word2id.items()][:40]))
    print('Embeddings matrix size:', type(embeddings), embeddings.shape)

    # ds.train.classes and ds.test.classes are dicts ('class', 'path-to-files')
    train_dataset = FolderText(ds.train.classes, word2id, load=False)
    test_dataset = FolderText(ds.test.classes, word2id, load=False)

    print('IMDB dataset:')
    print('Number of training samples:', len(train_dataset))