def test_chunks_from_iter_partial_chunks_allowed(self):
    self.assertEqual(
        list(chunks_from_iter(iter(range(10)), 3)),
        [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]])
    self.assertEqual(
        list(chunks_from_iter(iter(range(5)), 5)), [[0, 1, 2, 3, 4]])
    self.assertEqual(
        list(chunks_from_iter(iter(range(5)), 1)),
        [[0], [1], [2], [3], [4]])
    self.assertEqual(list(chunks_from_iter([], 2)), [])

def test_chunks_from_iter_full_chunks_only(self):
    self.assertEqual(
        list(chunks_from_iter(iter(range(10)), 3, full_chunks_only=True)),
        [[0, 1, 2], [3, 4, 5], [6, 7, 8]])
    self.assertEqual(
        list(chunks_from_iter(iter(range(5)), 5, full_chunks_only=True)),
        [[0, 1, 2, 3, 4]])
    self.assertEqual(
        list(chunks_from_iter(iter(range(5)), 1, full_chunks_only=True)),
        [[0], [1], [2], [3], [4]])
    self.assertEqual(list(chunks_from_iter([], 2, full_chunks_only=True)), [])

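
# The chunks_from_iter implementation under test is not shown in this section. A minimal
# generator sketch that satisfies both tests above (trailing partial chunks are yielded by
# default and dropped when full_chunks_only=True) could look like this:
from itertools import islice


def chunks_from_iter(iterable, n, full_chunks_only=False):
    """Yield successive lists of up to ``n`` items drawn from ``iterable``."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        if full_chunks_only and len(chunk) < n:
            return
        yield chunk
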
def _get_sample(file_path, sample_proportion=0.02):
    records = []
    chunks_processed = 0
    with open(file_path, "r", encoding="windows-1252") as f:
        for chunk in chunks_from_iter(f.readlines(), CHUNK_SIZE):
            records.extend(
                random.sample(chunk, int(len(chunk) * sample_proportion)))
            chunks_processed += 1
            if chunks_processed % 100 == 0:
                print(f"Processed {chunks_processed} chunks")
    return csv_records_to_dicts(records)

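
# csv_records_to_dicts is referenced above but not defined in this section. A plausible
# sketch, assuming each line is one CSV record and using purely illustrative PAF-style
# field names (the real column names are not shown in the source):
import csv

PAF_FIELDS = ["building_number", "thoroughfare", "post_town", "postcode"]  # assumed names


def csv_records_to_dicts(lines):
    """Parse raw CSV lines into dicts keyed by the assumed field names above."""
    return list(csv.DictReader(lines, fieldnames=PAF_FIELDS))
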
def main(test_file, model_path):
    model = _load_model(model_path)
    model.eval()
    print(f"Running model on address file in batches of size {CHUNK_SIZE}")
    results = []
    chunks = 0
    with open(test_file, "r") as f:
        for chunk in chunks_from_iter(f.readlines(), CHUNK_SIZE):
            chunk_res = []
            for add in chunk:
                add = add.strip()
                parsed_add = parse_raw_address(add, model)
                parsed_add["input_address"] = add
                chunk_res.append(parsed_add)
            results.extend(chunk_res)
            chunks += 1
            if chunks % 10 == 0:
                print(f"Processed {chunks} chunks ({len(results)} records)")

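
# _load_model is not shown in this section. Since the training script below persists the
# whole model object with torch.save(model, ...), a minimal sketch could simply be:
import torch


def _load_model(model_path):
    # map_location="cpu" is an assumption so inference also works on a machine without a GPU
    return torch.load(model_path, map_location="cpu")
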
def main(paf_sample_file, model_output_path):
    """
    We have a structured address dataset that we can use to automatically
    construct a training set for address parsing.
    """
    records = []
    print("Loading address data")
    with open(paf_sample_file, "r") as f:
        for chunk in chunks_from_iter(f.readlines(), CHUNK_SIZE):
            records += csv_records_to_dicts(chunk)

    # List of (encoded address, address char labels) pairs. Has to be a materialised
    # list as this collection will be iterated over multiple times.
    preprocessed_adds = list(
        preprocess_addresses(records, seq_length=SEQ_LENGTH))

    print(f"Starting training with {len(records)} address records "
          f"and a batch size of {BATCH_SIZE}")
    model = train(preprocessed_adds)
    print("Training complete")

    print("Saving model..")
    torch.save(model, model_output_path)
    print("Done!")

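
# torch.save(model, ...) above pickles the whole AddressRNN module by reference, so loading
# it later requires the class to be importable under the same module path. A hedged sketch
# of the common state_dict alternative (not what the script above does):
def save_and_reload_weights_sketch(model, weights_path):
    torch.save(model.state_dict(), weights_path)
    reloaded = AddressRNN(vocab=VOCAB, lstm_dim=LSTM_DIM, lstm_layers=LSTM_LAYERS,
                          output_dim=OUTPUT_DIM, seq_length=SEQ_LENGTH,
                          train_on_gpu=False, batch_first=True)
    reloaded.load_state_dict(torch.load(weights_path))
    return reloaded
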
def main(paf_sample_file, output_file):
    """
    We have a structured address dataset that we can use to automatically
    construct a training set for address parsing.
    """
    preprocessed = []
    chunks = 0
    with open(paf_sample_file, "r") as f:
        print(f"Processing data in chunks of size {CHUNK_SIZE}")
        for chunk in chunks_from_iter(f.readlines(), CHUNK_SIZE):
            address_dicts = csv_records_to_dicts(chunk)
            preprocessed += list(
                preprocess_addresses(address_dicts, seq_length=100))
            chunks += 1
            if chunks % 100 == 0:
                print(f"Processed {chunks} chunks")

    print(f"Writing output to {output_file}")
    with open(output_file, "wb") as f_out:
        output = {
            "address_features": [t[0] for t in preprocessed],
            "address_labels": [t[1] for t in preprocessed]
        }
        pickle.dump(output, f_out)

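
# A small sketch of how the pickled output written above could be read back by a downstream
# script; the default path here is illustrative only.
def load_preprocessed_sketch(path="preprocessed_addresses.pkl"):
    with open(path, "rb") as f_in:
        data = pickle.load(f_in)
    # Keys match the dict written in the function above.
    return data["address_features"], data["address_labels"]
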
def train(preprocessed_records):
    """
    :param preprocessed_records: List of pairs of (encoded_address, address_char_classes)
    """
    train_on_gpu = torch.cuda.is_available()
    model = AddressRNN(vocab=VOCAB,
                       lstm_dim=LSTM_DIM,
                       lstm_layers=LSTM_LAYERS,
                       output_dim=OUTPUT_DIM,
                       seq_length=SEQ_LENGTH,
                       train_on_gpu=train_on_gpu,
                       batch_first=True)
    print("Model architecture:")
    print(model)

    hidden = model.init_hidden(BATCH_SIZE)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LR)

    if train_on_gpu:
        model.cuda()

    model.train()
    print("Starting training")
    for e in range(EPOCHS):
        batches = 0
        loss = None
        accs = []
        for batch in chunks_from_iter(preprocessed_records,
                                      n=BATCH_SIZE,
                                      full_chunks_only=True):
            X = torch.from_numpy(np.array([t[0] for t in batch]))
            y = torch.from_numpy(np.array([t[1] for t in batch]))
            if train_on_gpu:
                X = X.cuda()
                y = y.cuda()

            # Avoid backpropagating through the entire training history
            hidden = tuple([h.data for h in hidden])

            optimizer.zero_grad()
            out, hidden = model(X, hidden)
            """
            Align y for computing CrossEntropyLoss. We know that the shape of `out` is
            (BATCH_SIZE * SEQ_LENGTH, OUTPUT_DIM), i.e. a logit score per output class per
            character in the batch, so we reshape `y` into a vector tensor of dim
            BATCH_SIZE * SEQ_LENGTH -> the true class label per character in the batch.

            This is similar to the example below:

            >>> target = torch.empty(3, dtype=torch.long).random_(5)
            >>> target
            tensor([2, 4, 4])
            >>> output = torch.Tensor([[0, 0, 20, 0, 0], [0, 0, 0, 0, 14], [0, 0, 0, 0, 25]])
            >>> output
            tensor([[ 0.,  0., 20.,  0.,  0.],
                    [ 0.,  0.,  0.,  0., 14.],
                    [ 0.,  0.,  0.,  0., 25.]])
            >>> loss = nn.CrossEntropyLoss()
            >>> loss(output, target).item()
            1.1126181789222755e-06

            where `target` is a vector tensor of dim 3 whose values are class labels between
            0 and 4, and `output` is a tensor of dim (3, 5) where each element of row i is
            the score for one of the 5 classes. The small loss value shows that the cross
            entropy loss behaves as expected here, since the output has high scores for the
            correct class labels at the corresponding indexes and zeros elsewhere.
            """
            y_reshaped = y.reshape(BATCH_SIZE * SEQ_LENGTH)
            loss = criterion(out, y_reshaped)
            loss.backward()

            acc = accuracy(out, y_reshaped)
            accs.append(acc)

            # To avoid exploding gradients
            nn.utils.clip_grad_norm_(model.parameters(), CLIP)
            optimizer.step()

            batches += 1
            if batches % 10 == 0:
                print(f"Finished training on {batches} batches in epoch {e}")
                print(f"Loss so far is {loss.item()}")
                print(f"Average accuracy is {round(np.average(accs) * 100)}%")
        print(f"Finished training for epoch {e}")
        print(f"Loss at end of epoch {e} is {loss.item()}")
    return model

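
# The accuracy helper used in the training loop is not defined in this section. A minimal
# sketch consistent with how it is called above, i.e. logits of shape
# (BATCH_SIZE * SEQ_LENGTH, OUTPUT_DIM) scored against a flat vector of true class labels:
def accuracy(out, targets):
    preds = out.argmax(dim=1)
    return (preds == targets).float().mean().item()
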