def get_candidate_documents(model, batch_size, dev_set=None, k_best=5, max_doc_len=-1, num_padding_tokens=0):
    """Get a list of candidate documents for each pattern, from which the best match will be selected."""
    num_patterns = model.total_num_patterns
    selected_documents = [[] for _ in range(num_patterns)]

    for batch in chunked(dev_set, batch_size):
        batch_obj = Batch([x[0][0] for x in batch], model.embeddings, model.to_cuda, 0, max_doc_len)
        _, scores = model.forward(batch_obj, 1, 0)
        scores = scores.data

        # Add a small random epsilon so no two documents end up with exactly the same score.
        epsilon = (torch.rand(scores.size()) - 0.5) / 10000
        scores += model.to_cuda(epsilon)

        for i in range(num_patterns):
            for j in range(batch_obj.size()):
                tup = (scores[j, i], (batch[j][0][1], batch[j][0], batch[j][1]))
                # Min-heap of size k_best: push, then pop the smallest once the heap overflows.
                heapq.heappush(selected_documents[i], tup)
                if len(selected_documents[i]) == (k_best + 1):
                    heapq.heappop(selected_documents[i])

    return selected_documents
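A minimal standalone sketch (not taken from the code above; all names are illustrative) of the bounded min-heap trick that get_candidate_documents relies on to keep only the k best-scoring items per pattern:

import heapq

def top_k(scored_items, k=5):
    """Keep the k highest-scoring (score, item) pairs using a min-heap of size k."""
    heap = []
    for score, item in scored_items:
        heapq.heappush(heap, (score, item))
        if len(heap) > k:
            # The smallest score sits at the heap root, so popping discards the current worst item.
            heapq.heappop(heap)
    return sorted(heap, reverse=True)

# Example: keep the three best of five scored documents.
print(top_k([(0.1, "a"), (0.9, "b"), (0.4, "c"), (0.7, "d"), (0.2, "e")], k=3))
# [(0.9, 'b'), (0.7, 'd'), (0.4, 'c')]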
def test_can_use_iterator_node(self):
    iterable = chunked(StringIO(data_source['mary']), chunksize=3)

    class D(BaseModel, self.Settings):
        stream = Feature(IteratorNode, store=True)
        words = Feature(Tokenizer, needs=stream, store=False)
        count = JSONFeature(WordCount, needs=words, store=True)

    _id = D.process(stream=iterable)
    doc = D(_id)
    self.assertEqual(data_source['mary'], doc.stream.read())
def _process(self, data):
    try:
        flo = StringIO(data_source[data])
    except KeyError as e:
        if isinstance(data, str):
            flo = StringIO(data)
        else:
            raise e

    for chunk in chunked(flo, chunksize=self._chunksize):
        yield chunk
def __iter__(self, flo):
    metadata, _ = NumpyMetaData.unpack(flo)
    example_size = metadata.totalsize
    chunk_size = int(example_size * self.n_examples)

    count = 0
    for chunk in chunked(flo, chunk_size):
        n_examples = len(chunk) // example_size
        yield _np_from_buffer(
            chunk, (n_examples,) + metadata.shape, metadata.dtype)
        count += 1

    # If the stream held no examples, yield a single zero-length array.
    # Note: buffer() exists only in Python 2.
    if count == 0:
        yield _np_from_buffer(
            buffer(''), (0,) + metadata.shape, metadata.dtype)
def create_placement_groups(job_id, node_list, partition_name):
    PLACEMENT_MAX_CNT = 150
    groups = {
        f"{cfg.slurm_cluster_name}-{partition_name}-{job_id}-{i}": nodes
        for i, nodes in enumerate(chunked(node_list, n=PLACEMENT_MAX_CNT))
    }
    reverse_groups = {node: group for group, nodes in groups.items() for node in nodes}

    model = next(iter(node_list))
    region = lkp.node_region(model)

    requests = {
        group: create_placement_request(group, region)
        for group, incl_nodes in groups.items()
    }
    done, failed = batch_execute(requests)
    if failed:
        reqs = [f"{e}" for _, e in failed.values()]
        log.fatal("failed to create placement policies: {}".format("\n".join(reqs)))
    log.info(f"created {len(done)} placement groups ({','.join(done.keys())})")
    return reverse_groups
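create_placement_groups (and resume_nodes further below) call chunked(iterable, n=...) to split a node list into bounded batches. A hypothetical stand-in with that behaviour, in the spirit of more_itertools.chunked, could look like the following; the name and signature are assumptions, not the project's actual helper:

from itertools import islice

def chunked(iterable, n):
    """Yield successive lists of at most n items from iterable (assumed stand-in)."""
    it = iter(iterable)
    while True:
        block = list(islice(it, n))
        if not block:
            return
        yield block

# Example: ten node names split into groups of four.
print(list(chunked([f"node-{i}" for i in range(10)], n=4)))
# [['node-0', 'node-1', 'node-2', 'node-3'],
#  ['node-4', 'node-5', 'node-6', 'node-7'],
#  ['node-8', 'node-9']]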
def interpret_documents(model, batch_size, dev_data, dev_text, ofile, max_doc_len):
    j = 0
    with open(ofile, "w") as ofh:
        for batch_idx, chunk in enumerate(chunked(dev_data, batch_size)):
            batch = Batch([x for x, y in chunk], model.embeddings, model.to_cuda)
            res, scores = model.forward(batch, 1)
            output = softmax(res).data
            predictions = [int(x) for x in argmax(output)]

            num_patts = scores.size()[1]
            diffs = np.zeros((num_patts, batch.size()))

            # Traverse all patterns.
            for i in range(num_patts):
                # Copy the scores to a numpy array.
                scores_data = np.array(scores.data.numpy(), copy=True)
                # Zero out pattern number i across the batch.
                scores_data[:, i] = 0
                # Run mlp.forward() with the zeroed-out scores.
                forwarded = softmax(
                    model.mlp.forward(Variable(
                        torch.FloatTensor(scores_data)))).data.numpy()
                # Record the difference between the ablated and the original output.
                for k in range(batch.size()):
                    diffs[i, k] = forwarded[k, 1 - predictions[k]] - output[k, 1 - predictions[k]]

            # Now traverse the documents in the batch.
            for i in range(batch.size()):
                # Document string.
                text_str = str(" ".join(dev_text[j]).encode('utf-8'))[2:-1]

                # Top ten patterns with the largest differences between leave-one-out and original score.
                top_ten_deltas = sorted(enumerate(diffs[:, i]), key=lambda x: x[1], reverse=True)[:10]
                top_ten_neg_deltas = sorted(enumerate(diffs[:, i]), key=lambda x: x[1])[:10]

                # Top ten patterns with the largest overall score (regardless of classification).
                top_ten_scores = sorted(enumerate(scores.data.numpy()[i, :]), key=lambda x: x[1], reverse=True)[:10]

                top_scoring_spans = get_top_scoring_spans_for_doc(model, dev_data[j], max_doc_len)

                # Print out everything.
                ofh.write(
                    "{} {} {} All in, predicted: {:>2,.3f} All in, not-predicted: {:>2,.3f} "
                    "Leave one out: +res: {} -res: {} Patt scores: {}\n".format(
                        dev_data[j][1],
                        predictions[i],
                        text_str,
                        output[i, predictions[i]],
                        output[i, 1 - predictions[i]],
                        " ".join(["{}:{:>2,.3f}".format(p, x) for (p, x) in top_ten_deltas]),
                        " ".join(["{}:{:>2,.3f}".format(p, x) for (p, x) in top_ten_neg_deltas]),
                        " ".join(["{}:{:>2,.3f}".format(p, x) for (p, x) in top_ten_scores])))

                ofh.write("Top ten deltas:\n")
                for l in top_ten_deltas:
                    s = top_scoring_spans[l[0]].display(dev_text[j])
                    ofh.write(str(int(l[0])) + " " + str(s.encode('utf-8'))[2:-1] + "\n")

                ofh.write("Top ten negative deltas:\n")
                for l in top_ten_neg_deltas:
                    s = top_scoring_spans[l[0]].display(dev_text[j])
                    ofh.write(str(int(l[0])) + " " + str(s.encode('utf-8'))[2:-1] + "\n")

                j += 1
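A framework-free toy sketch of the leave-one-out attribution idea used in interpret_documents: zero out one pattern score at a time, re-run the final classifier, and measure how much the probability of the non-predicted class moves. All names and the linear classifier here are illustrative assumptions, not the model above:

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def leave_one_out_deltas(scores, weights, bias):
    """scores: (num_features,), weights: (num_features, 2), bias: (2,) for a toy 2-class linear classifier."""
    base = softmax(scores @ weights + bias)
    pred = int(base.argmax())
    deltas = np.zeros(len(scores))
    for i in range(len(scores)):
        ablated = scores.copy()
        ablated[i] = 0.0          # remove the contribution of feature i
        out = softmax(ablated @ weights + bias)
        # How much closer the ablated model is to the non-predicted class.
        deltas[i] = out[1 - pred] - base[1 - pred]
    return deltas

rng = np.random.default_rng(0)
print(leave_one_out_deltas(rng.normal(size=4), rng.normal(size=(4, 2)), np.zeros(2)))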
def _generator(self, stream, content_length):
    if not content_length:
        raise ValueError('content_length should be greater than zero')
    for chunk in chunked(stream, chunksize=self._chunksize):
        yield StringWithTotalLength(chunk, content_length)
def __iter__(self, flo):
    self._total_length = struct.unpack('I', flo.read(4))[0]
    for chunk in chunked(flo, self._chunksize):
        yield StringWithTotalLength(chunk, self._total_length)
def __iter__(self, flo):
    decompressor = bz2.BZ2Decompressor()
    for chunk in chunked(flo):
        yield decompressor.decompress(chunk)
def __iter__(self, flo):
    for chunk in chunked(flo):
        yield chunk
def _process(self, data):
    flo = StringIO(data_source['mary'])
    for chunk in chunked(flo, chunksize=3):
        yield chunk
def _process(self, data):
    flo = StringIO(data)
    for chunk in chunked(flo, chunksize=self._chunksize):
        yield chunk
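The stream-oriented snippets above all read a file-like object through chunked(flo, chunksize=...). A minimal stand-in that matches those call sites (an assumption, not the library's actual implementation) could be:

def chunked(flo, chunksize=4096):
    """Read a file-like object in fixed-size pieces until it is exhausted (assumed stand-in)."""
    while True:
        piece = flo.read(chunksize)
        if not piece:
            break
        yield piece

# Example: iterate over an in-memory stream three characters at a time.
from io import StringIO
print(list(chunked(StringIO("mary had a little lamb"), chunksize=3)))
# ['mar', 'y h', 'ad ', 'a l', 'itt', 'le ', 'lam', 'b']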
def resume_nodes(nodelist, placement_groups=None, exclusive=False):
    """resume nodes in nodelist"""

    def ident_key(n):
        # ident here refers to the combination of partition and group
        return "-".join(
            (
                lkp.node_partition_name(n),
                lkp.node_group_name(n),
            )
        )

    # support already expanded list
    nodes = nodelist
    if isinstance(nodes, str):
        nodelist = expand_nodelist(nodelist)
    nodes = sorted(nodelist, key=ident_key)

    if len(nodes) == 0:
        return

    grouped_nodes = {
        ident: chunk
        for ident, nodes in groupby(nodes, ident_key)
        for chunk in chunked(nodes, n=BULK_INSERT_LIMIT)
    }
    log.debug(f"grouped_nodes: {grouped_nodes}")

    # make all bulkInsert requests and execute with batch
    inserts = {
        ident: create_instances_request(nodes, placement_groups, exclusive)
        for ident, nodes in grouped_nodes.items()
    }
    started, failed = batch_execute(inserts)
    if failed:
        failed_reqs = [f"{e}" for _, (_, e) in failed.items()]
        log.error("bulkInsert API failures: {}".format("\n".join(failed_reqs)))
        for ident, (_, exc) in failed.items():
            down_nodes(grouped_nodes[ident], exc._get_reason())

    # wait for all bulkInserts to complete and log any errors
    bulk_operations = [wait_for_operation(op) for op in started.values()]
    for bulk_op in bulk_operations:
        if "error" in bulk_op:
            error = bulk_op["error"]["errors"][0]
            log.error(
                f"bulkInsert operation error: {error['code']} operationName:'{bulk_op['name']}'"
            )

    # Fetch all insert operations from all bulkInserts. Group by error code and log
    successful_inserts, failed_inserts = separate(
        lambda op: "error" in op, get_insert_operations(bulk_operations)
    )

    # Apparently multiple errors are possible... so join with +.
    # grouped_inserts could be made into a dict, but it's not really necessary. Save some memory.
    grouped_inserts = util.groupby_unsorted(
        failed_inserts,
        lambda op: "+".join(err["code"] for err in op["error"]["errors"]),
    )

    for code, failed_ops in grouped_inserts:
        # at least one insert failure
        failed_nodes = [parse_self_link(op["targetLink"]).instance for op in failed_ops]
        hostlist = util.to_hostlist(failed_nodes)
        count = len(failed_nodes)
        log.error(
            f"{count} instances failed to start due to insert operation error: {code} ({hostlist})"
        )
        down_nodes(hostlist, code)
        if log.isEnabledFor(logging.DEBUG):
            msg = "\n".join(
                err["message"] for err in next(failed_ops)["error"]["errors"]
            )
            log.debug(f"{code} message from first node: {msg}")

    # If reconfigure enabled, create subscriptions for successfully started instances
    if lkp.cfg.enable_reconfigure and len(successful_inserts):
        started_nodes = [
            parse_self_link(op["targetLink"]).instance for op in successful_inserts
        ]
        count = len(started_nodes)
        hostlist = util.to_hostlist(started_nodes)
        log.info("create {} subscriptions ({})".format(count, hostlist))
        execute_with_futures(subscription_create, nodes)
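A small illustration of the grouping-plus-chunking idiom that resume_nodes uses to build its bulkInsert batches: sort the nodes by key, group them with itertools.groupby, and split each group into bounded batches. The data and key function are made up, and more_itertools.chunked stands in for the helper used above; indexing the batch key is one way to avoid collisions when a single group spans several batches.

from itertools import groupby
from more_itertools import chunked  # assumed equivalent of the chunked() used above

nodes = ["a-1", "a-2", "a-3", "b-1", "b-2"]

def ident_key(n):
    return n.split("-")[0]

batches = {
    f"{ident}:{i}": chunk
    for ident, group in groupby(sorted(nodes, key=ident_key), ident_key)
    for i, chunk in enumerate(chunked(group, 2))
}
print(batches)
# {'a:0': ['a-1', 'a-2'], 'a:1': ['a-3'], 'b:0': ['b-1', 'b-2']}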