def check_coocc(self, output, copies_number=COPIES_NUMBER):
    coocc = Cooccurrences().load(models.COOCC)
    res = Cooccurrences().load(output)
    self.assertEqual(len(res.tokens), len(coocc.tokens))
    permutation = [coocc.tokens.index(token) for token in res.tokens]
    self.assertTrue(numpy.all(
        res.matrix.todense() ==
        copies_number * coocc.matrix.todense()[permutation][:, permutation]))
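# Toy illustration of the permutation check above (values are made up):
# indexing a dense matrix with the same vector on both axes reorders rows and
# columns consistently, so two co-occurrence matrices can be compared even when
# their token lists are in different orders.
import numpy as np

m = np.arange(9).reshape(3, 3)
perm = [2, 0, 1]
print(m[perm][:, perm])
# [[8 6 7]
#  [2 0 1]
#  [5 3 4]]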
def test_overflow_with_spark(self):
    with tempfile.TemporaryDirectory(prefix="merge-coocc-entry-test") as input_dir:
        self.copy_models(models.COOCC, input_dir, COPIES_NUMBER)
        args = get_args(input_dir, False)
        c_neg = Cooccurrences().load(args.input[0])
        c_neg.matrix.data[0] = MAX_INT32 - c_neg.matrix.data[0]
        c_neg.save(args.input[0])
        merge_coocc(args)
        result = Cooccurrences().load(args.output)
        self.assertTrue(numpy.all(result.matrix.data <= MAX_INT32))
        self.assertTrue(numpy.all(result.matrix.data >= 0))
def merge_coocc_no_spark(df, filepaths, log, args):
    """
    Algorithm explanation:

    1. Although we store the result in uint32, we never actually have elements
       greater than MAX_INT32.
    2. We assume that neither the result nor the summed matrix has elements
       greater than MAX_INT32.
    3. As soon as a value exceeds MAX_INT32 after summing, we saturate it.
    4. Thus we lose half of the data range, but we do not allocate any
       additional memory and it works faster than MAX_UINT32 checks.
    5. Only ? elements saturate in PGA, so this is fine.
    """
    # TODO(zurk): recheck the number of saturated elements.
    log.info("Merging cooccurrences without using PySpark")
    shape = (len(df) + 1,) * 2
    result = coo_matrix(shape, dtype=np.uint32)
    for path, coocc in load_and_check(filepaths, log):
        coocc._matrix = coo_matrix(coocc._matrix)
        # Map each model's token indices to the global df order; unknown tokens
        # go to the extra last row/column, which is dropped before saving.
        index = [df.order.get(x, len(df)) for x in coocc.tokens]
        rows = [index[x] for x in coocc.matrix.row]
        cols = [index[x] for x in coocc.matrix.col]
        result += coo_matrix((coocc.matrix.data, (rows, cols)),
                             shape=shape, dtype=np.uint32)
    indx_overflow = np.where(result.data > MAX_INT32)
    if indx_overflow[0].size > 0:
        log.warning("Overflow in %d elements. They will be saturated to MAX_INT32",
                    indx_overflow[0].size)
        result.data[indx_overflow] = MAX_INT32
    Cooccurrences() \
        .construct(df.tokens(), result[:-1, :-1]) \
        .save(args.output, (df,))
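# A minimal standalone sketch of the saturation step above, for reference.
# MAX_INT32 is assumed to equal 2 ** 31 - 1, matching the module-level
# constant; the data values are illustrative.
import numpy as np

MAX_INT32 = 2 ** 31 - 1
data = np.array([10, MAX_INT32 + 5, 42], dtype=np.uint32)
indx_overflow = np.where(data > MAX_INT32)
if indx_overflow[0].size > 0:
    data[indx_overflow] = MAX_INT32  # saturate instead of wrapping around
print(data)  # [        10 2147483647         42]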
def test_load_and_check(self):
    with tempfile.TemporaryDirectory(prefix="merge-coocc-entry-test") as input_dir:
        self.copy_models(models.COOCC, input_dir, COPIES_NUMBER)
        args = get_args(input_dir, True)
        c_neg = Cooccurrences().load(args.input[0])
        c_neg.matrix.data[0] = -1
        c_neg.save(args.input[0])
        self.assertEqual(
            len(list(load_and_check(args.input, logging.getLogger("test")))), 2)
        c_neg = Cooccurrences().load(args.input[0])
        c_neg.matrix.data = numpy.uint32(c_neg.matrix.data)
        c_neg.matrix.data[0] = MAX_INT32 + 1
        c_neg.save(args.input[0])
        for _, coocc in load_and_check(args.input, logging.getLogger("test")):
            self.assertTrue(numpy.all(coocc.matrix.data <= MAX_INT32))
            break
def load_and_check(filepaths: list, log: logging.Logger):
    """
    Load Cooccurrences models from the `filepaths` list and perform two simple
    checks:

    1. If a model contains values greater than MAX_INT32, saturate them.
    2. If a model contains negative values, consider it corrupted, report it
       and skip it.
    """
    for path in progress_bar(filepaths, log):
        coocc = Cooccurrences().load(path)
        negative_values = np.where(coocc.matrix.data < 0)
        if negative_values[0].size > 0:
            log.warning("Model %s is corrupted and will be skipped. "
                        "It contains negative elements.", path)
            continue
        too_big_values = np.where(coocc.matrix.data > MAX_INT32)
        if too_big_values[0].size > 0:
            log.warning("Model %s contains elements with values greater than "
                        "MAX_INT32. They will be saturated to MAX_INT32.", path)
            coocc.matrix.data[too_big_values] = MAX_INT32
        yield path, coocc
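# A hypothetical usage sketch of load_and_check (the file names are made up):
# because corrupted models are skipped inside the generator, the loop body
# only ever sees matrices whose data lies within [0, MAX_INT32].
for path, coocc in load_and_check(["coocc_0.asdf", "coocc_1.asdf"],
                                  logging.getLogger("merge")):
    assert (coocc.matrix.data >= 0).all()
    assert (coocc.matrix.data <= MAX_INT32).all()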
def __call__(self, sparse_matrix: PipelinedRDD):
    """
    Save the Cooccurrences model to disk in ASDF format.

    :param sparse_matrix: RDD with 3 columns: matrix row, matrix column,
        cell value. Use :class:`.CooccConstructor` to construct the RDD
        from UASTs.
    :return: None
    """
    rows = sparse_matrix.collect()
    mat_row, mat_col, mat_weights = zip(*rows)
    tokens_num = len(self.tokens_list)
    self._log.info("Building matrix...")
    matrix = sparse.coo_matrix((mat_weights, (mat_row, mat_col)),
                               shape=(tokens_num, tokens_num))
    Cooccurrences() \
        .construct(self.tokens_list, matrix) \
        .save(self.output, deps=(self.df_model,))
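# For reference, a minimal standalone sketch of the collect-and-zip step above,
# with hand-written (row, column, value) triples in place of the RDD:
from scipy import sparse

rows = [(0, 1, 3), (1, 0, 3), (2, 2, 7)]  # as sparse_matrix.collect() returns
mat_row, mat_col, mat_weights = zip(*rows)
matrix = sparse.coo_matrix((mat_weights, (mat_row, mat_col)), shape=(3, 3))
print(matrix.todense())
# [[0 3 0]
#  [3 0 0]
#  [0 0 7]]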
def id2vec_preprocess(args):
    """
    Loads co-occurrence matrices for several repositories and generates the
    document frequencies and the Swivel protobuf dataset.

    :param args: :class:`argparse.Namespace` with "input", "vocabulary_size", \
        "shard_size", "df" and "output".
    :return: None
    """
    log = logging.getLogger("preproc")
    log.info("Loading docfreq model from %s", args.docfreq_in)
    df_model = DocumentFrequencies(log_level=args.log_level).load(source=args.docfreq_in)
    coocc_model = Cooccurrences().load(args.input)
    if numpy.any(coocc_model.matrix.data < 0):
        raise ValueError("Co-occurrence matrix %s contains negative elements. "
                         "Please check its correctness." % args.input)
    if numpy.any(numpy.isnan(coocc_model.matrix.data)):
        raise ValueError("Co-occurrence matrix %s contains NaN elements. "
                         "Please check its correctness." % args.input)
    try:
        df_meta = coocc_model.get_dep(DocumentFrequencies.NAME)
        if df_model.meta != df_meta:
            raise ValueError(
                "Document frequency model you provided does not match the dependency "
                "inside the Cooccurrences model:\nargs.docfreq.meta:\n%s\n"
                "coocc_model.get_dep(\"docfreq\")\n%s\n" % (df_model.meta, df_meta))
    except KeyError:
        pass  # there is no docfreq dependency
    vs = args.vocabulary_size
    if len(df_model) < vs:
        vs = len(df_model)
    sz = args.shard_size
    if vs < sz:
        raise ValueError(
            "vocabulary_size=%s is less than shard_size=%s. You should specify a "
            "smaller shard_size (e.g. shard_size=%s)." % (vs, sz, vs))
    vs -= vs % sz
    log.info("Effective vocabulary size: %d", vs)
    df_model = df_model.greatest(vs)
    log.info("Sorting the vocabulary...")
    chosen_words = sorted(df_model.tokens())
    word_indices = {w: i for i, w in enumerate(chosen_words)}
    os.makedirs(args.output, exist_ok=True)
    with open(os.path.join(args.output, "row_vocab.txt"), "w") as out:
        out.write("\n".join(chosen_words))
    log.info("Saved row_vocab.txt")
    shutil.copyfile(os.path.join(args.output, "row_vocab.txt"),
                    os.path.join(args.output, "col_vocab.txt"))
    log.info("Saved col_vocab.txt")
    del chosen_words
    ccmatrix = extract_coocc_matrix((vs, vs), word_indices, coocc_model)
    log.info("Planning the sharding...")
    bool_sums = ccmatrix.indptr[1:] - ccmatrix.indptr[:-1]
    reorder = numpy.argsort(-bool_sums)
    with open(os.path.join(args.output, "row_sums.txt"), "w") as out:
        out.write("\n".join(map(str, bool_sums.tolist())))
    log.info("Saved row_sums.txt")
    shutil.copyfile(os.path.join(args.output, "row_sums.txt"),
                    os.path.join(args.output, "col_sums.txt"))
    log.info("Saved col_sums.txt")
    log.info("Writing the shards...")
    nshards = vs // args.shard_size
    for row in progress_bar(range(nshards), log, expected_size=nshards):
        indices_row = reorder[row::nshards]
        for col in range(nshards):
            indices_col = reorder[col::nshards]
            shard = ccmatrix[indices_row][:, indices_col].tocoo()
            example = tf.train.Example(features=tf.train.Features(feature={
                "global_row": _int64s(indices_row),
                "global_col": _int64s(indices_col),
                "sparse_local_row": _int64s(shard.row),
                "sparse_local_col": _int64s(shard.col),
                "sparse_value": _floats(shard.data)}))
            with open(os.path.join(args.output,
                                   "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
                out.write(example.SerializeToString())
    log.info("Success")
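# A toy sketch of the round-robin shard indexing used above (sizes and counts
# are illustrative): reorder holds row indices sorted by density, and the
# strided slices partition them so that every shard mixes dense and sparse rows.
import numpy as np

nshards = 2
bool_sums = np.array([5, 1, 4, 2, 8, 3, 7, 6])  # made-up per-row counts
reorder = np.argsort(-bool_sums)                # densest rows first
for row in range(nshards):
    print(row, reorder[row::nshards])
# 0 [4 7 2 3]
# 1 [6 0 5 1]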
def setUp(self):
    self.model = Cooccurrences().load(source=paths.COOCC)