def convert_model(self, model: Snippet) -> BOW:
    """
    Convert a Snippet model into a BOW model with one matrix row per UAST.

    Each item of `model.uasts` is passed through `self._uasts2bow`; the keys of
    the returned bag are used as column coordinates and the values as the
    matrix entries of the corresponding row.

    :param model: Snippet model providing `uasts` and `names`.
    :return: BOW constructed from `model.names`, the sparse float32 matrix and \
             `self._tokens`, with `self._uasts2bow.docfreq` recorded as its dependency.
    """
    bags = [self._uasts2bow(uast) for uast in model.uasts]
    # Gather the (value, row, column) triplets in three flat lists. Transposing
    # with zip(*...) would produce an empty list when no bag has any entry,
    # which made the former data[0] lookup raise IndexError.
    values, rows, cols = [], [], []
    for row, bag in enumerate(bags):
        for col in bag:
            values.append(bag[col])
            rows.append(row)
            cols.append(col)
    matrix = csr_matrix(
        (values, (rows, cols)),
        shape=(len(bags), len(self._uasts2bow.vocabulary)),
        dtype=numpy.float32)
    bow = BOW(log_level=logging.WARNING)
    bow.construct(repos=model.names, matrix=matrix, tokens=self._tokens)
    bow.meta["dependencies"] = [self._uasts2bow.docfreq]
    return bow
def convert_model(self, model: UASTModel) -> BOW:
    """
    Convert a UASTModel into a BOW model holding a single matrix row.

    `self._uasts2bow` is applied to the whole `model.uasts` collection; the keys
    of the returned bag become the column indices of the row and the values
    become its entries.

    :param model: UASTModel providing `uasts` and `repository`.
    :return: BOW constructed for `[model.repository]`, with \
             `self._uasts2bow.docfreq` recorded as its dependency.
    """
    bag = self._uasts2bow(model.uasts)
    weights = numpy.array(list(bag.values()), dtype=numpy.float32)
    columns = numpy.array(list(bag.keys()), dtype=numpy.int32)
    # One row only: the CSR row pointer spans every stored entry.
    row_pointer = [0, len(weights)]
    vocabulary_size = len(self._uasts2bow.vocabulary)
    matrix = csr_matrix((weights, columns, row_pointer), shape=(1, vocabulary_size))
    result = BOW(log_level=logging.WARNING)
    result.construct(repos=[model.repository], matrix=matrix, tokens=self._tokens)
    result.meta["dependencies"] = [self._uasts2bow.docfreq]
    return result
def test_all(self):
    """
    Run `source2bow_entry` over the source fixtures and validate its output.

    Every file written to the temporary output directory must load as a BOW
    with a non-empty matrix and exactly one repository, and exactly four files
    must have been written.
    """
    with tempfile.TemporaryDirectory(prefix="ast2vec-test-source2bow-") as tmpdir:
        args = argparse.Namespace(
            processes=2,
            input=paths.DATA_DIR_SOURCE,
            output=tmpdir,
            filter="**/source_*.asdf",
            vocabulary_size=500,
            docfreq=os.path.join(os.path.dirname(__file__), paths.DOCFREQ),
            overwrite_existing=True)
        source2bow_entry(args)
        produced = os.listdir(tmpdir)
        for name in produced:
            bow = BOW().load(os.path.join(tmpdir, name))
            self.assertGreater(bow._matrix.getnnz(), 0)
            self.assertEqual(len(bow.repos), 1)
        # Count the files directly instead of relying on the last loop index:
        # with an empty output directory the index would never be bound and the
        # test would die with NameError rather than report a failed assertion.
        self.assertEqual(len(produced), 4)
def bow2vw_entry(args: argparse.Namespace):
    """
    Load a BOW model and hand it to `convert_bow_to_vw`.

    When `args.nbow` is truthy the model is obtained through
    `NBOW.as_bow(args.nbow, args.id2vec)`; otherwise it is loaded from
    `args.bow`.

    :param args: Namespace with the attributes `nbow`, `id2vec`, `bow` and `output`.
    """
    if args.nbow:
        model = NBOW.as_bow(args.nbow, args.id2vec)
    else:
        model = BOW().load(source=args.bow)
    convert_bow_to_vw(model, args.output)