def hash_file(args):
    if not args.feature:
        raise ValueError("extractors must not be empty")
    log = logging.getLogger("hash_file")
    vocab = OrderedDocumentFrequencies().load(args.docfreq)
    params = WeightedMinHashParameters().load(args.params)
    log.info("Extracting UAST from %s", args.file)
    uast = BblfshClient(args.bblfsh).parse(args.file).uast
    log.info("Populating the bag")
    extractors = [__extractors__[s](
        args.min_docfreq, **__extractors__[s].get_kwargs_fromcmdline(args))
        for s in args.feature]
    # Weighted bag of features over the document-frequency vocabulary.
    bag = numpy.zeros(len(vocab), dtype=numpy.float32)
    for ex in extractors:
        ex.ndocs = vocab.docs
        ex.docfreq = vocab
        for k, v in ex.extract(uast):
            try:
                i = vocab.order[k]
                bag[i] = log_tf_log_idf(df=vocab[k], tf=v, ndocs=vocab.docs)
            except KeyError:
                # Feature is absent from the vocabulary - skip it.
                continue
    log.info("Bag size: %d", len(bag.nonzero()[0]))
    log.info("Hashing")
    return weighted_minhash(bag, params.rs.shape[0], params.rs, params.ln_cs,
                            params.betas), bag
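# hash_file above and the TF-IDF transformers below both rely on log_tf_log_idf.
# A minimal sketch of that weighting, assuming the usual "sublinear TF times
# log IDF" form; the project's actual smoothing constants may differ.
import math

def log_tf_log_idf_sketch(tf, df, ndocs):
    # log-scaled term frequency multiplied by log inverse document frequency
    return math.log(1 + tf) * math.log(ndocs / df)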
def __call__(self, head):
    c = self.Columns
    df = self.df
    return head \
        .filter(lambda x: df.get(x[c.token]) is not None) \
        .map(lambda x: Row(**{
            c.token: x[c.token],
            c.document: x[c.document],
            c.value: log_tf_log_idf(df=df[x[c.token]], tf=x[c.value],
                                    ndocs=df.docs)}))
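# A Spark-free sketch of the per-row transform above, using plain dicts instead
# of pyspark Rows. The field names mirror the snippet; the function and variable
# names here are illustrative assumptions, not part of the project's API. Note
# that the variant above closes over self.df directly, so the whole frequency
# mapping travels with each task closure; the broadcast variant below avoids that.
def tfidf_rows_sketch(rows, df, ndocs):
    import math
    weighted = []
    for row in rows:
        freq = df.get(row["token"])
        if freq is None:
            continue  # tokens missing from the document-frequency model are dropped
        weighted.append({
            "token": row["token"],
            "document": row["document"],
            # the same log TF * log IDF weighting is assumed here
            "value": math.log(1 + row["value"]) * math.log(ndocs / freq),
        })
    return weighted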
def test_call(self):
    baseline = {
        Row(d=dict(i)["d"], t=dict(i)["t"],
            v=log_tf_log_idf(dict(i)["v"], int(dict(i)["t"]), self.docs))
        for i in tfidf_data.term_freq_result}
    result = self.tfidf(
        self.session.sparkContext
            .parallelize(tfidf_data.term_freq_result)
            .map(lambda x: Row(**dict(x)))).collect()
    self.assertEqual(set(result), baseline)
def __call__(self, head: RDD):
    """
    :param head: pyspark RDD where each row is a named tuple with `token`, `document` \
        and `value` (term frequency) fields. One can use the Uast2TermFreq transformer \
        to calculate such an RDD.
    :return: RDD after applying TF-IDF.
    """
    c = self.Columns
    df = self.sc.broadcast(self.df)
    ndocs = self.ndocs
    head = head \
        .filter(lambda x: df.value.get(x[c.token]) is not None) \
        .map(lambda x: Row(**{
            c.token: x[c.token],
            c.document: x[c.document],
            c.value: log_tf_log_idf(df=df.value[x[c.token]], tf=x[c.value],
                                    ndocs=ndocs)}))
    df.unpersist(blocking=True)
    return head
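# A hedged usage sketch of the broadcast pattern employed above, runnable on a
# local PySpark session. The toy data and names (toy_df, toy_rows) are
# illustrative assumptions; only the row shape (token, document, value) and the
# broadcast/unpersist calls come from the snippet above.
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").appName("tfidf-sketch").getOrCreate()
sc = spark.sparkContext

toy_df = {"foo": 2, "bar": 5}   # token -> document frequency
bcast = sc.broadcast(toy_df)    # shipped to executors once instead of with every task closure

toy_rows = sc.parallelize([Row(token="foo", document=0, value=3),
                           Row(token="baz", document=1, value=1)])
kept = toy_rows.filter(lambda r: bcast.value.get(r.token) is not None).collect()
print(kept)  # only the "foo" row survives; "baz" is out of vocabulary

bcast.unpersist(blocking=True)  # release the executor-side copies
spark.stop()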