Example 1
def hash_file(args):
    if not args.feature:
        raise ValueError("extractors must not be empty")
    log = logging.getLogger("hash_file")
    # Load the document frequency model and the weighted MinHash parameters.
    vocab = OrderedDocumentFrequencies().load(args.docfreq)
    params = WeightedMinHashParameters().load(args.params)
    log.info("Extracting UAST from %s", args.file)
    uast = BblfshClient(args.bblfsh).parse(args.file).uast
    log.info("Populating the bag")
    # Instantiate the requested feature extractors by name.
    extractors = [__extractors__[s](
        args.min_docfreq, **__extractors__[s].get_kwargs_fromcmdline(args))
        for s in args.feature]
    # Fill a TF-IDF weighted bag of features indexed by the vocabulary order.
    bag = numpy.zeros(len(vocab), dtype=numpy.float32)
    for ex in extractors:
        ex.ndocs = vocab.docs
        ex.docfreq = vocab
        for k, v in ex.extract(uast):
            try:
                i = vocab.order[k]
                bag[i] = log_tf_log_idf(df=vocab[k], tf=v, ndocs=vocab.docs)
            except KeyError:
                # Ignore features that are not in the vocabulary.
                continue

    log.info("Bag size: %d", len(bag.nonzero()[0]))
    log.info("Hashing")

    return weighted_minhash(bag, params.rs.shape[0], params.rs, params.ln_cs, params.betas), bag
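The function above could be driven programmatically as follows. This is only a sketch: the attribute names mirror what hash_file reads from args, but every value (feature name, model paths, Babelfish endpoint) is a placeholder, and the chosen extractor's get_kwargs_fromcmdline may expect further attributes.

from argparse import Namespace

# Hypothetical invocation; all values below are placeholders.
args = Namespace(
    feature=["id"],                # keys of __extractors__ to instantiate
    docfreq="docfreq.asdf",        # OrderedDocumentFrequencies model file
    params="minhash_params.asdf",  # WeightedMinHashParameters model file
    file="example.py",             # source file to parse
    bblfsh="0.0.0.0:9432",         # bblfshd gRPC endpoint
    min_docfreq=1,
)
hashes, bag = hash_file(args)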
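Every example on this page delegates the actual weighting to log_tf_log_idf, whose definition is not shown here. A minimal sketch of such a function, assuming the conventional sub-linear term frequency multiplied by a logarithmic inverse document frequency (the library's exact smoothing may differ):

import math

def log_tf_log_idf(tf, df, ndocs):
    # Sub-linear TF, log(1 + tf), times log IDF, log(ndocs / df).
    # This is an assumption, not the library's verbatim formula.
    return math.log(1 + tf) * math.log(ndocs / df)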
Example 2
    def __call__(self, head):
        c = self.Columns
        df = self.df
        return head \
            .filter(lambda x: df.get(x[c.token]) is not None) \
            .map(lambda x: Row(**{
                c.token: x[c.token],
                c.document: x[c.document],
                c.value: log_tf_log_idf(df=df[x[c.token]], tf=x[c.value], ndocs=df.docs)}))
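Stripped of Spark, the transformation above amounts to a filter-and-reweight pass. A sketch over plain dicts, where df stands in for the document frequency mapping, ndocs for df.docs, and log_tf_log_idf is the weighting sketched earlier:

def tfidf_rows(rows, df, ndocs):
    # Keep only tokens present in the document frequency mapping and
    # replace the raw term frequency with its TF-IDF weight.
    for x in rows:
        if x["token"] in df:
            yield {
                "token": x["token"],
                "document": x["document"],
                "value": log_tf_log_idf(df=df[x["token"]], tf=x["value"], ndocs=ndocs),
            }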
Example 3
    def test_call(self):
        baseline = {
            Row(d=dict(i)["d"], t=dict(i)["t"],
                v=log_tf_log_idf(dict(i)["v"], int(dict(i)["t"]), self.docs))
            for i in tfidf_data.term_freq_result
        }

        result = self.tfidf(
            self.session.sparkContext
                .parallelize(tfidf_data.term_freq_result)
                .map(lambda x: Row(**dict(x)))).collect()
        self.assertEqual(set(result), baseline)
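The tfidf_data.term_freq_result fixture is not shown. Judging by the keys the test reads, each element converts to a dict with d (document), t (token) and v (term frequency), and the tokens apparently parse as integers because the baseline feeds int(t) to the weighting as the document frequency. A purely hypothetical fixture in that shape:

from pyspark.sql import Row

# Hypothetical fixture rows; the real test data ships with the project.
term_freq_result = [
    Row(d="doc1", t="5", v=2),
    Row(d="doc2", t="3", v=1),
]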
Example 4
    def __call__(self, head: RDD):
        """

        :param head: pyspark rdd where each row is named tuple with `token`, `document` and `value`
                   (term frequency) fields. One can use Uast2TermFreq Transformer to calculate
                   such rdd.
        :return: rdd after applying TFIDF.
        """
        c = self.Columns
        df = self.sc.broadcast(self.df)
        ndocs = self.ndocs
        head = head \
            .filter(lambda x: df.value.get(x[c.token]) is not None) \
            .map(lambda x: Row(**{
                c.token: x[c.token],
                c.document: x[c.document],
                c.value: log_tf_log_idf(df=df.value[x[c.token]], tf=x[c.value], ndocs=ndocs)}))
        df.unpersist(blocking=True)
        return head
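The transformer's constructor is not shown, but the broadcast, filter and map pattern can be exercised on its own. A self-contained sketch with a local SparkSession, made-up document frequencies and the log_tf_log_idf sketch from earlier:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").appName("tfidf-sketch").getOrCreate()
sc = spark.sparkContext

docfreq = {"foo": 3, "bar": 7}   # token -> document frequency (made up)
ndocs = 10
df = sc.broadcast(docfreq)

head = sc.parallelize([
    Row(token="foo", document="a.py", value=2),
    Row(token="baz", document="a.py", value=1),   # unknown token, filtered out
])

weighted = head \
    .filter(lambda x: df.value.get(x.token) is not None) \
    .map(lambda x: Row(token=x.token, document=x.document,
                       value=log_tf_log_idf(tf=x.value, df=df.value[x.token], ndocs=ndocs)))
print(weighted.collect())
df.unpersist(blocking=True)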