Ejemplo n.º 1
0
    def update_association(self, postfreq_sum, commentfreq_sum, vocab_pairsum,
                           vocab_ids, batch_size):
        """Recompute and store NPMI and confidence for stored associations.

        Associations are fetched with ``query_association_by_vocabt_id`` for
        the given ``vocab_ids``. For every
        ``(vocabt_id, vocabc_id, tokenizer)`` row the method computes::

            px  = postfreq(vocabt_id)    / postfreq_sum
            py  = commentfreq(vocabc_id) / commentfreq_sum
            pxy = stored 'pxy' value     / vocab_pairsum

        and writes ``normalized_pmi(px, py, pxy)`` and ``log(pxy / px)``
        back through ``update_association_sql`` in batches.

        Args:
            postfreq_sum: Divisor applied to a vocab's ``postfreq``.
            commentfreq_sum: Divisor applied to a vocab's ``commentfreq``.
            vocab_pairsum: Divisor applied to the stored ``pxy`` column
                (presumably a raw co-occurrence count -- confirm against
                the table definition).
            vocab_ids: Iterable of vocab ids passed to the association
                query. An empty iterable makes the call a no-op.
            batch_size: Forwarded to ``self.batch_list`` for every column.

        Returns:
            None.

        Raises:
            ZeroDivisionError: If one of the divisors, or a computed ``px``,
                is zero.
            ValueError: If ``pxy / px`` is not positive (``math.log``).
        """
        # Materialize once so any iterable is accepted and emptiness can be
        # tested reliably.
        vocab_ids = tuple(vocab_ids)
        if not vocab_ids:
            # Nothing to look up. This mirrors the guard already applied to
            # the vocab query below and avoids sending an empty tuple to the
            # database (presumably rendered as an ``IN ()`` clause -- confirm
            # against the SQL text).
            return

        qassociation, schema = self._query_all(
            self.query_association_by_vocabt_id, (vocab_ids, ))
        vocabt_col = schema['vocabt_id']
        vocabc_col = schema['vocabc_id']
        tokenizer_col = schema['tokenizer']
        pxy_col = schema['pxy']
        association_dict = {
            (row[vocabt_col], row[vocabc_col], row[tokenizer_col]):
            row[pxy_col]
            for row in qassociation
        }

        # Every vocab id that appears on either side of an association.
        total_vocab_id = {
            vocab_id
            for vocabt_id, vocabc_id, _ in association_dict
            for vocab_id in (vocabt_id, vocabc_id)
        }
        if not total_vocab_id:
            return

        qvocab, schema = self._query_all(self.query_vocab_by_id_sql,
                                         (tuple(total_vocab_id), ))
        id_col = schema['id']
        postfreq_col = schema['postfreq']
        commentfreq_col = schema['commentfreq']
        qvocab_dict = {
            v[id_col]: (v[postfreq_col], v[commentfreq_col])
            for v in qvocab
        }

        vocabt_all = []
        vocabc_all = []
        tokenizer_all = []
        npmi_all = []
        confidence_all = []
        for key, pair_value in association_dict.items():
            vocabt_id, vocabc_id, tokenizer = key
            px = qvocab_dict[vocabt_id][0] / postfreq_sum
            py = qvocab_dict[vocabc_id][1] / commentfreq_sum
            pxy = pair_value / vocab_pairsum

            vocabt_all.append(vocabt_id)
            vocabc_all.append(vocabc_id)
            tokenizer_all.append(tokenizer)
            npmi_all.append(self.normalized_pmi(px, py, pxy))
            confidence_all.append(math.log(pxy / px))

        # The columns are batched independently; they stay aligned because
        # all five lists have the same length and use the same batch size.
        batches = zip(
            self.batch_list(vocabt_all, batch_size),
            self.batch_list(vocabc_all, batch_size),
            self.batch_list(tokenizer_all, batch_size),
            self.batch_list(confidence_all, batch_size),
            self.batch_list(npmi_all, batch_size),
        )
        for vocabt_id, vocabc_id, tokenizer, confidence, pmi in batches:
            # A fresh PsqlQuery per batch, as in the original code.
            psql = PsqlQuery()
            psql.update(
                self.update_association_sql, {
                    'vocabt_id': vocabt_id,
                    'vocabc_id': vocabc_id,
                    'tokenizer': tokenizer,
                    'confidence': confidence,
                    'pmi': pmi
                })
Ejemplo n.º 2
0
 def update_vocab_commentfreq(self, vocab_id):
     """Recount and store the comment frequency of the given vocab ids.

     The ids are de-duplicated, the rows returned by
     ``query_vocab2comment_by_vid_sql`` are counted per ``vocabulary_id``,
     and the counts are written back with ``update_vocab_commentfreq_sql``.
     Ids with no returned row get a frequency of 0.

     Args:
         vocab_id: Iterable of vocab ids. An empty iterable makes the call
             a no-op.

     Returns:
         None.
     """
     vocab_id = list(set(vocab_id))
     if not vocab_id:
         # Nothing to update; also avoids sending an empty tuple to the
         # database (presumably rendered as an ``IN ()`` clause -- confirm
         # against the SQL text).
         return

     qvocab2comment, schema = self._query_all(
         self.query_vocab2comment_by_vid_sql, (tuple(vocab_id), ))
     vid_col = schema['vocabulary_id']
     vocab_cnt = collections.Counter(v2c[vid_col] for v2c in qvocab2comment)
     # Counter returns 0 for a missing key, so no membership test is needed.
     freq = [vocab_cnt[id_] for id_ in vocab_id]

     psql = PsqlQuery()
     psql.update(self.update_vocab_commentfreq_sql, {
         'id_': vocab_id,
         'commentfreq': freq
     })