コード例 #1
0
    def query_vocab_by_title_id(self, title_id):
        """Fetch the vocabulary rows linked to the given title ids.

        Duplicate ids are collapsed first. Two queries are issued: the
        first (``self.query_vocab2post_by_tid_sql``) yields link rows, and
        the distinct values of their ``vocabulary_id`` column become the
        parameter of the second (``self.query_vocab_by_id``).

        Args:
            title_id: Iterable of hashable title ids.

        Returns:
            The ``(rows, schema)`` pair produced by the second query.
        """
        unique_title_ids = tuple(set(title_id))

        db = PsqlQuery()
        link_rows, link_schema = db.query_all(
            self.query_vocab2post_by_tid_sql, (unique_title_ids, ))

        # NOTE(review): other methods read ``self.query_vocab_by_id_sql``;
        # confirm ``self.query_vocab_by_id`` is the intended attribute.
        unique_vocab_ids = tuple(
            {row[link_schema['vocabulary_id']] for row in link_rows})
        vocab_rows, vocab_schema = db.query_all(
            self.query_vocab_by_id, (unique_vocab_ids, ))

        return vocab_rows, vocab_schema
コード例 #2
0
    def query_vocab_by_post_id(self, post_id):
        """Fetch the vocabulary rows linked to the given post ids.

        Duplicate ids are collapsed first. The link rows returned by
        ``self.query_vocab2post_by_pid_sql`` supply the distinct
        ``vocabulary_id`` values used as the parameter of
        ``self.query_vocab_by_id``.

        Args:
            post_id: Iterable of hashable post ids.

        Returns:
            The ``(rows, schema)`` pair produced by the second query.
        """
        unique_post_ids = tuple(set(post_id))

        db = PsqlQuery()
        link_rows, link_schema = db.query_all(
            self.query_vocab2post_by_pid_sql, (unique_post_ids, ))

        # NOTE(review): other methods read ``self.query_vocab_by_id_sql``;
        # confirm ``self.query_vocab_by_id`` is the intended attribute.
        unique_vocab_ids = tuple(
            {row[link_schema['vocabulary_id']] for row in link_rows})
        vocab_rows, vocab_schema = db.query_all(
            self.query_vocab_by_id, (unique_vocab_ids, ))

        return vocab_rows, vocab_schema
コード例 #3
0
    def query_vocab_by_comment_id(self, comment_id):
        """Fetch the vocabulary rows linked to the given comment ids.

        Duplicate ids are collapsed first. The link rows returned by
        ``self.query_vocab2comment_by_cmtid_sql`` supply the distinct
        ``vocabulary_id`` values used as the parameter of
        ``self.query_vocab_by_id``.

        Args:
            comment_id: Iterable of hashable comment ids.

        Returns:
            The ``(rows, schema)`` pair produced by the second query.
        """
        unique_comment_ids = tuple(set(comment_id))

        db = PsqlQuery()
        link_rows, link_schema = db.query_all(
            self.query_vocab2comment_by_cmtid_sql, (unique_comment_ids, ))

        # NOTE(review): other methods read ``self.query_vocab_by_id_sql``;
        # confirm ``self.query_vocab_by_id`` is the intended attribute.
        unique_vocab_ids = tuple(
            {row[link_schema['vocabulary_id']] for row in link_rows})
        vocab_rows, vocab_schema = db.query_all(
            self.query_vocab_by_id, (unique_vocab_ids, ))

        return vocab_rows, vocab_schema
コード例 #4
0
    def guery_vocab_group_by_comment_id(self, comment_id):
        """Run ``self.guery_vocab_group_by_comment_id_sql`` for comment ids.

        Args:
            comment_id: Iterable of comment ids; passed to the query as a
                single tuple parameter.

        Returns:
            The ``(rows, schema)`` pair returned by ``PsqlQuery.query_all``.
        """
        rows, columns = PsqlQuery().query_all(
            self.guery_vocab_group_by_comment_id_sql, (tuple(comment_id), ))
        return rows, columns
コード例 #5
0
    def guery_vocab_group_by_title_id(self, title_id):
        """Run ``self.guery_vocab_group_by_title_id_sql`` for title ids.

        Args:
            title_id: Iterable of title ids; passed to the query as a
                single tuple parameter.

        Returns:
            The ``(rows, schema)`` pair returned by ``PsqlQuery.query_all``.
        """
        rows, columns = PsqlQuery().query_all(
            self.guery_vocab_group_by_title_id_sql, (tuple(title_id), ))
        return rows, columns
コード例 #6
0
    def query_comment_by_post(self, post_id):
        """Query comments for the given posts under this tokenizer tag.

        Each post id is paired with ``self.tokenizer_tag`` and the pairs are
        passed to ``self.query_comment_by_unique_sql`` as one tuple parameter.

        Args:
            post_id: Iterable of post ids.

        Returns:
            The ``(rows, schema)`` pair returned by ``PsqlQuery.query_all``.
        """
        keys = tuple((pid, self.tokenizer_tag) for pid in post_id)

        rows, columns = PsqlQuery().query_all(
            self.query_comment_by_unique_sql, (keys, ))
        return rows, columns
コード例 #7
0
    def get_comment_obj(self, post_id):
        """Build ``Comment`` objects for the comments of the given posts.

        Comments are fetched with ``query_comment_by_post``, their vocabulary
        groups with ``guery_vocab_group_by_comment_id``, and the vocabulary
        rows themselves with ``self.query_vocab_by_id_sql``. Comments whose
        id is in ``self.excluded_comment_ids`` are skipped; a comment with no
        vocabulary group gets an empty vocab list.

        Args:
            post_id: Collection of post ids; a falsy value yields ``[]``.

        Returns:
            A list of ``Comment`` objects. ``body`` is the ``tokenized``
            column with all whitespace removed.
        """
        if not post_id:
            return []

        # Bottleneck ?
        comments, cmtschema = self.query_comment_by_post(post_id)

        cmtid = [cmt[cmtschema['id']] for cmt in comments]
        # Return before the follow-up queries: with no comments there is
        # nothing to build, and the id tuple passed to them would be empty.
        if not cmtid:
            return []

        cmt2vocab, c2vschema = self.guery_vocab_group_by_comment_id(cmtid)

        # Distinct vocabulary ids referenced by any of the comments.
        vid = list({
            v
            for c2v in cmt2vocab for v in c2v[c2vschema['vocabulary_group']]
        })

        psql = PsqlQuery()
        cvocab, vschema = psql.query_all(self.query_vocab_by_id_sql,
                                         (tuple(vid), ))

        # comment id -> vocabulary ids, and vocabulary id -> vocabulary row.
        c2v_dict = {
            c2v[c2vschema['comment_id']]: c2v[c2vschema['vocabulary_group']]
            for c2v in cmt2vocab
        }
        v_dict = {v[vschema['id']]: v for v in cvocab}

        comment_objs = []
        for i, cmt in enumerate(comments):
            comment_id = cmt[cmtschema['id']]
            if comment_id not in self.excluded_comment_ids:
                vocabs = [
                    self._construct_vocab(v_dict[vocab_id], vschema)
                    for vocab_id in c2v_dict.get(comment_id, [])
                ]

                comment_objs.append(
                    Comment(vocabs,
                            self.tokenizer_tag,
                            post_id=cmt[cmtschema['post_id']],
                            audience=cmt[cmtschema['audience_id']],
                            quality=cmt[cmtschema['quality']],
                            ctype=cmt[cmtschema['ctype']],
                            retrieval_count=cmt[cmtschema['retrieval_count']],
                            floor=cmt[cmtschema['floor']],
                            id_=comment_id,
                            body=''.join(cmt[cmtschema['tokenized']].split())))

            # NOTE(review): get_title_obj caps its loop with ``>=``; this
            # one uses ``>``. Left unchanged - confirm which is intended.
            if i > self.max_query_comment_num:
                break

        return comment_objs
コード例 #8
0
    def get_title_obj(self, vocab_id):
        """Build ``Title`` objects for titles that use the given vocabulary.

        Title ids come from ``query_vocab2title`` (minus
        ``self.excluded_title_ids``), their vocabulary groups from
        ``guery_vocab_group_by_title_id``, the title rows from
        ``query_title_by_id`` and the vocabulary rows from
        ``self.query_vocab_by_id_sql``. Titles whose post id is in
        ``self.excluded_post_ids`` are skipped.

        Args:
            vocab_id: Collection of vocabulary ids; a falsy value yields
                ``[]``.

        Returns:
            A list of ``Title`` objects. ``body`` is the ``tokenized``
            column with all whitespace removed.
        """
        if not vocab_id:
            return []

        # Bottleneck ?
        v2t, v2tschema = self.query_vocab2title(vocab_id)
        fltr_tid = [
            title_id
            for title_id in (q[v2tschema['title_id']] for q in v2t)
            if title_id not in self.excluded_title_ids
        ]
        # Every candidate title was excluded (or none matched): return
        # before issuing a query with an empty id tuple.
        if not fltr_tid:
            return []

        title2vocab, t2vschema = self.guery_vocab_group_by_title_id(fltr_tid)

        tid = list({t2v[t2vschema['title_id']] for t2v in title2vocab})
        vid = list({
            v
            for t2v in title2vocab for v in t2v[t2vschema['vocabulary_group']]
        })

        if not tid:
            return []

        title_generator, tschema = self.query_title_by_id(tid)

        psql = PsqlQuery()
        tvocab, vschema = psql.query_all(self.query_vocab_by_id_sql,
                                         (tuple(vid), ))

        # title id -> vocabulary ids, and vocabulary id -> vocabulary row.
        t2v_dict = {
            t2v[t2vschema['title_id']]: t2v[t2vschema['vocabulary_group']]
            for t2v in title2vocab
        }
        v_dict = {v[vschema['id']]: v for v in tvocab}

        title_objs = []
        for i, tt in enumerate(title_generator):
            if tt[tschema['post_id']] not in self.excluded_post_ids:
                vocabs = [
                    self._construct_vocab(v_dict[vocab], vschema)
                    for vocab in t2v_dict[tt[tschema['id']]]
                ]

                # ctype and retrieval_count previously both read the
                # 'quality' column; each now reads its own column.
                # NOTE(review): assumes the title schema exposes 'ctype' and
                # 'retrieval_count' as the comment schema does - confirm.
                title_objs.append(
                    Title(vocabs,
                          self.tokenizer_tag,
                          post_id=tt[tschema['post_id']],
                          quality=tt[tschema['quality']],
                          ctype=tt[tschema['ctype']],
                          retrieval_count=tt[tschema['retrieval_count']],
                          body=''.join(tt[tschema['tokenized']].split()),
                          id_=tt[tschema['id']]))
            if i >= self.max_query_title_num:
                break

        return title_objs
コード例 #9
0
    def guery_vocab_group_by_title_using_vocab(self, vocab_id, ex_title_id):
        """Run ``self.guery_vocab_group_by_title_using_vocab_id_sql``.

        Args:
            vocab_id: Iterable of vocabulary ids, bound to the ``vid``
                named parameter as a tuple.
            ex_title_id: Iterable of title ids bound to the ``tid`` named
                parameter as a tuple. A falsy value is replaced by
                ``[-1]`` so the tuple is never empty.

        Returns:
            The ``(rows, schema)`` pair returned by ``PsqlQuery.query_all``.
        """
        db = PsqlQuery()

        excluded = ex_title_id if ex_title_id else [-1]
        params = {
            'vid': tuple(vocab_id),
            'tid': tuple(excluded),
        }

        rows, columns = db.query_all(
            self.guery_vocab_group_by_title_using_vocab_id_sql, params)
        return rows, columns
コード例 #10
0
    def query_vocab_by_words(self, wds, relative_words=None):
        """Query vocabulary rows matching the given word objects.

        Each word contributes a ``(word, pos, tokenizer_tag)`` triple built
        from its ``.word`` and ``.pos`` attributes; the triples are passed
        to ``self.query_vocab_sql`` as one tuple parameter.

        Args:
            wds: Iterable of word objects.
            relative_words: Optional iterable of extra word objects. If it
                cannot be turned into a list, the error is logged as a
                warning and only ``wds`` is used.

        Returns:
            The ``(rows, schema)`` pair returned by ``PsqlQuery.query_all``.
        """
        lookup_words = list(wds)
        if relative_words:
            try:
                lookup_words += list(relative_words)
            except Exception as err:
                self.logger.warning(err)

        keys = tuple(
            (item.word, item.pos, self.tokenizer_tag)
            for item in lookup_words)

        rows, columns = PsqlQuery().query_all(self.query_vocab_sql, (keys, ))
        return rows, columns
コード例 #11
0
    def query_vocab2post(self, vocab_id):
        """Return the ``post_id`` column of the vocab-to-post link rows.

        Args:
            vocab_id: Iterable of vocabulary ids, passed to
                ``self.query_vocab2post_by_vid_sql`` as one tuple parameter.

        Returns:
            A list with one ``post_id`` value per returned row.
        """
        rows, columns = PsqlQuery().query_all(
            self.query_vocab2post_by_vid_sql, (tuple(vocab_id), ))

        post_ids = []
        for row in rows:
            post_ids.append(row[columns['post_id']])
        return post_ids
コード例 #12
0
 def query_comment_quality_by_id(self, comment_id):
     """Run ``self.query_comment_quality_by_id_sql`` for one parameter.

     Args:
         comment_id: Value passed as the query's only parameter.

     Returns:
         The ``(rows, schema)`` pair returned by ``PsqlQuery.query_all``.
     """
     params = (comment_id, )
     rows, columns = PsqlQuery().query_all(
         self.query_comment_quality_by_id_sql, params)
     return rows, columns
コード例 #13
0
 def query_title_quality_by_id(self, title_id):
     """Run ``self.query_title_quality_by_id_sql`` for one parameter.

     Args:
         title_id: Value passed as the query's only parameter.

     Returns:
         The ``(rows, schema)`` pair returned by ``PsqlQuery.query_all``.
     """
     params = (title_id, )
     rows, columns = PsqlQuery().query_all(
         self.query_title_quality_by_id_sql, params)
     return rows, columns
コード例 #14
0
 def query_vocab_quality_by_id(self, vocab_word):
     """Run ``self.query_vocab_quality_by_word_sql`` for one parameter.

     Despite the ``_by_id`` suffix of this method, the parameter is named
     ``vocab_word`` and the SQL attribute is the ``_by_word`` one.

     Args:
         vocab_word: Value passed as the query's only parameter.

     Returns:
         The ``(rows, schema)`` pair returned by ``PsqlQuery.query_all``.
     """
     params = (vocab_word, )
     rows, columns = PsqlQuery().query_all(
         self.query_vocab_quality_by_word_sql, params)
     return rows, columns
コード例 #15
0
 def _query_all(self, sql_string, data=None):
     """Run ``sql_string`` through a fresh ``PsqlQuery``.

     Args:
         sql_string: SQL text forwarded to ``PsqlQuery.query_all``.
         data: Optional query parameters, forwarded unchanged.

     Returns:
         The ``(rows, schema)`` pair returned by ``PsqlQuery.query_all``.
     """
     rows, columns = PsqlQuery().query_all(sql_string, data)
     return rows, columns
コード例 #16
0
ファイル: evaluate.py プロジェクト: ifengc/marginalbear
        return [v for v in cmt.vocabs]

    return [extract(cmt) for cmt in comments]


if __name__ == '__main__':
    with open('eval0829.csv', 'w') as f:
        f.write('random, base, pweight\n')
    psql = PsqlQuery()
    posts = psql.query(query_post_sql)
    pschema = psql.schema

    valid_post = 0

    for idx, p in enumerate(posts):
        titles, tschema = psql.query_all(
            query_title_sql, dict(pid=p[pschema['id']], tok='jieba'))

        basic_retriever = RetrievalEvaluate(
            'jieba',
            excluded_post_ids=[p[pschema['id']]],
            logger_name='retrieve')

        pweight_retriever = RetrievalEvaluate(
            'jieba',
            excluded_post_ids=[p[pschema['id']]],
            pweight=JiebaPosWeight.weight,
            logger_name='retrieve')

        query = ' '.join([
            '{}:{}'.format(w, p)
            for w, p in zip(titles[0][tschema['tokenized']].split(), titles[0][
コード例 #17
0
    def query_vocab2title(self, vocab_id):
        """Run ``self.query_vocab2title_by_vid_sql`` for vocabulary ids.

        Args:
            vocab_id: Iterable of vocabulary ids; passed to the query as a
                single tuple parameter.

        Returns:
            The ``(rows, schema)`` pair returned by ``PsqlQuery.query_all``.
        """
        rows, columns = PsqlQuery().query_all(
            self.query_vocab2title_by_vid_sql, (tuple(vocab_id), ))
        return rows, columns