import numpy as np
from datetime import datetime

# PsqlQuery, upsert_post_sql and oklogger are provided by the surrounding project.
def upsert_post(batch_post):
    timestamp = [post['timestamp'] for post in batch_post]
    _title = [post['title'] for post in batch_post]
    _url = [post['url'] for post in batch_post]
    _author = [post['author'] for post in batch_post]
    _content = [post['content'] for post in batch_post]
    _comment = [post['comment'] for post in batch_post]
    comment_len = [len(cmt) for cmt in _comment]
    sorted_idx = np.argsort(comment_len)[::-1]
    title, url, author, content, comment, publish_date = [], [], [], [], [], []

    for idx in sorted_idx:
        if _url[idx] not in url:
            url.append(_url[idx])
            title.append(_title[idx])
            author.append(_author[idx])
            content.append(_content[idx])
            comment.append(_comment[idx])
            publish_date.append(datetime.fromtimestamp(timestamp[idx]))

    post_id = []
    try:
        psql = PsqlQuery()
        post_id = psql.upsert(upsert_post_sql, locals())
    except Exception as e:
        oklogger.logger.error(e)
        oklogger.logger.error(title)
        raise e

    return post_id
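The dedup rule above keeps, for each URL, the post whose comment list is longest, because the descending argsort visits that entry first. Below is a small self-contained sketch of just that rule on toy data; the example.org URLs and field values are hypothetical.

import numpy as np

batch_post = [
    {'url': 'https://example.org/a', 'comment': ['push']},
    {'url': 'https://example.org/a', 'comment': ['push', 'boo']},
]
# Visit posts from most to fewest comments; keep the first hit per URL.
order = np.argsort([len(p['comment']) for p in batch_post])[::-1]
seen, kept = set(), []
for idx in order:
    if batch_post[idx]['url'] not in seen:
        seen.add(batch_post[idx]['url'])
        kept.append(batch_post[idx])
print(len(kept), kept[0]['comment'])  # 1 ['push', 'boo']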
Example #2
    def query_vocab_group_by_comment_id(self, comment_id):
        psql = PsqlQuery()

        comment2vocab, schema = psql.query_all(
            self.query_vocab_group_by_comment_id_sql, (tuple(comment_id), ))

        return comment2vocab, schema
Example #3
    def upsert_vocab2post(self,
                          batch_post,
                          post_id,
                          vocab_bundle,
                          vschema,
                          tokenized_field='title_tokenized'):
        tokenized = [[(k['word'], k['pos'], self.tokenizer)
                      for k in p[tokenized_field]] for p in batch_post]

        vocab2post = []
        for vocab in vocab_bundle:
            vtuple = (vocab[vschema['word']], vocab[vschema['pos']],
                      vocab[vschema['tokenizer']])
            post_id_with_vocab = [
                p for idx, p in enumerate(post_id) if vtuple in tokenized[idx]
            ]
            vocab2post.append([(vocab[vschema['id']], pid)
                               for pid in post_id_with_vocab])

        flatten_vocab2post = [tup for v2p in vocab2post for tup in v2p]

        vocabulary_id = [v2p[0] for v2p in flatten_vocab2post]
        flatten_post_id = [v2p[1] for v2p in flatten_vocab2post]

        psql = PsqlQuery()
        psql.upsert(self.upsert_vocab2post_sql, {
            'vocabulary_id': vocabulary_id,
            'post_id': flatten_post_id
        })

        return vocabulary_id
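Note that `vtuple in tokenized[idx]` scans a Python list for every (vocab, post) pair. Below is a self-contained toy rendering of the same inverted mapping, using sets for O(1) membership; this is an optimization sketch with made-up data, not the project's code.

# word/pos/tokenizer triples per post, as sets instead of lists
tokenized = [{('cat', 'N', 'jieba'), ('runs', 'V', 'jieba')},
             {('cat', 'N', 'jieba')}]
post_id = [101, 102]
vocab_bundle = [(7, 'cat', 'N', 'jieba'), (8, 'runs', 'V', 'jieba')]  # (id, word, pos, tokenizer)

vocab2post = []
for vid, word, pos, tok in vocab_bundle:
    vocab2post.extend((vid, p) for s, p in zip(tokenized, post_id)
                      if (word, pos, tok) in s)
print(vocab2post)  # [(7, 101), (7, 102), (8, 101)]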
Example #4
    def query_vocab_group_by_title_id(self, title_id):
        psql = PsqlQuery()

        title2vocab, schema = psql.query_all(
            self.query_vocab_group_by_title_id_sql, (tuple(title_id), ))

        return title2vocab, schema
Example #5
    def query_comment_by_post(self, post_id):
        bundle = [(id_, self.tokenizer_tag) for id_ in post_id]
        psql = PsqlQuery()
        comment, schema = psql.query_all(self.query_comment_by_unique_sql,
                                         (tuple(bundle), ))

        return comment, schema
Example #6
    def upsert_vocab2comment(self,
                             batch_comment,
                             comment_id,
                             vocab_bundle,
                             vschema,
                             tokenized_field='comment_tokenized'):

        tokenized = [[(k['word'], k['pos'], self.tokenizer)
                      for k in p[tokenized_field]] for p in batch_comment]
        vocab2comment = []

        for vocab in vocab_bundle:
            vtuple = (vocab[vschema['word']], vocab[vschema['pos']],
                      vocab[vschema['tokenizer']])
            comment_id_with_vocab = [
                cmt for idx, cmt in enumerate(comment_id)
                if vtuple in tokenized[idx]
            ]
            vocab2comment.append([(vocab[vschema['id']], cid)
                                  for cid in comment_id_with_vocab])

        flatten_vocab2cmt = [tup for v2c in vocab2comment for tup in v2c]

        vocabulary_id = [v2c[0] for v2c in flatten_vocab2cmt]
        cmt_id = [v2c[1] for v2c in flatten_vocab2cmt]

        psql = PsqlQuery()
        psql.upsert(self.upsert_vocab2comment_sql, {
            'vocabulary_id': vocabulary_id,
            'comment_id': cmt_id
        })

        return vocabulary_id
Example #7
    def update_title_quality(self, id_to_update, quality):
        psql = PsqlQuery()

        return psql.upsert(self.update_title_quality_sql, {
            'id_': id_to_update,
            'quality': quality
        })
Example #8
    def insert_title(self,
                     batch_post,
                     post_id,
                     tokenized_field='title_tokenized',
                     type_field='ctype'):

        num = len(batch_post)
        # qpost, pschema = self.query_post(post_url)
        tokenized = [
            ' '.join([k['word'] for k in p[tokenized_field]])
            for p in batch_post
        ]
        grammar = [
            ' '.join([k['pos'] for k in p[tokenized_field]])
            for p in batch_post
        ]
        # post_id = [p[pschema['id']] for p in post_bundle]
        ctype = [p[type_field] for p in batch_post]
        tokenizer = [self.tokenizer] * num
        retrieval_count = [0] * num
        quality = [0.0 for _ in range(num)]

        psql = PsqlQuery()
        title_id = psql.upsert(self.insert_title_sql, locals())

        return [t[0] for t in title_id]
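insert_title assumes each post dict carries a list of {'word', 'pos'} tokens under tokenized_field. A minimal self-contained example of the two join operations, with hypothetical token values:

batch_post = [{'title_tokenized': [{'word': 'hello', 'pos': 'INT'},
                                   {'word': 'world', 'pos': 'N'}],
               'ctype': 'text'}]
tokenized = [' '.join(k['word'] for k in p['title_tokenized']) for p in batch_post]
grammar = [' '.join(k['pos'] for k in p['title_tokenized']) for p in batch_post]
print(tokenized, grammar)  # ['hello world'] ['INT N']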
Example #9
def draw_title():
    psql = PsqlQuery()
    gtitle = psql.query(draw_random_title_sql)
    schema = psql.schema
    # Only one row is needed, so take the first instead of draining the cursor.
    title = next(iter(gtitle))
    return title, schema
Example #10
    def query_comment_by_id(self, comment_id):
        psql = PsqlQuery()
        comment = psql.query(self.query_comment_by_id_sql, {
            'id_': tuple(comment_id),
            'tok': self.tokenizer_tag
        })
        schema = psql.schema

        return comment, schema
Example #11
    def query_title_by_id(self, title_id):
        psql = PsqlQuery()
        title = psql.query(self.query_title_by_id_sql, {
            'id_': tuple(title_id),
            'tok': self.tokenizer_tag
        })
        schema = psql.schema

        return title, schema
Example #12
    def insert_netizen(self, raw_name):
        name = list(set(raw_name))
        num = len(name)
        quality = [0.0 for _ in range(num)]
        posts = [0 for _ in range(num)]
        comments = [0 for _ in range(num)]
        psql = PsqlQuery()
        ids = psql.upsert(self.insert_netizen_sql, locals())
        return [i[0] for i in ids]
Example #13
    def update_association(self, postfreq_sum, commentfreq_sum, vocab_pairsum,
                           vocab_ids, batch_size):
        qassociation, schema = self._query_all(
            self.query_association_by_vocabt_id, (tuple(vocab_ids), ))
        association_dict = {(i[schema['vocabt_id']], i[schema['vocabc_id']],
                             i[schema['tokenizer']]): i[schema['pxy']]
                            for i in qassociation}

        total_vocab_id = list(
            set(
                it.chain.from_iterable([[i[0], i[1]]
                                        for i in association_dict])))

        if len(total_vocab_id) > 0:
            qvocab, schema = self._query_all(self.query_vocab_by_id_sql,
                                             (tuple(total_vocab_id), ))
            qvocab_dict = {
                v[schema['id']]:
                (v[schema['postfreq']], v[schema['commentfreq']])
                for v in qvocab
            }

            vocabt_all = []
            vocabc_all = []
            npmi_all = []
            confidence_all = []
            tokenizer_all = []
            for k, v in association_dict.items():
                px = qvocab_dict[k[0]][0] / postfreq_sum
                py = qvocab_dict[k[1]][1] / commentfreq_sum
                pxy = v / vocab_pairsum

                vocabt_all.append(k[0])
                vocabc_all.append(k[1])
                npmi_all.append(self.normalized_pmi(px, py, pxy))
                confidence_all.append(math.log(pxy / px))
                tokenizer_all.append(k[2])

            batch_vocabt = self.batch_list(vocabt_all, batch_size)
            batch_vocabc = self.batch_list(vocabc_all, batch_size)
            batch_tokenizer = self.batch_list(tokenizer_all, batch_size)
            batch_npmi = self.batch_list(npmi_all, batch_size)
            batch_confidence = self.batch_list(confidence_all, batch_size)

            for vocabt_id, vocabc_id, tokenizer, confidence, pmi in zip(
                    batch_vocabt, batch_vocabc, batch_tokenizer,
                    batch_confidence, batch_npmi):
                psql = PsqlQuery()
                psql.update(
                    self.update_association_sql, {
                        'vocabt_id': vocabt_id,
                        'vocabc_id': vocabc_id,
                        'tokenizer': tokenizer,
                        'confidence': confidence,
                        'pmi': pmi
                    })
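self.normalized_pmi is not shown in this listing. Given the px, py and pxy arguments above, a standard NPMI implementation would look like the following; this is an assumption about the helper, not its actual body:

import math

def normalized_pmi(px, py, pxy):
    # NPMI = PMI / -log p(x, y), mapping PMI into [-1, 1].
    # Assumes 0 < pxy < 1 so the denominator is positive.
    return math.log(pxy / (px * py)) / -math.log(pxy)

print(round(normalized_pmi(0.1, 0.1, 0.05), 3))  # 0.537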
Example #14
    def get_comment_obj(self, post_id):
        if not bool(post_id):
            return []

        # Bottleneck ?
        comments, cmtschema = self.query_comment_by_post(post_id)
        #

        cmtid = [cmt[cmtschema['id']] for cmt in comments]
        if not bool(cmtid):
            return []

        cmt2vocab, c2vschema = self.query_vocab_group_by_comment_id(cmtid)

        vid = list({
            v
            for c2v in cmt2vocab for v in c2v[c2vschema['vocabulary_group']]
        })

        psql = PsqlQuery()
        cvocab, vschema = psql.query_all(self.query_vocab_by_id_sql,
                                         (tuple(vid), ))

        c2v_dict = {
            c2v[c2vschema['comment_id']]: c2v[c2vschema['vocabulary_group']]
            for c2v in cmt2vocab
        }

        v_dict = {v[vschema['id']]: v for v in cvocab}
        comment_objs = []
        for i, cmt in enumerate(comments):
            if cmt[cmtschema['id']] not in self.excluded_comment_ids:
                if cmt[cmtschema['id']] in c2v_dict:
                    vocabs = [
                        self._construct_vocab(v_dict[vid], vschema)
                        for vid in c2v_dict[cmt[cmtschema['id']]]
                    ]
                else:
                    vocabs = []

                comment_objs.append(
                    Comment(vocabs,
                            self.tokenizer_tag,
                            post_id=cmt[cmtschema['post_id']],
                            audience=cmt[cmtschema['audience_id']],
                            quality=cmt[cmtschema['quality']],
                            ctype=cmt[cmtschema['ctype']],
                            retrieval_count=cmt[cmtschema['retrieval_count']],
                            floor=cmt[cmtschema['floor']],
                            id_=cmt[cmtschema['id']],
                            body=''.join(cmt[cmtschema['tokenized']].split())))

            if i > self.max_query_comment_num:
                break

        return comment_objs
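The method resolves comments to vocab objects through two lookups: c2v_dict maps a comment id to its vocabulary group, and v_dict maps a vocabulary id to its row. A toy, self-contained rendering of that join (ids and rows are hypothetical):

c2v_dict = {11: [7, 8], 12: [7]}              # comment_id -> vocabulary ids
v_dict = {7: ('cat', 'N'), 8: ('runs', 'V')}  # vocabulary id -> row
vocabs_for_11 = [v_dict[vid] for vid in c2v_dict[11]]
print(vocabs_for_11)  # [('cat', 'N'), ('runs', 'V')]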
Example #15
    def get_title_obj(self, vocab_id):
        if not bool(vocab_id):
            return []

        # Bottleneck ?
        v2t, v2tschema = self.query_vocab2title(vocab_id)
        fltr_tid = [
            q[v2tschema['title_id']] for q in v2t
            if q[v2tschema['title_id']] not in self.excluded_title_ids
        ]
        #

        title2vocab, t2vschema = self.query_vocab_group_by_title_id(fltr_tid)

        tid = list({t2v[t2vschema['title_id']] for t2v in title2vocab})
        vid = list({
            v
            for t2v in title2vocab for v in t2v[t2vschema['vocabulary_group']]
        })

        if not bool(tid):
            return []

        title_generator, tschema = self.query_title_by_id(tid)

        psql = PsqlQuery()
        tvocab, vschema = psql.query_all(self.query_vocab_by_id_sql,
                                         (tuple(vid), ))

        t2v_dict = {
            t2v[t2vschema['title_id']]: t2v[t2vschema['vocabulary_group']]
            for t2v in title2vocab
        }
        v_dict = {v[vschema['id']]: v for v in tvocab}
        title_objs = []
        for i, tt in enumerate(title_generator):
            if tt[tschema['post_id']] not in self.excluded_post_ids:
                vocabs = [
                    self._construct_vocab(v_dict[vid], vschema)
                    for vid in t2v_dict[tt[tschema['id']]]
                ]

                title_objs.append(
                    Title(vocabs,
                          self.tokenizer_tag,
                          post_id=tt[tschema['post_id']],
                          quality=tt[tschema['quality']],
                          ctype=tt[tschema['ctype']],
                          retrieval_count=tt[tschema['retrieval_count']],
                          body=''.join(tt[tschema['tokenized']].split()),
                          id_=tt[tschema['id']]))
            if i >= self.max_query_title_num:
                break

        return title_objs
Example #16
    def query_vocab_by_title_id(self, title_id):
        tid = list(set(title_id))

        psql = PsqlQuery()
        vocab2title, schema = psql.query_all(self.query_vocab2post_by_tid_sql,
                                             (tuple(tid), ))

        vocab_id = list({v2t[schema['vocabulary_id']] for v2t in vocab2title})
        vocab, vschema = psql.query_all(self.query_vocab_by_id,
                                        (tuple(vocab_id), ))

        return vocab, vschema
Example #17
    def query_vocab_by_post_id(self, post_id):
        pid = list(set(post_id))

        psql = PsqlQuery()
        vocab2post, schema = psql.query_all(self.query_vocab2post_by_pid_sql,
                                            (tuple(pid), ))

        vocab_id = list({v2p[schema['vocabulary_id']] for v2p in vocab2post})
        vocab, vschema = psql.query_all(self.query_vocab_by_id,
                                        (tuple(vocab_id), ))

        return vocab, vschema
Example #18
    def update_vocab_commentfreq(self, vocab_id):
        vocab_id = list(set(vocab_id))
        qvocab2comment, schema = self._query_all(
            self.query_vocab2comment_by_vid_sql, (tuple(vocab_id), ))
        qvocab_id = [v2c[schema['vocabulary_id']] for v2c in qvocab2comment]
        vocab_cnt = collections.Counter(qvocab_id)
        freq = [vocab_cnt[id_] if id_ in vocab_cnt else 0 for id_ in vocab_id]
        psql = PsqlQuery()
        psql.update(self.update_vocab_commentfreq_sql, {
            'id_': vocab_id,
            'commentfreq': freq
        })
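The frequency update aligns one count per requested vocabulary id, defaulting to 0. Since collections.Counter already returns 0 for missing keys, the conditional above is equivalent to plain indexing; a tiny runnable check:

import collections

vocab_id = [3, 5, 9]
qvocab_id = [3, 3, 5]  # one row per (vocabulary, comment) link
vocab_cnt = collections.Counter(qvocab_id)
print([vocab_cnt[id_] for id_ in vocab_id])  # [2, 1, 0]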
Example #19
    def insert_comment(self,
                       comments,
                       batch_field='comments',
                       url_field='url',
                       tokenized_field='comment_tokenized',
                       type_field='ctype',
                       floor_field='floor',
                       audience_field='audience'):

        batch_comment = []
        for batch in comments:
            batch_comment.extend(batch[batch_field])

        post_url = [batch[url_field] for batch in comments]
        if len(post_url) != len(set(post_url)):
            # duplicate post URLs within one batch are not allowed
            raise LengthNotMatchException

        num = len(batch_comment)
        qpost, pschema = self.query_post(post_url)

        tokenized = [
            ' '.join([k['word'] for k in cmt[tokenized_field]])
            for cmt in batch_comment
        ]
        grammar = [
            ' '.join([k['pos'] for k in cmt[tokenized_field]])
            for cmt in batch_comment
        ]

        ctype = [cmt[type_field] for cmt in batch_comment]
        floor = [cmt[floor_field] for cmt in batch_comment]

        audience = [cmt[audience_field] for cmt in batch_comment]
        self.insert_netizen(audience)

        tokenizer = [self.tokenizer] * num
        retrieval_count = [0] * num
        quality = [0.0 for _ in range(num)]

        post_id = []
        try:
            for idx, (batch, p) in enumerate(zip(comments, qpost)):
                post_id.extend([p[pschema['id']]] * len(batch[batch_field]))
        except Exception as err:
            self.logger.error(
                'Cannot insert comments because the post does not exist. url: {}'
                .format(post_url[idx]))
            raise err

        psql = PsqlQuery()
        comment_id = psql.upsert(self.insert_comment_sql, locals())

        return [cmt[0] for cmt in comment_id], batch_comment
Example #20
    def query_vocab_group_by_title_using_vocab(self, vocab_id, ex_title_id):
        psql = PsqlQuery()

        if not bool(ex_title_id):
            ex_title_id = [-1]

        title2vocab, schema = psql.query_all(
            self.query_vocab_group_by_title_using_vocab_id_sql, {
                'vid': tuple(vocab_id),
                'tid': tuple(ex_title_id)
            })

        return title2vocab, schema
Example #21
def query_vocab_id(batch_size=1000):
    sql = 'SELECT id FROM pttcorpus_vocabulary;'
    psql = PsqlQuery()
    vocabs = psql.query(sql)
    batch = []
    i = 0
    for v in vocabs:
        batch.append(v[0])
        i += 1
        if i >= batch_size:
            i = 0
            yield batch
            batch = []
    if batch:
        yield batch
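The same chunking pattern, stated generically over a plain iterable so the batch boundaries are easy to verify (a sketch, not part of the original module):

def batch_values(values, batch_size=3):
    batch = []
    for v in values:
        batch.append(v)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:  # flush the trailing partial batch
        yield batch

print(list(batch_values(range(7))))  # [[0, 1, 2], [3, 4, 5], [6]]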
Example #22
    def query_vocab_by_comment_id(self, comment_id):
        cmtid = list(set(comment_id))

        psql = PsqlQuery()
        vocab2comment, schema = psql.query_all(
            self.query_vocab2comment_by_cmtid_sql, (tuple(cmtid), ))

        vocab_id = list(
            {v2c[schema['vocabulary_id']]
             for v2c in vocab2comment})
        vocab, vschema = psql.query_all(self.query_vocab_by_id,
                                        (tuple(vocab_id), ))

        return vocab, vschema
Example #23
    def upsert_post(self,
                    batch_post,
                    title_raw_field='title_raw',
                    title_cleaned_field='title_cleaned',
                    comment_raw_field='comment_raw',
                    comment_cleaned_field='comment_cleaned',
                    tag_field='tag',
                    url_field='url',
                    spider_field='spider',
                    author_field='author',
                    publish_date_field='date'):

        post_num = len(batch_post)

        title_raw = [p[title_raw_field] for p in batch_post]
        title_cleaned = [p[title_cleaned_field] for p in batch_post]
        comment_raw = [p[comment_raw_field] for p in batch_post]
        comment_cleaned = [p[comment_cleaned_field] for p in batch_post]
        url = [p[url_field] for p in batch_post]
        if len(url) != len(set(url)):
            # duplicate post URLs within one batch are not allowed
            raise LengthNotMatchException

        tag = [p[tag_field] for p in batch_post]
        # strip the '(nickname)' suffix; assumes raw authors look like 'uid (nickname)'
        author = [
            p[author_field][:p[author_field].find('(')].strip()
            for p in batch_post
        ]
        self.insert_netizen(author)
        publish_date = [p[publish_date_field] for p in batch_post]
        spider = [p[spider_field] for p in batch_post]
        last_update = [datetime.now()] * post_num
        quality = [0.0 for _ in range(post_num)]
        update_count = [1] * post_num
        allow_update = [True] * post_num

        # qpost, schema = self.query_post(url)
        # for i, q in enumerate(qpost):
        #     if q:
        #         if len(q[schema['push']]) == len(push[i]):
        #             allow_update[i] = False
        post_id = []
        try:
            psql = PsqlQuery()
            post_id = psql.upsert(self.upsert_post_sql, locals())
        except Exception as e:
            self.logger.error(e)
            raise e

        return [p[0] for p in post_id]
Example #24
    def query_vocab_by_words(self, wds, relative_words=None):
        words = list(wds)
        if bool(relative_words):
            try:
                words += list(relative_words)
            except Exception as err:
                self.logger.warning(err)

        bundle = [(w.word, w.pos, self.tokenizer_tag) for w in words]

        psql = PsqlQuery()
        qvocab, vschema = psql.query_all(self.query_vocab_sql,
                                         (tuple(bundle), ))

        return qvocab, vschema
Example #25
    def insert_vocab_ignore_docfreq(self, words):

        distinct = list({(w.word, w.pos) for w in words})
        num = len(distinct)
        word = [d[0] for d in distinct]
        pos = [d[1] for d in distinct]
        tokenizer = [self.tokenizer for _ in range(num)]
        quality = [0.0 for _ in range(num)]
        titlefreq = [0 for _ in range(num)]
        contentfreq = [0 for _ in range(num)]
        commentfreq = [0 for _ in range(num)]
        stopword = [False for _ in range(num)]
        psql = PsqlQuery()
        vocab_bundle = psql.upsert(self.insert_vocab_sql, locals())
        returned_schema = dict(id=0, word=1, pos=2, tokenizer=3)
        return vocab_bundle, returned_schema
Example #26
    def insert_vocab_ignore_docfreq(self,
                                    batch,
                                    tokenized_field='title_tokenized'):
        allpairs = [pair for body in batch for pair in body[tokenized_field]]

        distinct = list({(pair['word'], pair['pos']) for pair in allpairs})
        num = len(distinct)
        word = [d[0] for d in distinct]
        pos = [d[1] for d in distinct]
        tokenizer = [self.tokenizer for _ in range(num)]
        quality = [0.0 for _ in range(num)]
        postfreq = [0 for _ in range(num)]
        # titlefreq = [0 for _ in range(num)]
        # contentfreq = [0 for _ in range(num)]
        commentfreq = [0 for _ in range(num)]
        stopword = [False for _ in range(num)]
        psql = PsqlQuery()
        vocab_bundle = psql.upsert(self.insert_vocab_sql, locals())
        returned_schema = dict(id=0, word=1, pos=2, tokenizer=3)
        return vocab_bundle, returned_schema
Example #27
def query_freq_sum():

    query_freq_sum_sql = '''
        SELECT SUM(postfreq) AS postfreq_sum,
               SUM(commentfreq) AS commentfreq_sum
        FROM pttcorpus_vocabulary;
    '''

    query_vocab_pairfreq_sum_sql = '''
        SELECT SUM(pxy) AS sum
        FROM pttcorpus_association;
    '''

    psql = PsqlQuery()
    postfreq_sum, commentfreq_sum = list(psql.query(query_freq_sum_sql))[0]
    logger.info('postfreq_sum:{}, commentfreq_sum:{}'.format(
        postfreq_sum, commentfreq_sum))
    vocab_pairfreq_sum = list(psql.query(query_vocab_pairfreq_sum_sql))[0][0]
    logger.info('vocab_pairfreq_sum:{}'.format(vocab_pairfreq_sum))

    return postfreq_sum, commentfreq_sum, vocab_pairfreq_sum
Example #28
    def upsert_vocab_pairfreq(self, vocab_id, batch_size):
        vocab_id = list(set(vocab_id))
        qpost, schema = self._query_all(self.query_post_by_vid_sql,
                                        (tuple(vocab_id), ))
        qpost_lists = [p2v[schema['post_id']] for p2v in qpost]
        if len(qpost_lists) > 0:
            cnter_result = self.vocab_pair_counter(qpost_lists)
            vocab_cnt = {
                vocab_pair: cnter_result[vocab_pair]
                for vocab_pair in cnter_result.keys()
                if int(vocab_pair[0]) in vocab_id
            }

            vocabt_all = []
            vocabc_all = []
            pxy_all = []
            tokenizer_all = ['jieba'] * len(vocab_cnt)
            for k, v in vocab_cnt.items():
                vocabt_all.append(int(k[0]))
                vocabc_all.append(int(k[1]))
                pxy_all.append(v)

            batch_vocabt = self.batch_list(vocabt_all, batch_size)
            batch_vocabc = self.batch_list(vocabc_all, batch_size)
            batch_pxy = self.batch_list(pxy_all, batch_size)
            batch_tokenizer = self.batch_list(tokenizer_all, batch_size)

            for vocabt_id, vocabc_id, tokenizer, pxy in zip(
                    batch_vocabt, batch_vocabc, batch_tokenizer, batch_pxy):
                psql = PsqlQuery()
                psql.upsert(
                    self.upsert_vocab_pairfreq_sql, {
                        'vocabt_id': vocabt_id,
                        'vocabc_id': vocabc_id,
                        'tokenizer': tokenizer,
                        'pxy': pxy
                    })
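self.batch_list, used here and in update_association, is not shown in this listing. A plausible chunking helper consistent with those call sites (an assumption, not the project's implementation):

def batch_list(values, batch_size):
    # Split a list into consecutive chunks of at most batch_size items.
    return [values[i:i + batch_size] for i in range(0, len(values), batch_size)]

print(batch_list([1, 2, 3, 4, 5], 2))  # [[1, 2], [3, 4], [5]]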
Example #29
def generate_random_post(ref):
    psql = PsqlQuery()

    posts = psql.query(query_random_post_sql)

    return [p[0] for p in posts][:len(ref)]
Example #30

def extract_words(comments):
    if not bool(comments):
        return []

    def extract(cmt):
        return [v for v in cmt.vocabs]

    return [extract(cmt) for cmt in comments]


if __name__ == '__main__':
    with open('eval0829.csv', 'w') as f:
        f.write('random, base, pweight\n')
    psql = PsqlQuery()
    posts = psql.query(query_post_sql)
    pschema = psql.schema

    valid_post = 0

    for idx, p in enumerate(posts):
        titles, tschema = psql.query_all(
            query_title_sql, dict(pid=p[pschema['id']], tok='jieba'))

        basic_retriever = RetrievalEvaluate(
            'jieba',
            excluded_post_ids=[p[pschema['id']]],
            logger_name='retrieve')

        pweight_retriever = RetrievalEvaluate(