Example #1
    def upsert_vocab2comment(self,
                             batch_comment,
                             comment_id,
                             vocab_bundle,
                             vschema,
                             tokenized_field='comment_tokenized'):

        tokenized = [[(k['word'], k['pos'], self.tokenizer)
                      for k in p[tokenized_field]] for p in batch_comment]
        vocab2comment = []

        for vocab in vocab_bundle:
            vtuple = (vocab[vschema['word']], vocab[vschema['pos']],
                      vocab[vschema['tokenizer']])
            comment_id_with_vocab = [
                cmt for idx, cmt in enumerate(comment_id)
                if vtuple in tokenized[idx]
            ]
            vocab2comment.append([(vocab[vschema['id']], cid)
                                  for cid in comment_id_with_vocab])

        flatten_vocab2cmt = [tup for v2c in vocab2comment for tup in v2c]

        vocabulary_id = [v2c[0] for v2c in flatten_vocab2cmt]
        cmt_id = [v2c[1] for v2c in flatten_vocab2cmt]

        psql = PsqlQuery()
        psql.upsert(self.upsert_vocab2comment_sql, {
            'vocabulary_id': vocabulary_id,
            'comment_id': cmt_id
        })

        return vocabulary_id
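
This and the following examples all funnel their column lists through PsqlQuery.upsert, passing an SQL template plus either a dict or locals() keyed by parameter name. PsqlQuery itself is project code that is not shown in these snippets; the following is only a hypothetical sketch of such a wrapper on top of psycopg2, assuming the templates use named %(param)s placeholders and may end in a RETURNING clause:

import psycopg2


class PsqlQuerySketch(object):
    """Hypothetical stand-in for the project's PsqlQuery helper."""

    def __init__(self, dsn='dbname=example'):
        self.dsn = dsn

    def upsert(self, sql, parameters):
        # Bind the named parameters (plain Python lists are adapted to
        # Postgres arrays) and return RETURNING rows when there are any.
        with psycopg2.connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.execute(sql, parameters)
                try:
                    return cur.fetchall()
                except psycopg2.ProgrammingError:
                    return []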
Example #2
    def upsert_vocab2post(self,
                          batch_post,
                          post_id,
                          vocab_bundle,
                          vschema,
                          tokenized_field='title_tokenized'):
        tokenized = [[(k['word'], k['pos'], self.tokenizer)
                      for k in p[tokenized_field]] for p in batch_post]

        vocab2post = []
        for vocab in vocab_bundle:
            vtuple = (vocab[vschema['word']], vocab[vschema['pos']],
                      vocab[vschema['tokenizer']])
            post_id_with_vocab = [
                p for idx, p in enumerate(post_id) if vtuple in tokenized[idx]
            ]
            vocab2post.append([(vocab[vschema['id']], pid)
                               for pid in post_id_with_vocab])

        flatten_vocab2post = [tup for v2p in vocab2post for tup in v2p]

        vocabulary_id = [v2p[0] for v2p in flatten_vocab2post]
        flatten_post_id = [v2p[1] for v2p in flatten_vocab2post]

        psql = PsqlQuery()
        psql.upsert(self.upsert_vocab2post_sql, {
            'vocabulary_id': vocabulary_id,
            'post_id': flatten_post_id
        })

        return vocabulary_id
Example #3
    def insert_title(self,
                     batch_post,
                     post_id,
                     tokenized_field='title_tokenized',
                     type_field='ctype'):

        num = len(batch_post)
        # qpost, pschema = self.query_post(post_url)
        tokenized = [
            ' '.join([k['word'] for k in p[tokenized_field]])
            for p in batch_post
        ]
        grammar = [
            ' '.join([k['pos'] for k in p[tokenized_field]])
            for p in batch_post
        ]
        # post_id = [p[pschema['id']] for p in post_bundle]
        ctype = [p[type_field] for p in batch_post]
        tokenizer = [self.tokenizer] * num
        retrieval_count = [0] * num
        quality = [0.0 for _ in range(num)]

        psql = PsqlQuery()
        title_id = psql.upsert(self.insert_title_sql, locals())

        return [t[0] for t in title_id]
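
Because insert_title passes locals(), the lists built above (tokenized, grammar, ctype, tokenizer, retrieval_count, quality) and the post_id argument are all bound to the SQL template by name. A hedged usage sketch with made-up data; ingester stands in for whatever object defines the method in the original project:

# Hypothetical call only; field names follow the defaults above.
batch_post = [{
    'title_tokenized': [{'word': 'good', 'pos': 'ADJ'},
                        {'word': 'movie', 'pos': 'NOUN'}],
    'ctype': 'title',
}]
title_ids = ingester.insert_title(batch_post, post_id=[42])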
Example #4
    def update_title_quality(self, id_to_update, quality):
        psql = PsqlQuery()

        return psql.upsert(self.update_title_quality_sql, {
            'id_': id_to_update,
            'quality': quality
        })
Example #5
def upsert_post(batch_post):
    timestamp = [post['timestamp'] for post in batch_post]
    _title = [post['title'] for post in batch_post]
    _url = [post['url'] for post in batch_post]
    _author = [post['author'] for post in batch_post]
    _content = [post['content'] for post in batch_post]
    _comment = [post['comment'] for post in batch_post]
    comment_len = [len(cmt) for cmt in _comment]
    sorted_idx = np.argsort(comment_len)[::-1]
    title, url, author, content, comment, publish_date = [], [], [], [], [], []

    for idx in sorted_idx:
        if _url[idx] not in url:
            url.append(_url[idx])
            title.append(_title[idx])
            author.append(_author[idx])
            content.append(_content[idx])
            comment.append(_comment[idx])
            publish_date.append(datetime.fromtimestamp(timestamp[idx]))

    post_id = []
    try:
        psql = PsqlQuery()
        post_id = psql.upsert(upsert_post_sql, locals())
    except Exception as e:
        oklogger.logger.error(e)
        oklogger.logger.error(title)
        raise e

    return post_id
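
The argsort over comment counts means that when the same URL was crawled more than once, only the crawl with the most comments survives deduplication. A self-contained illustration of that idea with toy data:

import numpy as np

# Toy data: URL 'a' was crawled twice, the second time with more comments.
urls = ['a', 'b', 'a']
comments = [['c1'], ['c1', 'c2'], ['c1', 'c2', 'c3']]

order = np.argsort([len(c) for c in comments])[::-1]  # most comments first
seen, kept = set(), []
for i in order:
    if urls[i] not in seen:
        seen.add(urls[i])
        kept.append(i)

print(kept)  # [2, 1] -- the richer crawl of 'a' wins over index 0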
Example #6
    def insert_netizen(self, raw_name):
        name = list(set(raw_name))
        num = len(name)
        quality = [0.0 for _ in range(num)]
        posts = [0 for _ in range(num)]
        comments = [0 for _ in range(num)]
        psql = PsqlQuery()
        ids = psql.upsert(self.insert_netizen_sql, locals())
        return [i[0] for i in ids]
Example #7
    def insert_comment(self,
                       comments,
                       batch_field='comments',
                       url_field='url',
                       tokenized_field='comment_tokenized',
                       type_field='ctype',
                       floor_field='floor',
                       audience_field='audience'):

        batch_comment = []
        for batch in comments:
            batch_comment.extend(batch[batch_field])

        post_url = [batch['url'] for batch in comments]
        if len(post_url) != len(set(post_url)):
            raise LengthNotMatchException

        num = len(batch_comment)
        qpost, pschema = self.query_post(post_url)

        tokenized = [
            ' '.join([k['word'] for k in cmt[tokenized_field]])
            for cmt in batch_comment
        ]
        grammar = [
            ' '.join([k['pos'] for k in cmt[tokenized_field]])
            for cmt in batch_comment
        ]

        ctype = [cmt[type_field] for cmt in batch_comment]
        floor = [cmt[floor_field] for cmt in batch_comment]

        audience = [cmt[audience_field] for cmt in batch_comment]
        self.insert_netizen(audience)

        tokenizer = [self.tokenizer] * num
        retrieval_count = [0] * num
        quality = [0.0 for _ in range(num)]

        post_id = []
        try:
            for idx, (batch, p) in enumerate(zip(comments, qpost)):
                post_id.extend([p[pschema['id']]] * len(batch[batch_field]))
        except Exception as err:
            self.logger.error(
                'Cannot insert comments because the corresponding post does not exist. url: {}'
                .format(post_url[idx]))
            raise err

        psql = PsqlQuery()
        comment_id = psql.upsert(self.insert_comment_sql, locals())

        return [cmt[0] for cmt in comment_id], batch_comment
Example #8
    def upsert_vocab_pairfreq(self, vocab_id, batch_size):
        vocab_id = list(set(vocab_id))
        qpost, schema = self._query_all(self.query_post_by_vid_sql,
                                        (tuple(vocab_id), ))
        qpost_lists = [p2v[schema['post_id']] for p2v in qpost]
        if len(qpost_lists) > 0:
            cnter_result = self.vocab_pair_counter(qpost_lists)
            vocab_cnt = {
                vocab_pair: cnter_result[vocab_pair]
                for vocab_pair in cnter_result.keys()
                if int(vocab_pair[0]) in vocab_id
            }

            vocabt_all = []
            vocabc_all = []
            pxy_all = []
            tokenizer_all = ['jieba'] * len(vocab_cnt)
            for k, v in vocab_cnt.items():
                vocabt_all.append(int(k[0]))
                vocabc_all.append(int(k[1]))
                pxy_all.append(v)

            batch_vocabt = self.batch_list(vocabt_all, batch_size)
            batch_vocabc = self.batch_list(vocabc_all, batch_size)
            batch_pxy = self.batch_list(pxy_all, batch_size)
            batch_tokenizer = self.batch_list(tokenizer_all, batch_size)

            for vocabt_id, vocabc_id, tokenizer, pxy in zip(
                    batch_vocabt, batch_vocabc, batch_tokenizer, batch_pxy):
                psql = PsqlQuery()
                psql.upsert(
                    self.upsert_vocab_pairfreq_sql, {
                        'vocabt_id': vocabt_id,
                        'vocabc_id': vocabc_id,
                        'tokenizer': tokenizer,
                        'pxy': pxy
                    })
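
batch_list and vocab_pair_counter are project helpers that are not part of this snippet. batch_list is used purely as a chunker over the accumulated id lists; one plausible sketch (an assumption, not the project's actual implementation):

def batch_list(items, batch_size):
    # Yield successive slices of at most batch_size elements, so callers can
    # zip the chunked columns together batch by batch as above.
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]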
Example #9
    def upsert_post(self,
                    batch_post,
                    title_raw_field='title_raw',
                    title_cleaned_field='title_cleaned',
                    comment_raw_field='comment_raw',
                    comment_cleaned_field='comment_cleaned',
                    tag_field='tag',
                    url_field='url',
                    spider_field='spider',
                    author_field='author',
                    publish_date_field='date'):

        post_num = len(batch_post)

        title_raw = [p[title_raw_field] for p in batch_post]
        title_cleaned = [p[title_cleaned_field] for p in batch_post]
        comment_raw = [p[comment_raw_field] for p in batch_post]
        comment_cleaned = [p[comment_cleaned_field] for p in batch_post]
        url = [p[url_field] for p in batch_post]
        if len(url) != len(set(url)):
            raise LengthNotMatchException

        tag = [p[tag_field] for p in batch_post]
        author = [
            p[author_field][:p[author_field].find('(')].strip()
            for p in batch_post
        ]
        self.insert_netizen(author)
        publish_date = [p[publish_date_field] for p in batch_post]
        spider = [p[spider_field] for p in batch_post]
        last_update = [datetime.now()] * post_num
        quality = [0.0 for _ in range(post_num)]
        update_count = [1] * post_num
        allow_update = [True] * post_num

        # qpost, schema = self.query_post(url)
        # for i, q in enumerate(qpost):
        #     if q:
        #         if len(q[schema['push']]) == len(push[i]):
        #             allow_update[i] = False
        post_id = []
        try:
            psql = PsqlQuery()
            post_id = psql.upsert(self.upsert_post_sql, locals())
        except Exception as e:
            self.logger.error(e)
            raise e

        return [p[0] for p in post_id]
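
Note that str.find returns -1 when the author field contains no '(', in which case the slice above silently drops the name's final character. A guarded variant, if that matters for the input data, could look like:

def strip_parenthetical(author):
    # Hypothetical helper: cut at '(' only when it is actually present.
    cut = author.find('(')
    return (author[:cut] if cut != -1 else author).strip()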
Example #10
    def insert_vocab_ignore_docfreq(self, words):

        distinct = list({(w.word, w.pos) for w in words})
        num = len(distinct)
        word = [d[0] for d in distinct]
        pos = [d[1] for d in distinct]
        tokenizer = [self.tokenizer for _ in range(num)]
        quality = [0.0 for _ in range(num)]
        titlefreq = [0 for _ in range(num)]
        contentfreq = [0 for _ in range(num)]
        commentfreq = [0 for _ in range(num)]
        stopword = [False for _ in range(num)]
        psql = PsqlQuery()
        vocab_bundle = psql.upsert(self.insert_vocab_sql, locals())
        returned_schema = dict(id=0, word=1, pos=2, tokenizer=3)
        return vocab_bundle, returned_schema
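
The returned_schema dict maps positions in the RETURNING tuples to column names, which is how the upsert_vocab2post and upsert_vocab2comment examples above index into vocab_bundle. A hedged usage sketch; ingester and words are stand-in names, with words being a list of objects exposing word and pos attributes as in the snippet:

vocab_bundle, vschema = ingester.insert_vocab_ignore_docfreq(words)
vocab_ids = [v[vschema['id']] for v in vocab_bundle]
first_word = vocab_bundle[0][vschema['word']]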
Example #11
    def insert_vocab_ignore_docfreq(self,
                                    batch,
                                    tokenized_field='title_tokenized'):
        allpairs = [pair for body in batch for pair in body[tokenized_field]]

        distinct = list({(pair['word'], pair['pos']) for pair in allpairs})
        num = len(distinct)
        word = [d[0] for d in distinct]
        pos = [d[1] for d in distinct]
        tokenizer = [self.tokenizer for _ in range(num)]
        quality = [0.0 for _ in range(num)]
        postfreq = [0 for _ in range(num)]
        # titlefreq = [0 for _ in range(num)]
        # contentfreq = [0 for _ in range(num)]
        commentfreq = [0 for _ in range(num)]
        stopword = [False for _ in range(num)]
        psql = PsqlQuery()
        vocab_bundle = psql.upsert(self.insert_vocab_sql, locals())
        returned_schema = dict(id=0, word=1, pos=2, tokenizer=3)
        return vocab_bundle, returned_schema
Example #12
    def update_vocab_quality(self, word, quality):
        psql = PsqlQuery()
        return psql.upsert(self.update_vocab_quality_sql, {
            'word': word,
            'quality': quality
        })