Example #1
    def _upsert_cache(self, push):
        if bool(self.user):
            repeat = 0
            if bool(self.cache):
                if self.cache[
                        self.cschema['query']].strip() == self.query.strip():
                    repeat = self.cache[self.cschema['repeat']] + 1

            psql = PsqlQuery()
            data = {
                'user_id': self.user[self.uschema['id']],
                'query': self.query,
                'keyword': self.keyword,
                'reply': push,
                'time': self.event_time,
                'repeat': repeat,
                'post': self.post_ref,
                'push_num': len(self.push_pool),
                'tree_node': self.chat_tree_id
            }

            try:
                psql.upsert(self.upsert_chatcache_sql, data)
            except Exception as e:
                self.logger.error('Upsert ChatCache failed: {}'.format(e))
Example #2
    def upsert_post(self, batch_post):
        post_num = len(batch_post)
        
        title = [p['title'] for p in batch_post]
        tokenized = [p['title_vocab'] for p in batch_post]
        grammar = [p['title_grammar'] for p in batch_post]
        url = [p['url'] for p in batch_post]
        tag = [p['tag'] for p in batch_post]
        author = [p['author'] for p in batch_post]
        push = [p['push'] for p in batch_post]
        publish_date = [p['date'] for p in batch_post]
        spider = [self.spider_tag] * post_num
        last_update = [timezone.now()] * post_num
        update_count = [1] * post_num
        allow_update = [True] * post_num

        # qpost, schema = self.query_post(url)
        # for i, q in enumerate(qpost):
        #     if q:
        #         if len(q[schema['push']]) == len(push[i]):
        #             allow_update[i] = False
        try:
            psql = PsqlQuery()
            # locals() hands every parallel list above to the SQL by name.
            psql.upsert(self.upsert_post_sql, locals())
        except Exception as e:
            logger.error(e)
            raise e

        return url
Example #3
    def _query_vocab(self, w2v=False):
        vocab_name = [
            '--+--'.join([t.word, t.flag, self.default_tokenizer])
            for t in self.tok
        ]
        vocab_score = {name: 1.0 for name in vocab_name}

        # TODO: merge word2vec model here
        # ===============================
        if w2v and bool(Chat.w2v_model):
            try:
                w2v_query = [
                    '{}:{}'.format(word, flag)
                    for word, flag in zip(self.words, self.flags)
                    if flag[0] in ['v', 'n'] or flag == 'eng'
                ]
                if bool(w2v_query):
                    w2v_neighbor = Chat.w2v_model.most_similar(
                        positive=w2v_query, topn=min(3, len(w2v_query)))

                    # w[0] is 'word:flag'; appending the tokenizer and
                    # splitting on ':' rebuilds the '--+--' vocab name.
                    w2v_name = [
                        '--+--'.join('{}:{}'.format(
                            w[0], self.default_tokenizer).split(':'))
                        for w in w2v_neighbor
                    ]
                    w2v_score = [w[1] for w in w2v_neighbor]

                    for name, score in zip(w2v_name, w2v_score):
                        vocab_score[name] = score

                    vocab_name.extend(w2v_name)
            except Exception:
                # word2vec expansion is best-effort; fall back to the
                # exact query tokens on any failure.
                pass

        psql = PsqlQuery()
        qvocab = list(psql.query(self.query_vocab_sql, (tuple(vocab_name), )))

        vschema = psql.schema
        _tag_weight = {
            q[vschema['tag']]: Chat.tag_weight[q[vschema['tag']]]['weight']
            if q[vschema['tag']] in Chat.tag_weight else 1.0
            for q in qvocab
        }
        # ===============================
        self.vocab = [{
            'word':
            ':'.join([q[vschema['word']], q[vschema['tag']]]),
            'termweight':
            _tag_weight[q[vschema['tag']]] * vocab_score[q[vschema['name']]],
            'docfreq':
            q[vschema['doc_freq']]
        } for q in qvocab]

        self.vid = [
            q[vschema['id']] for q in qvocab if not (q[vschema['stopword']])
            and q[vschema['doc_freq']] < self.vocab_docfreq_th
        ]
Example #4
    def _update_vocab_docfreq(self, vocab_id):
        qvocab2post, schema = self._query_all(self.query_vocab2post, (tuple(vocab_id),))
        qvocab_id = [v2p[schema['vocabulary_id']] for v2p in qvocab2post]

        vocab_cnt = collections.Counter(qvocab_id)
        id_ = list(vocab_cnt.keys())
        freq = list(vocab_cnt.values())

        psql = PsqlQuery()
        psql.upsert(self.update_vocab_docfreq_sql, {'id_': id_, 'freq': freq})
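
To see how the doc-frequency update derives its inputs, here is a minimal
sketch of the counting step above; the link rows are made-up
(vocabulary_id, post_id) pairs, not real data:

    import collections

    links = [(7, 101), (7, 102), (9, 101), (7, 103)]
    vocab_cnt = collections.Counter(v_id for v_id, _ in links)
    id_ = list(vocab_cnt.keys())     # [7, 9]
    freq = list(vocab_cnt.values())  # [3, 1] -> new doc_freq per vocab id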
Example #5
    def _update_chattree(self):
        if bool(self.cache) and self.cache[self.cschema['tree_node']] > 0:
            try:
                psql = PsqlQuery()
                psql.upsert(
                    self.update_chattree_sql, {
                        'successor': self.chat_tree_id,
                        'id_': self.cache[self.cschema['tree_node']]
                    })
            except Exception as e:
                self.logger.error('Update ChatTree failed: {}'.format(e))
Example #6
    def _query_post(self, vid):

        _query_pid = list(PsqlQuery().query(self.query_vocab2post_sql,
                                            (tuple(vid), )))
        query_pid = [
            p[0] for p in _query_pid
            if p[0] != self.topic_post[self.topic_pschema['id']]
        ]
        psql = PsqlQuery()
        allpost = psql.query(self.query_post_sql, (tuple(query_pid), ))
        return allpost, psql.schema
Example #7
    def _get_user(self):
        user, schema = None, {}
        psql = PsqlQuery()
        user_ = list(
            psql.query(self.query_chatuser_sql, {
                'uid': self.uid,
                'platform': self.platform
            }))
        if bool(user_):
            user = user_[0]
            schema = psql.schema

        return user, schema
Example #8
    def _update_job_result(self, jobname, result):
        try:
            psql = PsqlQuery()
            update_joblog_result = '''
                UPDATE crawl_app_joblog
                SET result=%(result)s
                WHERE name = %(name)s;
            '''
            psql.upsert(update_joblog_result, {'name': jobname, 'result': result})

        except Exception as e:
            logger.error(e)
Example #9
    def upsert_vocab_ignore_docfreq(self, batch_post):
        allpairs = [pair for post in batch_post for pair in post['title_tok']]
        name = list({'--+--'.join([pair.word, pair.flag, self.tok_tag]) for pair in allpairs})
        num = len(name)
        groups = [nm.split('--+--') for nm in name]
        word = [g[0] for g in groups]
        tag = [g[1] for g in groups]
        tokenizer = [g[2] for g in groups]
        doc_freq = [-1 for g in groups]
        stopword = [False for g in groups]
        psql = PsqlQuery()
        # locals() hands the parallel lists above to the SQL by name.
        psql.upsert(self.upsert_vocab_sql, locals())

        return name
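
The vocabulary "name" used throughout these examples packs word, POS flag,
and tokenizer into a single key with the '--+--' separator. A minimal
round-trip, assuming no field contains the separator itself:

    word, flag, tokenizer = '天氣', 'n', 'jieba'
    name = '--+--'.join([word, flag, tokenizer])  # '天氣--+--n--+--jieba'
    assert name.split('--+--') == [word, flag, tokenizer]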
Example #10
    def _upsert_user(self, active=False, state=0):
        psql = PsqlQuery()
        data = {
            'platform': self.platform,
            'uid': self.uid,
            'idtype': self.idtype,
            'active': active,
            'state': state,
            'chat_count': 0
        }
        try:
            psql.upsert(self.upsert_chatuser_sql, data)
        except Exception as e:
            self.logger.error('Upsert ChatUser failed: {}'.format(e))
Example #11
    def query_oldpost_batch(self, batch_size=1000):
        psql = PsqlQuery()
        fetched = psql.query(self.query_post_sql, (self.fromdate, ))
        schema = psql.schema

        batch, i = [], 0
        for qpost in fetched:
            batch.append(qpost)
            i += 1
            if i >= batch_size:
                i = 0
                yield batch, schema
                batch = []

        if batch:  # flush the final, possibly short, batch
            yield batch, schema
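
query_oldpost_batch is a generator, so old posts can be processed without
loading the whole table at once. A hedged usage sketch (`cleaner` is a
hypothetical instance of the class these methods belong to):

    for batch, schema in cleaner.query_oldpost_batch(batch_size=500):
        cleaner.clean_oldpost(batch, schema)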
Example #12
    def _query_cache(self):
        cache, schema = None, {}
        psql = PsqlQuery()
        try:
            cache_ = list(
                psql.query(self.query_chatcache_sql,
                           (self.user[self.uschema['id']], )))
            if bool(cache_):
                cache = cache_[0]
                schema = psql.schema

        except Exception as e:
            self.logger.warning(e)

        return cache, schema
Example #13
    def _query_post(self):
        self.keyword = json.dumps(self.vocab,
                                  indent=4,
                                  ensure_ascii=False,
                                  sort_keys=True)
        self.logger.info(self.keyword)
        slack_log = '\n====================\nrelated keywords:\t' + '\t'.join(
            v['word'] for v in self.vocab)
        # Build the payload with json.dumps so tabs/newlines are escaped
        # into valid JSON.
        requests.post(self.SLACK_WEBHOOK,
                      headers={'Content-type': 'application/json'},
                      data=json.dumps({'text': slack_log}).encode('utf8'))

        # Keep only the post id (first column) of each vocab2post row.
        query_pid = [
            p[0] for p in PsqlQuery().query(self.query_vocab2post_sql,
                                            (tuple(self.vid), ))
        ]
        psql = PsqlQuery()
        self.allpost = psql.query(self.query_post_sql, (tuple(query_pid), ))
        self.pschema = psql.schema
Example #14
    def upsert_vocab2post(self, batch_post, vocab_name, post_url):
        qvocab, vschema = self.query_vocab(vocab_name)
        qpost, pschema = self.query_post(post_url)
        title_tok_name = [
            ['--+--'.join([k.word, k.flag, self.tok_tag])
             for k in p['title_tok']]
            for p in batch_post
        ]

        vocab2post = []
        for vocab in qvocab:
            post_id_with_vocab = [
                p[pschema['id']] for idx, p in enumerate(qpost)
                if vocab[vschema['name']] in title_tok_name[idx]
            ]
            vocab2post.append(
                [(vocab[vschema['id']], pid) for pid in post_id_with_vocab])

        flatten_vocab2post = [tup for v2p in vocab2post for tup in v2p]

        vocabulary_id = [v2p[0] for v2p in flatten_vocab2post]
        post_id = [v2p[1] for v2p in flatten_vocab2post]

        psql = PsqlQuery()
        psql.upsert(self.upsert_vocab2post_sql, {'vocabulary_id': vocabulary_id, 'post_id': post_id})

        self._update_vocab_docfreq(vocabulary_id)
Example #15
    def _insert_chattree(self, push):
        ancestor = -1
        if bool(self.cache):
            ancestor = self.cache[self.cschema['tree_node']]
        try:
            data = {
                'user_id': self.user[self.uschema['id']],
                'ancestor': ancestor,
                'query': self.query,
                'keyword': self.keyword,
                'reply': push,
                'time': self.event_time,
                'post': self.post_ref,
                'push_num': len(self.push_pool)
            }
            psql = PsqlQuery()
            self.chat_tree_id = psql.insert_with_col_return(
                self.insert_chattree_sql, data)
        except Exception as e:
            self.logger.error('Insert ChatTree failed: {}'.format(e))
Example #16
    def clean_oldpost(self, batch_post, pschema):
        post_id = [p[pschema['id']] for p in batch_post]

        vocab2post, v2pschema = self._query_all(
            self.query_vocab2post_sql_by_post, (tuple(post_id), ))

        v2p_id = [v2p[v2pschema['id']] for v2p in vocab2post]
        vocab_id = list(
            {v2p[v2pschema['vocabulary_id']]
             for v2p in vocab2post})

        psql = PsqlQuery()
        psql.delete(self.delete_vocab2post_sql, (tuple(v2p_id), ))
        psql.delete(self.delete_post_sql, (tuple(post_id), ))

        self._update_vocab_docfreq(vocab_id)
Example #17
def __pipeline(spider_tag):
    sp.call(SPIDER_UPDATE.format(spider_tag).split())
    r = sp.check_output(SPIDER_CRAWL.format(spider_tag).split())
    filename = '{}.jl'.format(r.decode('utf-8').strip())
    complete_filepath = '{}/{}'.format(SPIDER_OUTPUT_ROOT, filename)
    if not os.path.isfile(complete_filepath):
        logger.error(
            'okbot cronjob: crawled file: {} not found, cronjob abort.'.format(
                complete_filepath))
        return -1

    else:
        sp.call(SPIDER_INGEST.format(complete_filepath, 'jieba').split())
        logger.info(
            'okbot cronjob: crawl/ingest: {} finished.'.format(filename))


if __name__ == '__main__':
    psql = PsqlQuery()
    allspiders = psql.query('SELECT tag, freq FROM crawl_app_spider;')
    schema = psql.schema
    for spider in allspiders:
        tag = spider[schema['tag']]
        freq = spider[schema['freq']]
        if freq > 0:
            delta = (datetime.datetime.today() -
                     datetime.datetime(1970, 1, 1)).days
            if delta % freq == 0:
                __pipeline(tag)
#    __pipeline('Gossiping')
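
The scheduling check runs a spider whose freq is n on every day whose
count of days since the Unix epoch is divisible by n. A worked example
with a made-up date:

    import datetime

    delta = (datetime.datetime(2017, 6, 1) -
             datetime.datetime(1970, 1, 1)).days
    print(delta)           # 17318
    print(delta % 7 == 0)  # True: 17318 == 7 * 2474, so a freq-7 spider
                           # would crawl on 2017-06-01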
Example #18
    def _query_all(self, sql_string, data=None):
        psql = PsqlQuery()
        fetched = list(psql.query(sql_string, data))
        schema = psql.schema
        return fetched, schema
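
PsqlQuery's implementation is not shown in these examples, but its usage is
consistent: query() returns rows, .schema then maps column names to tuple
indices, and upsert()/delete() execute writes. A minimal sketch of that
assumed interface on top of psycopg2 (the DSN is a placeholder; only
query() and upsert() are shown, the other helpers follow the same pattern):

    import psycopg2

    class PsqlQuery:
        def __init__(self, dsn='dbname=okbot'):
            self.dsn = dsn
            self.schema = {}

        def query(self, sql, data=None):
            with psycopg2.connect(self.dsn) as conn:
                with conn.cursor() as cur:
                    cur.execute(sql, data)
                    # Column name -> index, e.g. row[psql.schema['tag']].
                    self.schema = {d[0]: i
                                   for i, d in enumerate(cur.description)}
                    return cur.fetchall()

        def upsert(self, sql, data=None):
            with psycopg2.connect(self.dsn) as conn:
                with conn.cursor() as cur:
                    cur.execute(sql, data)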
Example #19
    def draw(self):
        psql = PsqlQuery()
        self.topic_post = list(psql.query(self.random_query_sql))[0]
        self.topic_pschema = psql.schema
Example #20
    def _query_vocab(self,
                     tokenizer='jieba',
                     w2v_model=None,
                     jiebatag_weight=None):
        jiebatag_weight = jiebatag_weight or {}  # avoid a mutable default
        words = self.topic_post[self.topic_pschema['tokenized']].split()
        flags = self.topic_post[self.topic_pschema['grammar']].split()

        # self.tok, self.words, self.flags = Tokenizer(tokenizer).cut(self.post[self.pschema['title']])

        vocab_name = [
            '--+--'.join([w, f, tokenizer]) for w, f in zip(words, flags)
        ]
        vocab_score = {name: 1.0 for name in vocab_name}

        # Merge word2vec model here
        # ===============================
        if bool(w2v_model):
            try:
                w2v_query = [
                    '{}:{}'.format(w, f) for w, f in zip(words, flags)
                    if f[0] in ['v', 'n'] or f in ['eng']
                ]
                if bool(w2v_query):
                    w2v_neighbor = w2v_model.most_similar(positive=w2v_query,
                                                          topn=min(
                                                              3,
                                                              len(w2v_query)))

                    w2v_name = [
                        '--+--'.join('{}:{}'.format(w[0],
                                                    tokenizer).split(':'))
                        for w in w2v_neighbor
                    ]
                    w2v_score = [w[1] for w in w2v_neighbor]

                    for name, score in zip(w2v_name, w2v_score):
                        vocab_score[name] = score

                    vocab_name.extend(w2v_name)
            except Exception:
                self.logger.warning('word2vec query failed.')

        psql = PsqlQuery()
        qvocab = list(psql.query(self.query_vocab_sql, (tuple(vocab_name), )))

        vschema = psql.schema
        _tag_weight = {
            q[vschema['tag']]: jiebatag_weight[q[vschema['tag']]]['weight']
            if q[vschema['tag']] in jiebatag_weight else 1.0
            for q in qvocab
        }
        # ===============================

        vocab = [{
            'word':
            ':'.join([q[vschema['word']], q[vschema['tag']]]),
            'termweight':
            _tag_weight[q[vschema['tag']]] * vocab_score[q[vschema['name']]],
            'docfreq':
            q[vschema['doc_freq']]
        } for q in qvocab]

        # keyword = json.dumps(vocab, indent=4, ensure_ascii=False, sort_keys=True)
        # self.logger.info(keyword)

        vid = [
            q[vschema['id']] for q in qvocab if not (q[vschema['stopword']])
            and q[vschema['doc_freq']] < self.vocab_docfreq_th
        ]

        return vocab, vid
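
The termweight above multiplies a per-POS-tag weight by the vocabulary
score (1.0 for exact query tokens, the cosine similarity for word2vec
neighbors). A worked micro-example with made-up numbers:

    tag_weight = {'n': {'weight': 2.0}}
    vocab_score = {'天氣--+--n--+--jieba': 0.83}
    name, tag = '天氣--+--n--+--jieba', 'n'
    termweight = tag_weight[tag]['weight'] * vocab_score[name]
    print(termweight)  # 1.66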