def _upsert_cache(self, push):
    if bool(self.user):
        repeat = 0
        if bool(self.cache):
            if self.cache[self.cschema['query']].strip() == self.query.strip():
                repeat = self.cache[self.cschema['repeat']] + 1

        psql = PsqlQuery()
        data = {
            'user_id': self.user[self.uschema['id']],
            'query': self.query,
            'keyword': self.keyword,
            'reply': push,
            'time': self.event_time,
            'repeat': repeat,
            'post': self.post_ref,
            'push_num': len(self.push_pool),
            'tree_node': self.chat_tree_id
        }
        try:
            psql.upsert(self.upsert_chatcache_sql, data)
        except Exception as e:
            self.logger.error('Upsert ChatCache failed: {}'.format(e))

def upsert_post(self, batch_post):
    post_num = len(batch_post)
    title = [p['title'] for p in batch_post]
    tokenized = [p['title_vocab'] for p in batch_post]
    grammar = [p['title_grammar'] for p in batch_post]
    url = [p['url'] for p in batch_post]
    tag = [p['tag'] for p in batch_post]
    author = [p['author'] for p in batch_post]
    push = [p['push'] for p in batch_post]
    publish_date = [p['date'] for p in batch_post]
    spider = [self.spider_tag] * post_num
    last_update = [timezone.now()] * post_num
    update_count = [1] * post_num
    allow_update = [True] * post_num

    # qpost, schema = self.query_post(url)
    # for i, q in enumerate(qpost):
    #     if q:
    #         if len(q[schema['push']]) == len(push[i]):
    #             allow_update[i] = False

    try:
        psql = PsqlQuery()
        # The per-column lists above are passed by name via locals() and
        # matched to the named placeholders in upsert_post_sql.
        psql.upsert(self.upsert_post_sql, locals())
    except Exception as e:
        logger.error(e)
        raise e
    return url

def _query_vocab(self, w2v=False):
    vocab_name = [
        '--+--'.join([t.word, t.flag, self.default_tokenizer])
        for t in self.tok
    ]
    vocab_score = {name: 1.0 for name in vocab_name}

    # TODO: merge word2vec model here
    # ===============================
    if w2v and bool(Chat.w2v_model):
        try:
            w2v_query = [
                '{}:{}'.format(word, flag)
                for word, flag in zip(self.words, self.flags)
                if flag[0] in ['v', 'n'] or flag == 'eng'
            ]
            if bool(w2v_query):
                w2v_neighbor = Chat.w2v_model.most_similar(
                    positive=w2v_query, topn=min(3, len(w2v_query)))
                w2v_name = [
                    '--+--'.join('{}:{}'.format(w[0], self.default_tokenizer).split(':'))
                    for w in w2v_neighbor
                ]
                w2v_score = [w[1] for w in w2v_neighbor]
                for name, score in zip(w2v_name, w2v_score):
                    vocab_score[name] = score
                vocab_name.extend(w2v_name)
        except Exception:
            # word2vec expansion is best-effort; fall back to the raw tokens.
            pass

    psql = PsqlQuery()
    qvocab = list(psql.query(self.query_vocab_sql, (tuple(vocab_name), )))
    vschema = psql.schema
    _tag_weight = {
        q[vschema['tag']]: Chat.tag_weight[q[vschema['tag']]]['weight']
        if q[vschema['tag']] in Chat.tag_weight else 1.0
        for q in qvocab
    }
    # ===============================
    self.vocab = [{
        'word': ':'.join([q[vschema['word']], q[vschema['tag']]]),
        'termweight': _tag_weight[q[vschema['tag']]] * vocab_score[q[vschema['name']]],
        'docfreq': q[vschema['doc_freq']]
    } for q in qvocab]

    self.vid = [
        q[vschema['id']] for q in qvocab
        if not q[vschema['stopword']] and q[vschema['doc_freq']] < self.vocab_docfreq_th
    ]

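# Worked example (illustrative values only, not from the original source): for a
# query token stored as '貓--+--n--+--jieba', with a configured tag weight of 2.0
# for 'n' and a word2vec similarity score of 0.8, the entry built above would be
#   {'word': '貓:n', 'termweight': 2.0 * 0.8 == 1.6, 'docfreq': <doc_freq from DB>}
# Tags without an entry in Chat.tag_weight fall back to weight 1.0, and direct
# query tokens (not word2vec neighbors) keep vocab_score 1.0.
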
def _update_vocab_docfreq(self, vocab_id):
    qvocab2post, schema = self._query_all(self.query_vocab2post, (tuple(vocab_id), ))
    qvocab_id = [v2p[schema['vocabulary_id']] for v2p in qvocab2post]
    vocab_cnt = collections.Counter(qvocab_id)
    id_ = list(vocab_cnt.keys())
    freq = list(vocab_cnt.values())
    psql = PsqlQuery()
    psql.upsert(self.update_vocab_docfreq_sql, {'id_': id_, 'freq': freq})

def _update_chattree(self):
    if bool(self.cache) and self.cache[self.cschema['tree_node']] > 0:
        try:
            psql = PsqlQuery()
            psql.upsert(
                self.update_chattree_sql, {
                    'successor': self.chat_tree_id,
                    'id_': self.cache[self.cschema['tree_node']]
                })
        except Exception as e:
            self.logger.error('Update ChatTree failed: {}'.format(e))

def _query_post(self, vid):
    _query_pid = list(PsqlQuery().query(self.query_vocab2post_sql, (tuple(vid), )))
    query_pid = [
        p[0] for p in _query_pid
        if p[0] != self.topic_post[self.topic_pschema['id']]
    ]
    psql = PsqlQuery()
    allpost = psql.query(self.query_post_sql, (tuple(query_pid), ))
    return allpost, psql.schema

def _get_user(self):
    user, schema = None, {}
    psql = PsqlQuery()
    user_ = list(
        psql.query(self.query_chatuser_sql, {
            'uid': self.uid,
            'platform': self.platform
        }))
    if bool(user_):
        user = user_[0]
        schema = psql.schema
    return user, schema

def _update_job_result(self, jobname, result):
    try:
        psql = PsqlQuery()
        update_joblog_result = '''
            UPDATE crawl_app_joblog SET result=%(result)s WHERE name = %(name)s;
        '''
        psql.upsert(update_joblog_result, {'name': jobname, 'result': result})
    except Exception as e:
        logger.error(e)

def upsert_vocab_ignore_docfreq(self, batch_post):
    allpairs = [pair for post in batch_post for pair in post['title_tok']]
    name = list({'--+--'.join([pair.word, pair.flag, self.tok_tag]) for pair in allpairs})
    num = len(name)
    groups = [nm.split('--+--') for nm in name]
    word = [g[0] for g in groups]
    tag = [g[1] for g in groups]
    tokenizer = [g[2] for g in groups]
    doc_freq = [-1 for g in groups]
    stopword = [False for g in groups]
    psql = PsqlQuery()
    # The per-column lists above are passed by name via locals() and matched to
    # the named placeholders in upsert_vocab_sql.
    psql.upsert(self.upsert_vocab_sql, locals())
    return name

def _upsert_user(self, active=False, state=0):
    psql = PsqlQuery()
    data = {
        'platform': self.platform,
        'uid': self.uid,
        'idtype': self.idtype,
        'active': active,
        'state': state,
        'chat_count': 0
    }
    try:
        psql.upsert(self.upsert_chatuser_sql, data)
    except Exception as e:
        self.logger.error('Upsert ChatUser failed: {}'.format(e))

def query_oldpost_batch(self, batch_size=1000):
    psql = PsqlQuery()
    fetched = psql.query(self.query_post_sql, (self.fromdate, ))
    schema = psql.schema
    batch, i = [], 0
    for qpost in fetched:
        batch.append(qpost)
        i += 1
        if i >= batch_size:
            i = 0
            yield batch, schema
            batch = []
    # Yield the trailing partial batch (it may be empty when the total row
    # count is an exact multiple of batch_size).
    yield batch, schema

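# Usage sketch (not part of the original module): how the batch generator is
# typically consumed together with clean_oldpost below. The `job` instance name
# and the decision to delete every fetched batch are assumptions for illustration.
#
#   job = OldPostCleaner()                # hypothetical object exposing both methods
#   for batch, pschema in job.query_oldpost_batch(batch_size=500):
#       if batch:                         # the final yield may be an empty list
#           job.clean_oldpost(batch, pschema)
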
def _query_cache(self):
    cache, schema = None, {}
    psql = PsqlQuery()
    try:
        cache_ = list(
            psql.query(self.query_chatcache_sql,
                       (self.user[self.uschema['id']], )))
        if bool(cache_):
            cache = cache_[0]
            schema = psql.schema
    except Exception as e:
        self.logger.warning(e)
    return cache, schema

def _query_post(self):
    self.keyword = json.dumps(self.vocab, indent=4, ensure_ascii=False, sort_keys=True)
    self.logger.info(self.keyword)
    slack_log = '\n====================\nrelated keywords:\t' + '\t'.join(
        v['word'] for v in self.vocab)
    # Build the Slack payload with json.dumps so the keywords are escaped properly.
    data = json.dumps({'text': slack_log})
    requests.post(self.SLACK_WEBHOOK,
                  headers={'Content-type': 'application/json'},
                  data=data.encode('utf8'))

    # query_vocab2post_sql yields one row per matched post; keep only the
    # post-id column before building the IN clause for query_post_sql.
    query_pid = [
        p[0] for p in PsqlQuery().query(self.query_vocab2post_sql, (tuple(self.vid), ))
    ]
    psql = PsqlQuery()
    self.allpost = psql.query(self.query_post_sql, (tuple(query_pid), ))
    self.pschema = psql.schema

def upsert_vocab2post(self, batch_post, vocab_name, post_url):
    qvocab, vschema = self.query_vocab(vocab_name)
    qpost, pschema = self.query_post(post_url)
    title_tok_name = [
        ['--+--'.join([k.word, k.flag, self.tok_tag]) for k in p['title_tok']]
        for p in batch_post
    ]
    vocab2post = []
    for vocab in qvocab:
        post_id_with_vocab = [
            p[pschema['id']] for idx, p in enumerate(qpost)
            if vocab[vschema['name']] in title_tok_name[idx]
        ]
        vocab2post.append([(vocab[vschema['id']], pid) for pid in post_id_with_vocab])

    flatten_vocab2post = [tup for v2p in vocab2post for tup in v2p]
    vocabulary_id = [v2p[0] for v2p in flatten_vocab2post]
    post_id = [v2p[1] for v2p in flatten_vocab2post]
    psql = PsqlQuery()
    psql.upsert(self.upsert_vocab2post_sql,
                {'vocabulary_id': vocabulary_id, 'post_id': post_id})
    self._update_vocab_docfreq(vocabulary_id)

def _insert_chattree(self, push):
    ancestor = -1
    if bool(self.cache):
        ancestor = self.cache[self.cschema['tree_node']]
    try:
        data = {
            'user_id': self.user[self.uschema['id']],
            'ancestor': ancestor,
            'query': self.query,
            'keyword': self.keyword,
            'reply': push,
            'time': self.event_time,
            'post': self.post_ref,
            'push_num': len(self.push_pool)
        }
        psql = PsqlQuery()
        self.chat_tree_id = psql.insert_with_col_return(self.insert_chattree_sql, data)
    except Exception as e:
        self.logger.error('Insert ChatTree failed: {}'.format(e))

def clean_oldpost(self, batch_post, pschema):
    post_id = [p[pschema['id']] for p in batch_post]
    vocab2post, v2pschema = self._query_all(
        self.query_vocab2post_sql_by_post, (tuple(post_id), ))
    v2p_id = [v2p[v2pschema['id']] for v2p in vocab2post]
    vocab_id = list({v2p[v2pschema['vocabulary_id']] for v2p in vocab2post})
    psql = PsqlQuery()
    psql.delete(self.delete_vocab2post_sql, (tuple(v2p_id), ))
    psql.delete(self.delete_post_sql, (tuple(post_id), ))
    self._update_vocab_docfreq(vocab_id)

def __pipeline(spider_tag):
    sp.call(SPIDER_UPDATE.format(spider_tag).split())
    r = sp.check_output(SPIDER_CRAWL.format(spider_tag).split())
    filename = '{}.jl'.format(r.decode('utf-8').strip())
    complete_filepath = '{}/{}'.format(SPIDER_OUTPUT_ROOT, filename)
    if not os.path.isfile(complete_filepath):
        logger.error(
            'okbot cronjob: crawled file: {} not found, cronjob abort.'.format(
                complete_filepath))
        return -1
    else:
        sp.call(SPIDER_INGEST.format(complete_filepath, 'jieba').split())
        logger.info('okbot cronjob: crawl/ingest: {} finished.'.format(filename))


if __name__ == '__main__':
    psql = PsqlQuery()
    allspiders = psql.query('SELECT tag, freq FROM crawl_app_spider;')
    schema = psql.schema
    for spider in allspiders:
        tag = spider[schema['tag']]
        freq = spider[schema['freq']]
        if freq > 0:
            # Run each spider every `freq` days, keyed on the day count since the epoch.
            delta = (datetime.datetime.today() - datetime.datetime(1970, 1, 1)).days
            if delta % freq == 0:
                __pipeline(tag)
    # __pipeline('Gossiping')

def _query_all(self, sql_string, data=None):
    psql = PsqlQuery()
    fetched = list(psql.query(sql_string, data))
    schema = psql.schema
    return fetched, schema

def draw(self):
    psql = PsqlQuery()
    self.topic_post = list(psql.query(self.random_query_sql))[0]
    self.topic_pschema = psql.schema

def _query_vocab(self, tokenizer='jieba', w2v_model=None, jiebatag_weight=None):
    # Avoid a mutable default argument; an empty mapping means "no tag weights".
    jiebatag_weight = jiebatag_weight or {}
    words = self.topic_post[self.topic_pschema['tokenized']].split()
    flags = self.topic_post[self.topic_pschema['grammar']].split()
    # self.tok, self.words, self.flags = Tokenizer(tokenizer).cut(self.post[self.pschema['title']])
    vocab_name = [
        '--+--'.join([w, f, tokenizer]) for w, f in zip(words, flags)
    ]
    vocab_score = {name: 1.0 for name in vocab_name}

    # Merge word2vec model here
    # ===============================
    if bool(w2v_model):
        try:
            w2v_query = [
                '{}:{}'.format(w, f) for w, f in zip(words, flags)
                if f[0] in ['v', 'n'] or f in ['eng']
            ]
            if bool(w2v_query):
                w2v_neighbor = w2v_model.most_similar(
                    positive=w2v_query, topn=min(3, len(w2v_query)))
                w2v_name = [
                    '--+--'.join('{}:{}'.format(w[0], tokenizer).split(':'))
                    for w in w2v_neighbor
                ]
                w2v_score = [w[1] for w in w2v_neighbor]
                for name, score in zip(w2v_name, w2v_score):
                    vocab_score[name] = score
                vocab_name.extend(w2v_name)
        except Exception:
            self.logger.warning('word2vec query failed.')

    psql = PsqlQuery()
    qvocab = list(psql.query(self.query_vocab_sql, (tuple(vocab_name), )))
    vschema = psql.schema
    _tag_weight = {
        q[vschema['tag']]: jiebatag_weight[q[vschema['tag']]]['weight']
        if q[vschema['tag']] in jiebatag_weight else 1.0
        for q in qvocab
    }
    # ===============================
    vocab = [{
        'word': ':'.join([q[vschema['word']], q[vschema['tag']]]),
        'termweight': _tag_weight[q[vschema['tag']]] * vocab_score[q[vschema['name']]],
        'docfreq': q[vschema['doc_freq']]
    } for q in qvocab]
    # keyword = json.dumps(vocab, indent=4, ensure_ascii=False, sort_keys=True)
    # self.logger.info(keyword)

    vid = [
        q[vschema['id']] for q in qvocab
        if not q[vschema['stopword']] and q[vschema['doc_freq']] < self.vocab_docfreq_th
    ]
    return vocab, vid
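
# Illustration (not from the original source): the word2vec expansion step above
# in isolation. The model path and the toy token list are assumptions; only the
# '--+--' vocab-name construction and the most_similar() call mirror the code.
#
#   from gensim.models import KeyedVectors
#
#   w2v_model = KeyedVectors.load('w2v.model')            # hypothetical path
#   words, flags = ['貓', '可愛'], ['n', 'a']
#   query = ['{}:{}'.format(w, f) for w, f in zip(words, flags)
#            if f[0] in ['v', 'n'] or f in ['eng']]        # keep verbs / nouns / eng
#   neighbors = w2v_model.most_similar(positive=query, topn=min(3, len(query)))
#   # Each neighbor is (token, similarity); tokens are stored as 'word:flag', so
#   # formatting with the tokenizer and splitting on ':' rebuilds the
#   # word / flag / tokenizer triple used as the '--+--' vocab name:
#   names = ['--+--'.join('{}:{}'.format(t, 'jieba').split(':')) for t, _ in neighbors]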