def upsert_post(batch_post):
    timestamp = [post['timestamp'] for post in batch_post]
    _title = [post['title'] for post in batch_post]
    _url = [post['url'] for post in batch_post]
    _author = [post['author'] for post in batch_post]
    _content = [post['content'] for post in batch_post]
    _comment = [post['comment'] for post in batch_post]

    # Deduplicate by URL, keeping the entry with the most comments.
    comment_len = [len(cmt) for cmt in _comment]
    sorted_idx = np.argsort(comment_len)[::-1]

    title, url, author, content, comment, publish_date = [], [], [], [], [], []
    for idx in sorted_idx:
        if _url[idx] not in url:
            url.append(_url[idx])
            title.append(_title[idx])
            author.append(_author[idx])
            content.append(_content[idx])
            comment.append(_comment[idx])
            publish_date.append(datetime.fromtimestamp(timestamp[idx]))

    post_id = []
    try:
        psql = PsqlQuery()
        post_id = psql.upsert(upsert_post_sql, locals())
    except Exception as e:
        oklogger.logger.error(e)
        oklogger.logger.error(title)
        raise e

    return post_id

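# A hedged illustration of the de-duplication step above (not part of the
# original module): posts are visited in order of descending comment count,
# so when the same URL appears more than once in a batch, the crawl with the
# most comments is the one kept for the upsert. Toy data, illustrative only.
def _demo_dedup_by_url():
    import numpy as np
    toy_batch = [
        {'url': 'https://example.com/a', 'comment': ['c1']},
        {'url': 'https://example.com/a', 'comment': ['c1', 'c2', 'c3']},
        {'url': 'https://example.com/b', 'comment': []},
    ]
    order = np.argsort([len(p['comment']) for p in toy_batch])[::-1]
    kept_urls, kept = [], []
    for idx in order:
        if toy_batch[idx]['url'] not in kept_urls:
            kept_urls.append(toy_batch[idx]['url'])
            kept.append(toy_batch[idx])
    # kept holds the three-comment crawl of .../a plus the .../b post.
    return kept
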
def guery_vocab_group_by_comment_id(self, comment_id):
    psql = PsqlQuery()
    comment2vocab, schema = psql.query_all(
        self.guery_vocab_group_by_comment_id_sql, (tuple(comment_id),))
    return comment2vocab, schema

def upsert_vocab2post(self, batch_post, post_id, vocab_bundle, vschema,
                      tokenized_field='title_tokenized'):
    tokenized = [[(k['word'], k['pos'], self.tokenizer)
                  for k in p[tokenized_field]] for p in batch_post]
    vocab2post = []
    for vocab in vocab_bundle:
        vtuple = (vocab[vschema['word']], vocab[vschema['pos']],
                  vocab[vschema['tokenizer']])
        post_id_with_vocab = [
            p for idx, p in enumerate(post_id) if vtuple in tokenized[idx]
        ]
        vocab2post.append([(vocab[vschema['id']], pid)
                           for pid in post_id_with_vocab])

    flatten_vocab2post = [tup for v2p in vocab2post for tup in v2p]
    vocabulary_id = [v2p[0] for v2p in flatten_vocab2post]
    flatten_post_id = [v2p[1] for v2p in flatten_vocab2post]

    psql = PsqlQuery()
    psql.upsert(self.upsert_vocab2post_sql, {
        'vocabulary_id': vocabulary_id,
        'post_id': flatten_post_id
    })
    return vocabulary_id

def guery_vocab_group_by_title_id(self, title_id):
    psql = PsqlQuery()
    title2vocab, schema = psql.query_all(
        self.guery_vocab_group_by_title_id_sql, (tuple(title_id),))
    return title2vocab, schema

def query_comment_by_post(self, post_id):
    bundle = [(id_, self.tokenizer_tag) for id_ in post_id]
    psql = PsqlQuery()
    comment, schema = psql.query_all(self.query_comment_by_unique_sql,
                                     (tuple(bundle),))
    return comment, schema

def upsert_vocab2comment(self, batch_comment, comment_id, vocab_bundle,
                         vschema, tokenized_field='comment_tokenized'):
    tokenized = [[(k['word'], k['pos'], self.tokenizer)
                  for k in p[tokenized_field]] for p in batch_comment]
    vocab2comment = []
    for vocab in vocab_bundle:
        vtuple = (vocab[vschema['word']], vocab[vschema['pos']],
                  vocab[vschema['tokenizer']])
        comment_id_with_vocab = [
            cmt for idx, cmt in enumerate(comment_id)
            if vtuple in tokenized[idx]
        ]
        vocab2comment.append([(vocab[vschema['id']], cid)
                              for cid in comment_id_with_vocab])

    flatten_vocab2cmt = [tup for v2c in vocab2comment for tup in v2c]
    vocabulary_id = [v2c[0] for v2c in flatten_vocab2cmt]
    cmt_id = [v2c[1] for v2c in flatten_vocab2cmt]

    psql = PsqlQuery()
    psql.upsert(self.upsert_vocab2comment_sql, {
        'vocabulary_id': vocabulary_id,
        'comment_id': cmt_id
    })
    return vocabulary_id

def update_title_quality(self, id_to_update, quality):
    psql = PsqlQuery()
    return psql.upsert(self.update_title_quality_sql, {
        'id_': id_to_update,
        'quality': quality
    })

def insert_title(self, batch_post, post_id,
                 tokenized_field='title_tokenized', type_field='ctype'):
    num = len(batch_post)
    # qpost, pschema = self.query_post(post_url)
    tokenized = [
        ' '.join([k['word'] for k in p[tokenized_field]]) for p in batch_post
    ]
    grammar = [
        ' '.join([k['pos'] for k in p[tokenized_field]]) for p in batch_post
    ]
    # post_id = [p[pschema['id']] for p in post_bundle]
    ctype = [p[type_field] for p in batch_post]
    tokenizer = [self.tokenizer] * num
    retrieval_count = [0] * num
    quality = [0.0 for _ in range(num)]
    psql = PsqlQuery()
    title_id = psql.upsert(self.insert_title_sql, locals())
    return [t[0] for t in title_id]

def draw_title():
    psql = PsqlQuery()
    gtitle = psql.query(draw_random_title_sql)
    schema = psql.schema
    title = [t for t in gtitle]
    return title[0], schema

def query_comment_by_id(self, comment_id):
    psql = PsqlQuery()
    comment = psql.query(self.query_comment_by_id_sql, {
        'id_': tuple(comment_id),
        'tok': self.tokenizer_tag
    })
    schema = psql.schema
    return comment, schema

def query_title_by_id(self, title_id):
    psql = PsqlQuery()
    title = psql.query(self.query_title_by_id_sql, {
        'id_': tuple(title_id),
        'tok': self.tokenizer_tag
    })
    schema = psql.schema
    return title, schema

def insert_netizen(self, raw_name):
    name = list(set(raw_name))
    num = len(name)
    quality = [0.0 for _ in range(num)]
    posts = [0 for _ in range(num)]
    comments = [0 for _ in range(num)]
    psql = PsqlQuery()
    ids = psql.upsert(self.insert_netizen_sql, locals())
    return [i[0] for i in ids]

def update_association(self, postfreq_sum, commentfreq_sum, vocab_pairsum,
                       vocab_ids, batch_size):
    qassociation, schema = self._query_all(
        self.query_association_by_vocabt_id, (tuple(vocab_ids),))
    association_dict = {(i[schema['vocabt_id']], i[schema['vocabc_id']],
                         i[schema['tokenizer']]): i[schema['pxy']]
                        for i in qassociation}
    total_vocab_id = list(
        set(it.chain.from_iterable([[i[0], i[1]] for i in association_dict])))

    if len(total_vocab_id) > 0:
        qvocab, schema = self._query_all(self.query_vocab_by_id_sql,
                                         (tuple(total_vocab_id),))
        qvocab_dict = {
            v[schema['id']]: (v[schema['postfreq']], v[schema['commentfreq']])
            for v in qvocab
        }

        vocabt_all = []
        vocabc_all = []
        npmi_all = []
        confidence_all = []
        tokenizer_all = []
        for k, v in association_dict.items():
            # Estimate p(x), p(y), p(x, y) from the corpus frequency sums.
            px = qvocab_dict[k[0]][0] / postfreq_sum
            py = qvocab_dict[k[1]][1] / commentfreq_sum
            pxy = v / vocab_pairsum
            vocabt_all.append(k[0])
            vocabc_all.append(k[1])
            npmi_all.append(self.normalized_pmi(px, py, pxy))
            confidence_all.append(math.log(pxy / px))
            tokenizer_all.append(k[2])

        batch_vocabt = self.batch_list(vocabt_all, batch_size)
        batch_vocabc = self.batch_list(vocabc_all, batch_size)
        batch_tokenizer = self.batch_list(tokenizer_all, batch_size)
        batch_npmi = self.batch_list(npmi_all, batch_size)
        batch_confidence = self.batch_list(confidence_all, batch_size)

        for vocabt_id, vocabc_id, tokenizer, confidence, pmi in zip(
                batch_vocabt, batch_vocabc, batch_tokenizer,
                batch_confidence, batch_npmi):
            psql = PsqlQuery()
            psql.update(
                self.update_association_sql, {
                    'vocabt_id': vocabt_id,
                    'vocabc_id': vocabc_id,
                    'tokenizer': tokenizer,
                    'confidence': confidence,
                    'pmi': pmi
                })

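# normalized_pmi() and batch_list() are referenced above but defined elsewhere.
# The sketches below are assumptions about what they compute (standard NPMI
# and simple list chunking), not the project's actual implementation.
def _normalized_pmi_sketch(px, py, pxy):
    import math
    # PMI = log(p(x, y) / (p(x) * p(y))), normalized by -log p(x, y) so the
    # result lies in [-1, 1].
    return math.log(pxy / (px * py)) / (-math.log(pxy))


def _batch_list_sketch(items, batch_size):
    # Split a flat list into consecutive chunks of at most batch_size items.
    return [items[i:i + batch_size]
            for i in range(0, len(items), batch_size)]
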
def get_comment_obj(self, post_id):
    if not bool(post_id):
        return []
    # Bottleneck ?
    comments, cmtschema = self.query_comment_by_post(post_id)
    cmtid = [cmt[cmtschema['id']] for cmt in comments]
    cmt2vocab, c2vschema = self.guery_vocab_group_by_comment_id(cmtid)
    vid = list({
        v for c2v in cmt2vocab for v in c2v[c2vschema['vocabulary_group']]
    })
    if not bool(cmtid):
        return []
    psql = PsqlQuery()
    cvocab, vschema = psql.query_all(self.query_vocab_by_id_sql,
                                     (tuple(vid),))
    c2v_dict = {
        c2v[c2vschema['comment_id']]: c2v[c2vschema['vocabulary_group']]
        for c2v in cmt2vocab
    }
    v_dict = {v[vschema['id']]: v for v in cvocab}
    comment_objs = []
    for i, cmt in enumerate(comments):
        if cmt[cmtschema['id']] not in self.excluded_comment_ids:
            if cmt[cmtschema['id']] in c2v_dict:
                vocabs = [
                    self._construct_vocab(v_dict[vid], vschema)
                    for vid in c2v_dict[cmt[cmtschema['id']]]
                ]
            else:
                vocabs = []
            comment_objs.append(
                Comment(vocabs, self.tokenizer_tag,
                        post_id=cmt[cmtschema['post_id']],
                        audience=cmt[cmtschema['audience_id']],
                        quality=cmt[cmtschema['quality']],
                        ctype=cmt[cmtschema['ctype']],
                        retrieval_count=cmt[cmtschema['retrieval_count']],
                        floor=cmt[cmtschema['floor']],
                        id_=cmt[cmtschema['id']],
                        body=''.join(cmt[cmtschema['tokenized']].split())))
        if i > self.max_query_comment_num:
            break
    return comment_objs

def get_title_obj(self, vocab_id):
    if not bool(vocab_id):
        return []
    # Bottleneck ?
    v2t, v2tschema = self.query_vocab2title(vocab_id)
    fltr_tid = [
        q[v2tschema['title_id']] for q in v2t
        if q[v2tschema['title_id']] not in self.excluded_title_ids
    ]
    title2vocab, t2vschema = self.guery_vocab_group_by_title_id(fltr_tid)
    tid = list({t2v[t2vschema['title_id']] for t2v in title2vocab})
    vid = list({
        v for t2v in title2vocab for v in t2v[t2vschema['vocabulary_group']]
    })
    if not bool(tid):
        return []
    title_generator, tschema = self.query_title_by_id(tid)
    psql = PsqlQuery()
    tvocab, vschema = psql.query_all(self.query_vocab_by_id_sql,
                                     (tuple(vid),))
    t2v_dict = {
        t2v[t2vschema['title_id']]: t2v[t2vschema['vocabulary_group']]
        for t2v in title2vocab
    }
    v_dict = {v[vschema['id']]: v for v in tvocab}
    title_objs = []
    for i, tt in enumerate(title_generator):
        if tt[tschema['post_id']] not in self.excluded_post_ids:
            vocabs = [
                self._construct_vocab(v_dict[vid], vschema)
                for vid in t2v_dict[tt[tschema['id']]]
            ]
            title_objs.append(
                Title(vocabs, self.tokenizer_tag,
                      post_id=tt[tschema['post_id']],
                      quality=tt[tschema['quality']],
                      ctype=tt[tschema['ctype']],
                      retrieval_count=tt[tschema['retrieval_count']],
                      body=''.join(tt[tschema['tokenized']].split()),
                      id_=tt[tschema['id']]))
        if i >= self.max_query_title_num:
            break
    return title_objs

def query_vocab_by_title_id(self, title_id):
    tid = list(set(title_id))
    psql = PsqlQuery()
    vocab2title, schema = psql.query_all(self.query_vocab2post_by_tid_sql,
                                         (tuple(tid),))
    vocab_id = list({v2t[schema['vocabulary_id']] for v2t in vocab2title})
    vocab, vschema = psql.query_all(self.query_vocab_by_id,
                                    (tuple(vocab_id),))
    return vocab, vschema

def query_vocab_by_post_id(self, post_id):
    pid = list(set(post_id))
    psql = PsqlQuery()
    vocab2post, schema = psql.query_all(self.query_vocab2post_by_pid_sql,
                                        (tuple(pid),))
    vocab_id = list({v2p[schema['vocabulary_id']] for v2p in vocab2post})
    vocab, vschema = psql.query_all(self.query_vocab_by_id,
                                    (tuple(vocab_id),))
    return vocab, vschema

def update_vocab_commentfreq(self, vocab_id):
    vocab_id = list(set(vocab_id))
    qvocab2comment, schema = self._query_all(
        self.query_vocab2comment_by_vid_sql, (tuple(vocab_id),))
    qvocab_id = [v2c[schema['vocabulary_id']] for v2c in qvocab2comment]
    vocab_cnt = collections.Counter(qvocab_id)
    freq = [vocab_cnt[id_] if id_ in vocab_cnt else 0 for id_ in vocab_id]
    psql = PsqlQuery()
    psql.update(self.update_vocab_commentfreq_sql, {
        'id_': vocab_id,
        'commentfreq': freq
    })

def insert_comment(self, comments, batch_field='comments', url_field='url',
                   tokenized_field='comment_tokenized', type_field='ctype',
                   floor_field='floor', audience_field='audience'):
    batch_comment = []
    for batch in comments:
        batch_comment.extend(batch[batch_field])
    post_url = [batch['url'] for batch in comments]
    if len(post_url) != len(set(post_url)):
        raise LengthNotMatchException

    num = len(batch_comment)
    qpost, pschema = self.query_post(post_url)
    tokenized = [
        ' '.join([k['word'] for k in cmt[tokenized_field]])
        for cmt in batch_comment
    ]
    grammar = [
        ' '.join([k['pos'] for k in cmt[tokenized_field]])
        for cmt in batch_comment
    ]
    ctype = [cmt[type_field] for cmt in batch_comment]
    floor = [cmt[floor_field] for cmt in batch_comment]
    audience = [cmt[audience_field] for cmt in batch_comment]
    self.insert_netizen(audience)
    tokenizer = [self.tokenizer] * num
    retrieval_count = [0] * num
    quality = [0.0 for _ in range(num)]

    # Map each comment back to the id of its parent post; the posts must
    # already exist in the database.
    post_id = []
    try:
        for idx, (batch, p) in enumerate(zip(comments, qpost)):
            post_id.extend([p[pschema['id']]] * len(batch[batch_field]))
    except Exception as err:
        self.logger.error(
            'It\'s impossible to insert Comments while Post doesn\'t exist. url: {}'
            .format(post_url[idx]))
        raise err

    psql = PsqlQuery()
    comment_id = psql.upsert(self.insert_comment_sql, locals())
    return [cmt[0] for cmt in comment_id], batch_comment

def guery_vocab_group_by_title_using_vocab(self, vocab_id, ex_title_id):
    psql = PsqlQuery()
    if not bool(ex_title_id):
        ex_title_id = [-1]
    title2vocab, schema = psql.query_all(
        self.guery_vocab_group_by_title_using_vocab_id_sql, {
            'vid': tuple(vocab_id),
            'tid': tuple(ex_title_id)
        })
    return title2vocab, schema

def query_vocab_id(batch_size=1000):
    sql = 'SELECT id FROM pttcorpus_vocabulary;'
    psql = PsqlQuery()
    vocabs = psql.query(sql)
    batch = []
    i = 0
    for v in vocabs:
        batch.append(v[0])
        i += 1
        # Yield a full batch once it reaches batch_size ids.
        if i >= batch_size:
            i = 0
            yield batch
            batch = []
    yield batch

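# A possible way to consume the generator above; `indexer` here is a
# hypothetical object exposing the batched updaters defined in this module,
# so treat this as an illustrative sketch rather than existing code.
def _demo_consume_vocab_batches(indexer, batch_size=500):
    for vocab_ids in query_vocab_id(batch_size=batch_size):
        if vocab_ids:
            indexer.update_vocab_commentfreq(vocab_ids)
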
def query_vocab_by_comment_id(self, comment_id):
    cmtid = list(set(comment_id))
    psql = PsqlQuery()
    vocab2comment, schema = psql.query_all(
        self.query_vocab2comment_by_cmtid_sql, (tuple(cmtid),))
    vocab_id = list(
        {v2c[schema['vocabulary_id']] for v2c in vocab2comment})
    vocab, vschema = psql.query_all(self.query_vocab_by_id,
                                    (tuple(vocab_id),))
    return vocab, vschema

def upsert_post(self, batch_post,
                title_raw_field='title_raw',
                title_cleaned_field='title_cleaned',
                comment_raw_field='comment_raw',
                comment_cleaned_field='comment_cleaned',
                tag_field='tag', url_field='url', spider_field='spider',
                author_field='author', publish_date_field='date'):
    post_num = len(batch_post)
    title_raw = [p[title_raw_field] for p in batch_post]
    title_cleaned = [p[title_cleaned_field] for p in batch_post]
    comment_raw = [p[comment_raw_field] for p in batch_post]
    comment_cleaned = [p[comment_cleaned_field] for p in batch_post]
    url = [p[url_field] for p in batch_post]
    if len(url) != len(set(url)):
        raise LengthNotMatchException
    tag = [p[tag_field] for p in batch_post]
    # Strip the trailing '(nickname)' part from the raw author string.
    author = [
        p[author_field][:p[author_field].find('(')].strip()
        for p in batch_post
    ]
    self.insert_netizen(author)
    publish_date = [p[publish_date_field] for p in batch_post]
    spider = [p[spider_field] for p in batch_post]
    last_update = [datetime.now()] * post_num
    quality = [0.0 for _ in range(post_num)]
    update_count = [1] * post_num
    allow_update = [True] * post_num
    # qpost, schema = self.query_post(url)
    # for i, q in enumerate(qpost):
    #     if q:
    #         if len(q[schema['push']]) == len(push[i]):
    #             allow_update[i] = False

    post_id = []
    try:
        psql = PsqlQuery()
        post_id = psql.upsert(self.upsert_post_sql, locals())
    except Exception as e:
        self.logger.error(e)
        raise e

    return [p[0] for p in post_id]

def query_vocab_by_words(self, wds, relative_words=None):
    words = list(wds)
    if bool(relative_words):
        try:
            words += list(relative_words)
        except Exception as err:
            self.logger.warning(err)
    bundle = [(w.word, w.pos, self.tokenizer_tag) for w in words]
    psql = PsqlQuery()
    qvocab, vschema = psql.query_all(self.query_vocab_sql,
                                     (tuple(bundle),))
    return qvocab, vschema

def insert_vocab_ignore_docfreq(self, words):
    distinct = list({(w.word, w.pos) for w in words})
    num = len(distinct)
    word = [d[0] for d in distinct]
    pos = [d[1] for d in distinct]
    tokenizer = [self.tokenizer for _ in range(num)]
    quality = [0.0 for _ in range(num)]
    titlefreq = [0 for _ in range(num)]
    contentfreq = [0 for _ in range(num)]
    commentfreq = [0 for _ in range(num)]
    stopword = [False for _ in range(num)]
    psql = PsqlQuery()
    vocab_bundle = psql.upsert(self.insert_vocab_sql, locals())
    returned_schema = dict(id=0, word=1, pos=2, tokenizer=3)
    return vocab_bundle, returned_schema

def insert_vocab_ignore_docfreq(self, batch,
                                tokenized_field='title_tokenized'):
    allpairs = [pair for body in batch for pair in body[tokenized_field]]
    distinct = list({(pair['word'], pair['pos']) for pair in allpairs})
    num = len(distinct)
    word = [d[0] for d in distinct]
    pos = [d[1] for d in distinct]
    tokenizer = [self.tokenizer for _ in range(num)]
    quality = [0.0 for _ in range(num)]
    postfreq = [0 for _ in range(num)]
    # titlefreq = [0 for _ in range(num)]
    # contentfreq = [0 for _ in range(num)]
    commentfreq = [0 for _ in range(num)]
    stopword = [False for _ in range(num)]
    psql = PsqlQuery()
    vocab_bundle = psql.upsert(self.insert_vocab_sql, locals())
    returned_schema = dict(id=0, word=1, pos=2, tokenizer=3)
    return vocab_bundle, returned_schema

def query_freq_sum():
    query_freq_sum_sql = '''
        SELECT SUM(postfreq) AS postfreq_sum,
               SUM(commentfreq) AS commentfreq_sum
        FROM pttcorpus_vocabulary;
    '''
    query_vocab_pairfreq_sum_sql = '''
        SELECT SUM(pxy) AS sum
        FROM pttcorpus_association;
    '''
    psql = PsqlQuery()
    postfreq_sum, commentfreq_sum = list(psql.query(query_freq_sum_sql))[0]
    logger.info('postfreq_sum:{}, commentfreq_sum:{}'.format(
        postfreq_sum, commentfreq_sum))
    vocab_pairfreq_sum = list(psql.query(query_vocab_pairfreq_sum_sql))[0][0]
    logger.info('vocab_pairfreq_sum:{}'.format(vocab_pairfreq_sum))
    return postfreq_sum, commentfreq_sum, vocab_pairfreq_sum

def upsert_vocab_pairfreq(self, vocab_id, batch_size):
    vocab_id = list(set(vocab_id))
    qpost, schema = self._query_all(self.query_post_by_vid_sql,
                                    (tuple(vocab_id),))
    qpost_lists = [p2v[schema['post_id']] for p2v in qpost]
    if len(qpost_lists) > 0:
        cnter_result = self.vocab_pair_counter(qpost_lists)
        vocab_cnt = {
            vocab_pair: cnter_result[vocab_pair]
            for vocab_pair in cnter_result.keys()
            if int(vocab_pair[0]) in vocab_id
        }
        vocabt_all = []
        vocabc_all = []
        pxy_all = []
        tokenizer_all = ['jieba'] * len(vocab_cnt)
        for k, v in vocab_cnt.items():
            vocabt_all.append(int(k[0]))
            vocabc_all.append(int(k[1]))
            pxy_all.append(v)

        batch_vocabt = self.batch_list(vocabt_all, batch_size)
        batch_vocabc = self.batch_list(vocabc_all, batch_size)
        batch_pxy = self.batch_list(pxy_all, batch_size)
        batch_tokenizer = self.batch_list(tokenizer_all, batch_size)

        for vocabt_id, vocabc_id, tokenizer, pxy in zip(
                batch_vocabt, batch_vocabc, batch_tokenizer, batch_pxy):
            psql = PsqlQuery()
            psql.upsert(
                self.upsert_vocab_pairfreq_sql, {
                    'vocabt_id': vocabt_id,
                    'vocabc_id': vocabc_id,
                    'tokenizer': tokenizer,
                    'pxy': pxy
                })

def generate_random_post(ref):
    psql = PsqlQuery()
    posts = psql.query(query_random_post_sql)
    return [p[0] for p in posts][:len(ref)]

def extract_words(comments):
    if not bool(comments):
        return []

    def extract(cmt):
        return [v for v in cmt.vocabs]

    return [extract(cmt) for cmt in comments]


if __name__ == '__main__':
    with open('eval0829.csv', 'w') as f:
        f.write('random, base, pweight\n')
        psql = PsqlQuery()
        posts = psql.query(query_post_sql)
        pschema = psql.schema
        valid_post = 0
        for idx, p in enumerate(posts):
            titles, tschema = psql.query_all(
                query_title_sql, dict(pid=p[pschema['id']], tok='jieba'))
            basic_retriever = RetrievalEvaluate(
                'jieba',
                excluded_post_ids=[p[pschema['id']]],
                logger_name='retrieve')
            pweight_retriever = RetrievalEvaluate(