def draw_title():
    """Run ``draw_random_title_sql`` and return its first row plus the schema.

    Returns:
        A ``(row, schema)`` tuple: the first row produced by the query and
        the ``schema`` attribute of the ``PsqlQuery`` object that ran it.

    Raises:
        IndexError: if the query produces no rows.
    """
    db = PsqlQuery()
    result = db.query(draw_random_title_sql)
    # Read the schema before the result is consumed.
    schema = db.schema
    # Materialize every row; only the first one is returned.
    rows = list(result)
    return rows[0], schema
def query_comment_by_id(self, comment_id):
    """Run ``self.query_comment_by_id_sql`` for the given comment ids.

    Args:
        comment_id: Iterable of ids; converted to a tuple and bound to the
            ``id_`` query parameter.

    Returns:
        A ``(comment, schema)`` tuple: whatever ``PsqlQuery.query`` returns
        for the statement, and the ``schema`` attribute of that
        ``PsqlQuery`` object read right after the call.
    """
    db = PsqlQuery()
    sql = self.query_comment_by_id_sql
    params = {
        'id_': tuple(comment_id),
        'tok': self.tokenizer_tag,
    }
    comment = db.query(sql, params)
    return comment, db.schema
def query_title_by_id(self, title_id):
    """Run ``self.query_title_by_id_sql`` for the given title ids.

    Args:
        title_id: Iterable of ids; converted to a tuple and bound to the
            ``id_`` query parameter.

    Returns:
        A ``(title, schema)`` tuple: whatever ``PsqlQuery.query`` returns
        for the statement, and the ``schema`` attribute of that
        ``PsqlQuery`` object read right after the call.
    """
    db = PsqlQuery()
    sql = self.query_title_by_id_sql
    params = {
        'id_': tuple(title_id),
        'tok': self.tokenizer_tag,
    }
    title = db.query(sql, params)
    return title, db.schema
def query_freq_sum():
    """Fetch the summed frequency columns used as corpus-wide totals.

    Runs two aggregate queries: ``SUM(postfreq)`` / ``SUM(commentfreq)`` over
    ``pttcorpus_vocabulary`` and ``SUM(pxy)`` over ``pttcorpus_association``.
    Each result is logged at INFO level.

    Returns:
        A ``(postfreq_sum, commentfreq_sum, vocab_pairfreq_sum)`` tuple taken
        from the first row of each query.
    """
    vocab_sum_sql = ''' SELECT SUM(postfreq) AS postfreq_sum, SUM(commentfreq) AS commentfreq_sum FROM pttcorpus_vocabulary; '''
    pair_sum_sql = ''' SELECT SUM(pxy) AS sum FROM pttcorpus_association; '''

    db = PsqlQuery()

    # First row carries both vocabulary sums.
    vocab_rows = list(db.query(vocab_sum_sql))
    postfreq_sum, commentfreq_sum = vocab_rows[0]
    logger.info('postfreq_sum:{}, commentfreq_sum:{}'.format(
        postfreq_sum, commentfreq_sum))

    # First column of the first row is the pair-frequency sum.
    pair_rows = list(db.query(pair_sum_sql))
    vocab_pairfreq_sum = pair_rows[0][0]
    logger.info('vocab_pairfreq_sum:{}'.format(vocab_pairfreq_sum))

    return postfreq_sum, commentfreq_sum, vocab_pairfreq_sum
def query_vocab_id(batch_size=1000):
    """Yield the ids in ``pttcorpus_vocabulary`` in lists of ``batch_size``.

    Every yielded list holds exactly ``batch_size`` ids, except the last one,
    which holds the remainder. No empty list is yielded, so callers can pass
    each batch straight into a query without guarding against an empty id
    set.

    Args:
        batch_size: Maximum number of ids per yielded list. Values below 1
            result in one id per batch.

    Yields:
        Lists containing the first column of each row returned by the query.
    """
    sql = 'SELECT id FROM pttcorpus_vocabulary;'
    psql = PsqlQuery()

    batch = []
    for row in psql.query(sql):
        batch.append(row[0])
        # Flush as soon as the batch is full (not one element later).
        if len(batch) >= batch_size:
            yield batch
            batch = []

    # Emit the remainder only when there is one.
    if batch:
        yield batch
def generate_random_post(ref):
    """Return as many values from ``query_random_post_sql`` as ``ref`` has items.

    Args:
        ref: Any sized object; only ``len(ref)`` is used, as the maximum
            number of values to return.

    Returns:
        A list with the first column of the leading ``len(ref)`` rows of the
        query result (fewer if the query produces fewer rows).
    """
    db = PsqlQuery()
    # The whole result is materialized before it is truncated.
    first_columns = []
    for row in db.query(query_random_post_sql):
        first_columns.append(row[0])
    limit = len(ref)
    return first_columns[:limit]
def extract_words(comments): if not bool(comments): return [] def extract(cmt): return [v for v in cmt.vocabs] return [extract(cmt) for cmt in comments] if __name__ == '__main__': with open('eval0829.csv', 'w') as f: f.write('random, base, pweight\n') psql = PsqlQuery() posts = psql.query(query_post_sql) pschema = psql.schema valid_post = 0 for idx, p in enumerate(posts): titles, tschema = psql.query_all( query_title_sql, dict(pid=p[pschema['id']], tok='jieba')) basic_retriever = RetrievalEvaluate( 'jieba', excluded_post_ids=[p[pschema['id']]], logger_name='retrieve') pweight_retriever = RetrievalEvaluate( 'jieba',
def query_post_by_id(self, post_id):
    """Run ``self.query_post_by_id_sql`` for the given post ids.

    Args:
        post_id: Iterable of ids; converted to a tuple and passed as the
            single positional query parameter.

    Returns:
        A ``(post, schema)`` tuple: whatever ``PsqlQuery.query`` returns for
        the statement, and the ``schema`` attribute of that ``PsqlQuery``
        object read right after the call.
    """
    db = PsqlQuery()
    sql = self.query_post_by_id_sql
    # One-element parameter sequence whose only item is the tuple of ids.
    params = (tuple(post_id), )
    post = db.query(sql, params)
    return post, db.schema