def upsert_vocab2comment(self, batch_comment, comment_id, vocab_bundle,
                         vschema, tokenized_field='comment_tokenized'):
    # (word, pos, tokenizer) triples per comment; batch_comment and comment_id
    # are expected to be index-aligned.
    tokenized = [[(k['word'], k['pos'], self.tokenizer) for k in p[tokenized_field]]
                 for p in batch_comment]
    vocab2comment = []
    for vocab in vocab_bundle:
        vtuple = (vocab[vschema['word']], vocab[vschema['pos']], vocab[vschema['tokenizer']])
        comment_id_with_vocab = [
            cmt for idx, cmt in enumerate(comment_id) if vtuple in tokenized[idx]
        ]
        vocab2comment.append([(vocab[vschema['id']], cid) for cid in comment_id_with_vocab])

    # Flatten the per-vocabulary lists into parallel id columns for the upsert.
    flatten_vocab2cmt = [tup for v2c in vocab2comment for tup in v2c]
    vocabulary_id = [v2c[0] for v2c in flatten_vocab2cmt]
    cmt_id = [v2c[1] for v2c in flatten_vocab2cmt]

    psql = PsqlQuery()
    psql.upsert(self.upsert_vocab2comment_sql, {
        'vocabulary_id': vocabulary_id,
        'comment_id': cmt_id
    })

    return vocabulary_id

def upsert_vocab2post(self, batch_post, post_id, vocab_bundle, vschema,
                      tokenized_field='title_tokenized'):
    # (word, pos, tokenizer) triples per post; batch_post and post_id are
    # expected to be index-aligned.
    tokenized = [[(k['word'], k['pos'], self.tokenizer) for k in p[tokenized_field]]
                 for p in batch_post]
    vocab2post = []
    for vocab in vocab_bundle:
        vtuple = (vocab[vschema['word']], vocab[vschema['pos']], vocab[vschema['tokenizer']])
        post_id_with_vocab = [
            p for idx, p in enumerate(post_id) if vtuple in tokenized[idx]
        ]
        vocab2post.append([(vocab[vschema['id']], pid) for pid in post_id_with_vocab])

    flatten_vocab2post = [tup for v2p in vocab2post for tup in v2p]
    vocabulary_id = [v2p[0] for v2p in flatten_vocab2post]
    flatten_post_id = [v2p[1] for v2p in flatten_vocab2post]

    psql = PsqlQuery()
    psql.upsert(self.upsert_vocab2post_sql, {
        'vocabulary_id': vocabulary_id,
        'post_id': flatten_post_id
    })

    return vocabulary_id

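# Usage sketch (not part of the original module): shows how upsert_vocab2post
# might be driven by the insert_vocab_ignore_docfreq variant defined later in
# this section. `ingester` and all field values are hypothetical.
def _example_link_vocab_to_posts(ingester):
    batch_post = [{
        'title_tokenized': [{'word': '天氣', 'pos': 'N'}, {'word': '好', 'pos': 'A'}],
        'ctype': 'text',
    }]
    post_id = [101]  # ids returned by a previous post upsert, index-aligned with batch_post
    # vocab_bundle rows are (id, word, pos, tokenizer); vschema maps names to positions.
    vocab_bundle, vschema = ingester.insert_vocab_ignore_docfreq(batch_post)
    return ingester.upsert_vocab2post(batch_post, post_id, vocab_bundle, vschema)
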
def insert_title(self, batch_post, post_id,
                 tokenized_field='title_tokenized', type_field='ctype'):
    num = len(batch_post)
    # qpost, pschema = self.query_post(post_url)
    tokenized = [
        ' '.join([k['word'] for k in p[tokenized_field]]) for p in batch_post
    ]
    grammar = [
        ' '.join([k['pos'] for k in p[tokenized_field]]) for p in batch_post
    ]
    # post_id = [p[pschema['id']] for p in post_bundle]
    ctype = [p[type_field] for p in batch_post]
    tokenizer = [self.tokenizer] * num
    retrieval_count = [0] * num
    quality = [0.0 for _ in range(num)]

    psql = PsqlQuery()
    # The insert SQL picks the column values it needs out of locals().
    title_id = psql.upsert(self.insert_title_sql, locals())

    return [t[0] for t in title_id]

def update_title_quality(self, id_to_update, quality):
    psql = PsqlQuery()
    return psql.upsert(self.update_title_quality_sql, {
        'id_': id_to_update,
        'quality': quality
    })

def upsert_post(batch_post):
    # numpy (np), datetime, upsert_post_sql and oklogger are assumed to be
    # available at module level.
    timestamp = [post['timestamp'] for post in batch_post]
    _title = [post['title'] for post in batch_post]
    _url = [post['url'] for post in batch_post]
    _author = [post['author'] for post in batch_post]
    _content = [post['content'] for post in batch_post]
    _comment = [post['comment'] for post in batch_post]
    comment_len = [len(cmt) for cmt in _comment]

    # Keep only the first occurrence of each url; sorting by comment count in
    # descending order means the copy with the most comments wins.
    sorted_idx = np.argsort(comment_len)[::-1]
    title, url, author, content, comment, publish_date = [], [], [], [], [], []
    for idx in sorted_idx:
        if _url[idx] not in url:
            url.append(_url[idx])
            title.append(_title[idx])
            author.append(_author[idx])
            content.append(_content[idx])
            comment.append(_comment[idx])
            publish_date.append(datetime.fromtimestamp(timestamp[idx]))

    post_id = []
    try:
        psql = PsqlQuery()
        post_id = psql.upsert(upsert_post_sql, locals())
    except Exception as e:
        oklogger.logger.error(e)
        oklogger.logger.error(title)
        raise e

    return post_id

def insert_netizen(self, raw_name):
    name = list(set(raw_name))
    num = len(name)
    quality = [0.0 for _ in range(num)]
    posts = [0 for _ in range(num)]
    comments = [0 for _ in range(num)]
    psql = PsqlQuery()
    ids = psql.upsert(self.insert_netizen_sql, locals())
    return [i[0] for i in ids]

def insert_comment(self, comments, batch_field='comments', url_field='url',
                   tokenized_field='comment_tokenized', type_field='ctype',
                   floor_field='floor', audience_field='audience'):
    batch_comment = []
    for batch in comments:
        batch_comment.extend(batch[batch_field])

    post_url = [batch[url_field] for batch in comments]
    if len(post_url) != len(set(post_url)):
        raise LengthNotMatchException

    num = len(batch_comment)
    qpost, pschema = self.query_post(post_url)

    tokenized = [
        ' '.join([k['word'] for k in cmt[tokenized_field]]) for cmt in batch_comment
    ]
    grammar = [
        ' '.join([k['pos'] for k in cmt[tokenized_field]]) for cmt in batch_comment
    ]
    ctype = [cmt[type_field] for cmt in batch_comment]
    floor = [cmt[floor_field] for cmt in batch_comment]
    audience = [cmt[audience_field] for cmt in batch_comment]
    self.insert_netizen(audience)

    tokenizer = [self.tokenizer] * num
    retrieval_count = [0] * num
    quality = [0.0 for _ in range(num)]

    # Map every comment back to the id of the post it belongs to.
    post_id = []
    try:
        for idx, (batch, p) in enumerate(zip(comments, qpost)):
            post_id.extend([p[pschema['id']]] * len(batch[batch_field]))
    except Exception as err:
        self.logger.error(
            'Cannot insert comments for a post that does not exist. url: {}'
            .format(post_url[idx]))
        raise err

    psql = PsqlQuery()
    comment_id = psql.upsert(self.insert_comment_sql, locals())

    return [cmt[0] for cmt in comment_id], batch_comment

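# Usage sketch for the comment path (not part of the original module): shows
# insert_comment feeding upsert_vocab2comment. Assumes `ingester` exposes the
# methods in this section, that the parent post has already been upserted, and
# that the batch-based insert_vocab_ignore_docfreq variant accepts the comment
# batch; the url and field values are made up.
def _example_ingest_comments(ingester):
    comments = [{
        'url': 'https://www.ptt.cc/bbs/Gossiping/M.123456789.A.ABC.html',
        'comments': [{
            'comment_tokenized': [{'word': '推', 'pos': 'V'}],
            'ctype': 'push',
            'floor': 1,
            'audience': 'some_user',
        }],
    }]
    comment_id, batch_comment = ingester.insert_comment(comments)
    vocab_bundle, vschema = ingester.insert_vocab_ignore_docfreq(
        batch_comment, tokenized_field='comment_tokenized')
    ingester.upsert_vocab2comment(batch_comment, comment_id, vocab_bundle, vschema)
    return comment_id
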
def upsert_vocab_pairfreq(self, vocab_id, batch_size):
    vocab_id = list(set(vocab_id))
    qpost, schema = self._query_all(self.query_post_by_vid_sql, (tuple(vocab_id), ))
    qpost_lists = [p2v[schema['post_id']] for p2v in qpost]
    if len(qpost_lists) > 0:
        cnter_result = self.vocab_pair_counter(qpost_lists)
        # Keep only pairs whose first vocabulary id is in the requested set.
        vocab_cnt = {
            vocab_pair: cnter_result[vocab_pair]
            for vocab_pair in cnter_result.keys()
            if int(vocab_pair[0]) in vocab_id
        }
        vocabt_all = []
        vocabc_all = []
        pxy_all = []
        tokenizer_all = ['jieba'] * len(vocab_cnt)
        for k, v in vocab_cnt.items():
            vocabt_all.append(int(k[0]))
            vocabc_all.append(int(k[1]))
            pxy_all.append(v)

        # Upsert the pair frequencies in batches to bound statement size.
        batch_vocabt = self.batch_list(vocabt_all, batch_size)
        batch_vocabc = self.batch_list(vocabc_all, batch_size)
        batch_pxy = self.batch_list(pxy_all, batch_size)
        batch_tokenizer = self.batch_list(tokenizer_all, batch_size)
        for vocabt_id, vocabc_id, tokenizer, pxy in zip(
                batch_vocabt, batch_vocabc, batch_tokenizer, batch_pxy):
            psql = PsqlQuery()
            psql.upsert(
                self.upsert_vocab_pairfreq_sql, {
                    'vocabt_id': vocabt_id,
                    'vocabc_id': vocabc_id,
                    'tokenizer': tokenizer,
                    'pxy': pxy
                })

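# batch_list is referenced above but not shown in this section; a minimal
# sketch consistent with how it is consumed (parallel chunks fed to zip) might
# look like the following. This is an assumption, not the original helper.
def batch_list(self, items, batch_size):
    # Split items into consecutive chunks of at most batch_size elements.
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
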
def upsert_post(self, batch_post, title_raw_field='title_raw',
                title_cleaned_field='title_cleaned', comment_raw_field='comment_raw',
                comment_cleaned_field='comment_cleaned', tag_field='tag',
                url_field='url', spider_field='spider', author_field='author',
                publish_date_field='date'):
    post_num = len(batch_post)
    title_raw = [p[title_raw_field] for p in batch_post]
    title_cleaned = [p[title_cleaned_field] for p in batch_post]
    comment_raw = [p[comment_raw_field] for p in batch_post]
    comment_cleaned = [p[comment_cleaned_field] for p in batch_post]

    url = [p[url_field] for p in batch_post]
    if len(url) != len(set(url)):
        raise LengthNotMatchException

    tag = [p[tag_field] for p in batch_post]
    # Strip everything from the first '(' onward in the author string (e.g. a
    # "(nickname)" suffix); assumes the '(' is present.
    author = [
        p[author_field][:p[author_field].find('(')].strip() for p in batch_post
    ]
    self.insert_netizen(author)

    publish_date = [p[publish_date_field] for p in batch_post]
    spider = [p[spider_field] for p in batch_post]
    last_update = [datetime.now()] * post_num
    quality = [0.0 for _ in range(post_num)]
    update_count = [1] * post_num
    allow_update = [True] * post_num
    # qpost, schema = self.query_post(url)
    # for i, q in enumerate(qpost):
    #     if q:
    #         if len(q[schema['push']]) == len(push[i]):
    #             allow_update[i] = False

    post_id = []
    try:
        psql = PsqlQuery()
        post_id = psql.upsert(self.upsert_post_sql, locals())
    except Exception as e:
        self.logger.error(e)
        raise e

    return [p[0] for p in post_id]

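# Hedged illustration (not part of the original module) of the record shape
# this upsert_post expects with its default field names; all values and the
# `ingester` instance are hypothetical.
def _example_upsert_post(ingester):
    from datetime import datetime
    batch_post = [{
        'title_raw': '[問卦] 今天天氣如何?',
        'title_cleaned': '今天天氣如何',
        'comment_raw': ['推 不錯', '噓 下雨'],
        'comment_cleaned': ['不錯', '下雨'],
        'tag': '問卦',
        'url': 'https://www.ptt.cc/bbs/Gossiping/M.987654321.A.DEF.html',
        'spider': 'ptt',
        'author': 'someone (暱稱)',  # everything from '(' onward is stripped before insert_netizen
        'date': datetime.now(),
    }]
    return ingester.upsert_post(batch_post)
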
def insert_vocab_ignore_docfreq(self, words):
    distinct = list({(w.word, w.pos) for w in words})
    num = len(distinct)
    word = [d[0] for d in distinct]
    pos = [d[1] for d in distinct]
    tokenizer = [self.tokenizer for _ in range(num)]
    quality = [0.0 for _ in range(num)]
    titlefreq = [0 for _ in range(num)]
    contentfreq = [0 for _ in range(num)]
    commentfreq = [0 for _ in range(num)]
    stopword = [False for _ in range(num)]

    psql = PsqlQuery()
    vocab_bundle = psql.upsert(self.insert_vocab_sql, locals())
    returned_schema = dict(id=0, word=1, pos=2, tokenizer=3)

    return vocab_bundle, returned_schema

def insert_vocab_ignore_docfreq(self, batch, tokenized_field='title_tokenized'):
    # De-duplicate (word, pos) pairs over the whole batch before inserting.
    allpairs = [pair for body in batch for pair in body[tokenized_field]]
    distinct = list({(pair['word'], pair['pos']) for pair in allpairs})
    num = len(distinct)
    word = [d[0] for d in distinct]
    pos = [d[1] for d in distinct]
    tokenizer = [self.tokenizer for _ in range(num)]
    quality = [0.0 for _ in range(num)]
    postfreq = [0 for _ in range(num)]
    # titlefreq = [0 for _ in range(num)]
    # contentfreq = [0 for _ in range(num)]
    commentfreq = [0 for _ in range(num)]
    stopword = [False for _ in range(num)]

    psql = PsqlQuery()
    vocab_bundle = psql.upsert(self.insert_vocab_sql, locals())
    returned_schema = dict(id=0, word=1, pos=2, tokenizer=3)

    return vocab_bundle, returned_schema

def update_vocab_quality(self, word, quality):
    psql = PsqlQuery()
    return psql.upsert(self.update_vocab_quality_sql, {
        'word': word,
        'quality': quality
    })