def main():
    keys = s3manager._s3_list()
    for key in [x for x in keys if '3/' in x]:
        args = s3manager._s3_get_object(key)
        stmt_c = 'insert ignore into nlp_isam.comments_isam values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        stmt_u = 'insert ignore into nlp_isam.users_isam values (%s, %s, %s, %s, %s, %s, %s)'
        if 'users/' in key:
            with DB() as db:
                db.custom_put_many(stmt_u, args)
        elif 'comments/' in key:
            with DB() as db:
                db.custom_put_many(stmt_c, args)
        print('Added {} to Db'.format(key))
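# A minimal sketch of the s3manager helpers main() assumes (_s3_list / _s3_get_object),
# written against boto3; the bucket name and the JSON-per-object layout are assumptions,
# not the project's actual configuration.
import json
import boto3

_BUCKET = 'nlp-comments'  # hypothetical bucket name
_s3 = boto3.client('s3')

def _s3_list(prefix=''):
    # Return the object keys under a prefix (single page in this sketch).
    resp = _s3.list_objects_v2(Bucket=_BUCKET, Prefix=prefix)
    return [obj['Key'] for obj in resp.get('Contents', [])]

def _s3_get_object(key):
    # Fetch one object and decode it as JSON; main() passes the result straight
    # to DB.custom_put_many, so each object is assumed to hold a list of rows.
    body = _s3.get_object(Bucket=_BUCKET, Key=key)['Body'].read()
    return json.loads(body.decode('utf-8'))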
def process_xar(self, **kwargs):
    response.headers['Content-Type'] = 'application/json'
    response.headers['Access-Control-Allow-Origin'] = '*'
    index = kwargs['index']
    xar = kwargs['xar']
    xar_list = kwargs['xar_list']
    note = kwargs['note']
    #print(kwargs)
    rubric_id = kwargs['rubric_id']
    chefdoeuvre = 1 if 'chefdoeuvre' in kwargs else 0
    xar = xar_list if xar_list else xar
    idx = '{}_{}'.format(rubric_id, index)
    with DB() as db:
        db.custom_put(
            'update id_rubric set characteristic=%s, note=%s, is_chefdoeuvre=%s where idx=%s',
            (xar, note, chefdoeuvre, idx))
        total = db.custom_get(
            'select count(*) from id_rubric where rubric_id=%s and is_deleted=0',
            (rubric_id, ))[0][0]
        done = db.custom_get(
            'select count(*) from id_rubric where rubric_id=%s and characteristic is not null',
            (rubric_id, ))[0][0]
        xars = db.custom_get(
            'select distinct characteristic from id_rubric where rubric_id=%s',
            (rubric_id, ))
        xars = [x[0] for x in xars if x[0]]
    resp = {'done': done, 'total': total, 'xars': xars}
    return json.dumps(resp).encode('utf-8')
def db2csv():
    with DB() as db:
        data = db.custom_get(
            '''
            select wtf.user, wtf.bdate, wtf.date,
                   scores.score_1 as sentiment,
                   #wtf.city,
                   scores.keywords_full as keywords,
                   wtf.text
            from scores
            join (select stalin.id as comment_id, stalin.date, stalin.text,
                         users.id as user, users.*
                  from stalin join users on stalin.user_id=users.id) as wtf
            on scores.comment_id=wtf.comment_id
            where score_1 is not null
            order by wtf.date''', ())
    lines = []
    lines.append('{}\n'.format('$$$'.join([
        'user_id', 'birthday', 'datetime', 'sentiment score', 'keywords', 'text'
    ])))
    for row in data:
        #lines.append('{}\n'.format('$$$'.join([sub(r'\W', ' ', str(x)) for x in row])))
        lines.append('{}\n'.format(sub('\n', ' ', row[-1])))
    with open('output_text.txt', 'w', encoding='utf-8') as f:
        f.write(''.join(lines))
def get_rubric(self, rubric_id=None):
    response.headers['Content-Type'] = 'application/json'
    response.headers['Access-Control-Allow-Origin'] = '*'
    rubric_id = int(rubric_id)
    #ddf = df[df.rubric_id==rubric_id].sort_values(by=['likes'], ascending=False).drop_duplicates()
    ddf = df[df.rubric_id == rubric_id].drop_duplicates()
    #print(len(ddf))
    ddf['age'] = (ddf.date - ddf.bdate).dt.days / 365
    ddf['current'] = [x + 1 for x in range(len(ddf))]
    group_name = ddf.group.unique()[0]
    max_likes = int(max(ddf.likes))
    mean_likes = int(np.mean(ddf.likes))
    total_raw = len(ddf)
    with DB() as db:
        deleted = db.custom_get(
            'select comment_id from id_rubric where rubric_id=%s and is_deleted=1',
            (rubric_id, ))
        deleted = [x[0] for x in deleted]
        done = db.custom_get(
            'select comment_id from id_rubric where rubric_id=%s and characteristic is not null',
            (rubric_id, ))
        done = [x[0] for x in done]
        xars = db.custom_get(
            'select distinct characteristic from id_rubric where rubric_id=%s',
            (rubric_id, ))
        xars = [x[0] for x in xars if x[0]]
    ddf = ddf.drop(pd.Series(deleted), axis=0)
    ddf = ddf.drop(pd.Series(done), axis=0)
    ddf = ddf.reset_index()
    ddf.date = ddf.date.dt.strftime('%Y-%m-%d')
    ddf = ddf.drop('bdate', axis=1)
    rubric_name = pd.read_pickle('rubric_names.pickle')[rubric_id]
    #group_name = ddf.group.unique()[0]
    #ddf_js = ddf.head().to_json(orient='values')
    #whole = json.loads(json.dumps(ddf.to_dict('records')[:300]).encode('utf-8').decode('utf-8', errors='ignore'))
    whole = ddf[:300].to_json(orient='records')
    obj = {
        'rubric_id': rubric_id,
        'rubric_name': rubric_name,
        'group_name': group_name,
        'done_count': len(done),
        'total_count': total_raw,
        'likes_max': max_likes,
        'likes_mean': mean_likes,
        'xars_list': xars,
        'whole_data': whole
    }
    return json.dumps(obj).encode('utf-8')
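# A sketch of the module-level state get_rubric() relies on; the pickle file name and
# exact dtypes are assumptions. `df` is expected to be indexed by comment_id and to carry
# at least rubric_id, date, bdate, likes and group columns; `response` is the web
# framework's response object imported elsewhere in the handler module.
import json
import numpy as np
import pandas as pd
from nlpdb import DB

df = pd.read_pickle('rubric_comments.pickle')  # hypothetical file name
df.date = pd.to_datetime(df.date)
df.bdate = pd.to_datetime(df.bdate)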
def main(year):
    stop_words = stopwords.words('russian')
    stop_words.extend([
        'что', 'че', 'чё', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к',
        'на', 'если', '""', "''", '``'
    ])
    with DB() as db:
        comments = db.custom_get(
            '''select * from {0}_100k where date > {1}0101 and date < {1}1231'''
            .format(category, year), ())
    total_comments = len(comments)
    result = []
    for row in comments:
        #polarity = row[2]
        comment_id = row[0]
        comment = row[5].lower()
        comment = sub(r'\[id[0-9]*?\|.*?\]', '', comment)
        tokens = nltk.word_tokenize(comment)
        tokens = [i for i in tokens if i not in string.punctuation]  # Eliminate punctuation
        tokens = [i for i in tokens if i not in stop_words]  # Eliminate stop words
        tokens = [sub(r'\W', '', i) for i in tokens]  # Eliminate non-word characters
        tokens = [i for i in tokens if i]  # Eliminate empty strings
        tokens_pos = nltk.pos_tag(tokens, lang='rus')  # Detect POS, returns (token, POS) tuples
        tokens = [
            i[0] for i in tokens_pos
            if (i[1] in ['V', 'S'] or 'A=' in i[1] or 'NUM' in i[1])
        ]  # Keep only verbs, nouns, adjectives and numerals
        #root_idxs = [tokens.index(i) for i in tokens if root_word in i]
        #tokens = [tokens[i-n:i+n] for i in root_idxs]
        # Flatten token lists of lists
        '''
        flatten = []
        for i in tokens:
            for x in i:
                flatten.append(x)
        tokens = flatten
        '''
        #tokens = [i for i in sublist for sublist in tokens]
        #normal_tokens = [morph.parse(i)[0].inflect({'sing', 'nomn'}).word for i in tokens]
        normal_tokens = [morph.normal_forms(i)[0] for i in tokens]  # Lemmatize (returns normal forms)
        result += normal_tokens
        #result.append({'id': comment_id, 'text': comment, 'tokens': normal_tokens, 'polarity': polarity})
    return result, total_comments
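# A sketch of the module-level setup main(year) assumes: imports, the `category` prefix
# used to pick the <category>_100k table, and the pymorphy2 analyzer. The value of
# `category` here is a placeholder assumption.
import string
from re import sub

import nltk
import pymorphy2
from nltk.corpus import stopwords

from nlpdb import DB

category = 'stalin'  # hypothetical table prefix
morph = pymorphy2.MorphAnalyzer()

# Example use: lemma frequencies for one year.
#   from collections import Counter
#   tokens, total = main(2017)
#   print(total, Counter(tokens).most_common(20))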
def main(mode='keyPhrases'):
    comments = get_comments('сталин')
    seen = set()
    comms = []
    for comm in comments:
        if comm[0] not in seen:
            comms.append((comm[1], comm[2]))
            seen.add(comm[0])
    result = {}
    for n_sentence in ['1', '2', 'full']:
        comm_list = []
        req_list = []
        for comm in comms:
            text = sub(r'\[id[0-9]*?\|.*?\]', '', comm[1])
            if n_sentence != 'full':
                text = '.'.join(text.split('.')[:int(n_sentence)])
            comm_dict = dict(zip(['language', 'id', 'text'], ['ru', comm[0], text]))
            if len(str(comm_list)) + len(str(comm_dict)) < 524288 - 20 and len(comm_list) < 1000:
                comm_list.append(comm_dict)
            else:
                # Batch is full: flush it and start a new one, keeping the current document.
                req_list.append(json.dumps({'documents': comm_list}))
                comm_list = [comm_dict]
        req_list.append(json.dumps({'documents': comm_list}))
        resp_list = []
        for req in req_list:
            resp_list.append(
                requests.post(URL.format(mode), data=req, headers=header).json())
        result[n_sentence] = resp_list
    with DB() as db:
        for key, resp_list in result.items():
            col = 'score_{}'.format(key) if 'sentiment' in mode else 'keywords_{}'.format(key)
            for resp in resp_list:
                db.custom_put_many(
                    'update scores set {}=%s where comment_id=%s'.format(col),
                    tuple(
                        zip([
                            x['score'] if 'sentiment' in mode else '; '.join(x['keyPhrases'])
                            for x in resp['documents']
                        ], [x['id'] for x in resp['documents']])))
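# URL and header above are module-level constants; a sketch of what they are assumed to
# look like for the Azure Text Analytics v2.0 endpoints (region and key are placeholders).
URL = 'https://westeurope.api.cognitive.microsoft.com/text/analytics/v2.0/{}'
header = {
    'Ocp-Apim-Subscription-Key': '<subscription key>',
    'Content-Type': 'application/json',
}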
def main(group):
    with DB() as db:
        done = db.get_done()
    doneset = set(x[0] for x in done)
    try:
        posts_count = wall_get(group)['count']
        #posts_count = 20
    except Exception as e:
        print(e)
        raise e
    for n in range(posts_count // 100 + 1):
        try:
            offset = n * 100
            posts = wall_get(group, offset=offset)
        except Exception as e:
            print(e)
            raise e
        for post in [x for x in posts['items']
                     if x['comments']['count'] > 0
                     and '{}_{}'.format(group, x['id']) not in doneset]:
            try:
                comments_count = post['comments']['count']
            except Exception as e:
                print(e)
                raise e
            for m in range(comments_count // 100 + 1):
                try:
                    offset = m * 100
                    comments = wall_get_comments(group, post['id'], offset=offset)
                except Exception as e:
                    print(e)
                    raise e
                with DB() as db:
                    db.add_comments(group, post['id'], comments['items'])
                    db.add_users(comments['profiles'])
                    db.add_done(group, post['id'], comments_count, offset + len(comments['items']))
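# A minimal sketch of the wall_get / wall_get_comments wrappers main(group) assumes,
# built on the VK API methods wall.get and wall.getComments; the token, API version and
# the use of extended=1 to get commenter profiles are assumptions.
import requests

VK_TOKEN = '<service token>'  # hypothetical
VK_API_VERSION = '5.52'       # hypothetical

def _vk(method, **params):
    params.update(access_token=VK_TOKEN, v=VK_API_VERSION)
    resp = requests.get('https://api.vk.com/method/' + method, params=params).json()
    return resp['response']

def wall_get(group, offset=0):
    # Posts from a community wall; owner_id is negative for communities.
    return _vk('wall.get', owner_id=-group, count=100, offset=offset)

def wall_get_comments(group, post_id, offset=0):
    # Comments under one post; extended=1 also returns the commenting users' profiles.
    return _vk('wall.getComments', owner_id=-group, post_id=post_id,
               count=100, offset=offset, extended=1)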
def main_stored(group):
    with DB() as db:
        offset = db.get_next_offset(group)[0][0]
        if not offset:
            offset = db.get_done_by_group(group)[0][0]
            offset = 0
    #posts_count = wall_get(group)['count']
    #posts_count = 45000
    #while offset/posts_count < 0.90:
    while offset < 1000:
        response = execute_get_comments(group, offset=offset)
        if not response:
            offset += 1
            print('Skipped request..')
            continue
            #break
        with DB() as db:
            for item in response['items']:
                #db.add_comments(abs(int(item['group'])), int(item['post']), item['comments']['items'])
                DB.add_comments_json(abs(int(item['group'])), int(item['post']),
                                     item['comments']['items'])
                #db.add_users(item['comments']['profiles'])
                DB.add_users_json(item['comments']['profiles'])
                db.add_done(abs(int(item['group'])), int(item['post']),
                            item['comments']['count'], len(item['comments']['items']))
        offset = response['next_offset']
        #db.put_next_offset(offset, group)
        #diff = (int(response['items'][-1]['post']) - int(response['items'][0]['post'])) // len(response['items'])
        print('{} --- Acquired comments for posts up to: {} ({}). Next_offset is: {}. Group: {}'.format(
            datetime.now(), response['items'][-1]['post'], len(response['items']),
            response['next_offset'], response['items'][0]['group']))
def populate_rubrics():
    for n in range(1, len(rubrics) + 1):
        key = 'rubrics/old/{}.json'.format(n)
        js = s3manager._s3_get_object(key)
        rubric_num = n
        rubric_name = js['rubric']
        ids = [x[0][0] for x in js['comments']]
        with DB() as db:
            db.custom_put_many(
                'insert ignore into id_rubric_old (idx, comment_id, rubric_id) values (%s, %s, %s)',
                tuple(
                    zip(['{}_{}'.format(n, x) for x in ids],
                        [x for x in ids],
                        [n for x in ids])))
def submit_feedback(self, **kwargs):
    response.headers['Content-Type'] = 'application/json'
    response.headers['Access-Control-Allow-Origin'] = '*'
    #payload = json.loads(request.body.read().decode('utf-8'))
    feedback = kwargs['feedback']
    rubric_id = kwargs['rubric_id']
    if feedback:
        with DB() as db:
            db.custom_put(
                'insert ignore into rubric_feedback values (%s, %s)',
                (rubric_id, feedback))
    return json.dumps({
        'ok': True,
        'feedback': feedback,
        'rubric_id': rubric_id
    }).encode('utf-8')
def process_polarity(self, **kwargs):
    response.headers['Content-Type'] = 'application/json'
    response.headers['Access-Control-Allow-Origin'] = '*'
    index = kwargs['index']
    rubric_id = kwargs['rubric_id']
    idx = '{}_{}'.format(rubric_id, index)
    polarity = kwargs['polarity']
    with DB() as db:
        db.custom_put('update id_rubric set polarity=%s where idx=%s',
                      (polarity, idx))
        total = db.custom_get(
            'select count(*) from id_rubric where rubric_id=%s and is_deleted=0',
            (rubric_id, ))[0][0]
        done = db.custom_get(
            'select count(*) from id_rubric where rubric_id=%s and polarity is not null',
            (rubric_id, ))[0][0]
    resp = {'done': done, 'total': total}
    return json.dumps(resp).encode('utf-8')
def get_comments(rubric):
    rubric_comments = []
    for expr in rubric:
        expr_lower = expr.lower()
        expr_upper = expr.upper()
        expr_cap = expr.capitalize()
        stmt = '''select * from nlp_isam.comments_isam_old_text
                  where match(text) against(%s in boolean mode)'''
        #or
        #match(text) against(%s in boolean mode) or
        #match(text) against(%s in boolean mode) or
        #match(text) against(%s in boolean mode)'''
        args = (expr, )  #, expr_lower, expr_upper, expr_cap)
        with DB() as db:
            res = db.custom_get(stmt, args)

            def getmeta(comm_id):
                return db.custom_get(
                    '''select user_id, reply_to_user is not null as isreply
                       from comments_isam_old
                       join users_isam on comments_isam_old.user_id=users_isam.id
                       where comments_isam_old.id=%s''', (comm_id, ))[0]

            resmeta = [[x] + list(getmeta(x[0])) for x in res]

        def filter_unique(resmeta):
            seen = set()
            result = []
            for x in resmeta:
                if x[-2] not in seen:
                    result.append(x)
                    seen.add(x[-2])
            return result

        def filter_primary(resmeta):
            return [x for x in resmeta if x[-1] == 0]

        resmeta = filter_primary(resmeta)
        #rubric_comments += filter_unique(resmeta)
        rubric_comments += filter_unique(resmeta)
        #del res
        print('Acquired comments for rubric: {}'.format(expr))
    return rubric_comments
from nlpdb import DB
from acquire_comments import groups_getById as get_names
import json

db = DB()
groups_raw = db.custom_get('select * from last_offsets', ())
groups_total = [(x[0], x[-1]) for x in groups_raw]
group_ids = ','.join([str(x[0]) for x in groups_raw])
group_names = get_names(group_ids)
groups = [(y['id'], y['name'],
           [x[-1] for x in groups_raw if x[0] == y['id']][0])
          for y in group_names]

COUNTS = []
for group in groups:
    group_name = group[1]
    group_id = group[0]
    total = group[-1]
    count = db.custom_get(
        'select count(*) from nlp_isam.comments_isam where group_id=%s',
        (group_id, ))[0][0]
    posts = db.custom_get(
        'select count(distinct post_id) from nlp_isam.comments_isam where group_id=%s',
        (group_id, ))[0][0]
    count_2017 = db.custom_get(
        'select count(*) from nlp_isam.comments_isam where group_id=%s and date > "20170101" and date < "20171231"',
        (group_id, ))[0][0]
    count_2016 = db.custom_get(
        'select count(*) from nlp_isam.comments_isam where group_id=%s and date > "20160101" and date < "20161231"',
        (group_id, ))[0][0]
    count_2015 = db.custom_get(
        'select count(*) from nlp_isam.comments_isam where group_id=%s and date > "20150101" and date < "20151231"',
        (group_id, ))[0][0]
def take_15_300():
    with DB() as db:
        first_15 = db.custom_get('select rubric_id from ')
if __name__ == '__main__':
    with DB() as db:
        groups = db.custom_get(
            'select last_offsets.group from last_offsets where offset/total < 0.8 or offset is null',
            ())
    groups = [x[0] for x in groups]
    #groups = groups_get()['items']
    #done = {24199209, 26284064, 40316705}
    #groups = [x for x in groups if x not in done]
    #groups = [76982440]
    for group in groups:
        main_stored(group)
    '''
    from multiprocessing import Pool
    with Pool(2) as pool:
        pool.map(main_stored, [x for x in groups])
    '''
def main_isp():
    KEY = '63e75cbccf7a1cbd88258c5217ca59331310d2f3'
    URL = 'http://api.ispras.ru/texterra/v1/nlp?'
    url = URL + 'targetType=polarity&tweet=true&apikey={}'.format(KEY)
    comments = get_comments()
    mode = 'polarity'
    seen = set()
    comms = []
    for comm in comments:
        if comm[0] not in seen:
            comms.append((comm[1], sub(r'\[id[0-9]*?\|.*?\]', '', comm[2])))
            seen.add(comm[0])
    result = {}
    for n_sentence in ['full']:
        comm_list = []
        req_list = []
        for comm in comms:
            comm_dict = {'text': comm[1], 'id': comm[0]}
            if len(str(comm_list)) + len(str(comm_dict)) < 524288 - 20 and len(comm_list) < 50:
                comm_list.append(comm_dict)
            else:
                # Batch is full: flush it and start a new one, keeping the current document.
                req_list.append(json.dumps(comm_list))
                comm_list = [comm_dict]
        req_list.append(json.dumps(comm_list))
        resp_list = []
        for req in req_list:
            _req = json.loads(req)
            texts = [x['text'] for x in _req]
            resp = requests.post(url,
                                 data=req,
                                 headers={
                                     'Content-Type': 'application/json',
                                     'Accept': 'application/json'
                                 }).json()
            r = [{
                'annotations': x['annotations'],
                'id': _req[texts.index(x['text'])]['id']
            } for x in resp]
            resp_list.append(r)
        result[n_sentence] = resp_list
    with DB() as db:
        for key, resp_list in result.items():
            #col = 'score_{}'.format(key) if 'sentiment' in mode else 'keywords_{}'.format(key)
            col = 'polarity' if 'polarity' in mode else 'keywords'
            for resp in resp_list:
                args = tuple(
                    zip([
                        decide_polarity(x['annotations']['polarity'][0]['value'])
                        if len(x['annotations']) > 0 else 0 for x in resp
                    ], [x['id'] for x in resp]))
                db.custom_put_many(
                    'update scores_isp set {}=%s where comment_id=%s'.format(col), args)
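# decide_polarity() is assumed to map Texterra's polarity annotation value to the numeric
# score stored in scores_isp; the label strings below are an assumption for illustration.
def decide_polarity(value):
    if value == 'POSITIVE':
        return 1
    if value == 'NEGATIVE':
        return -1
    return 0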
def get_comments():
    with DB() as db:
        comments = db.custom_get(
            'select users.id, stalin.id, stalin.text from users join stalin on users.id=stalin.user_id',
            ())
    return comments