Example #1
def main():
    keys = s3manager._s3_list()  # avoid shadowing the built-in `list`
    for key in [x for x in keys if '3/' in x]:
        args = s3manager._s3_get_object(key)
        stmt_c = 'insert ignore into nlp_isam.comments_isam values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        stmt_u = 'insert ignore into nlp_isam.users_isam values (%s, %s, %s, %s, %s, %s, %s)'
        if 'users/' in key:
            with DB() as db:
                db.custom_put_many(stmt_u, args)
        elif 'comments/' in key:
            with DB() as db:
                db.custom_put_many(stmt_c, args)
        print('Added {} to Db'.format(key))
    def process_xar(self, **kwargs):
        response.headers['Content-Type'] = 'application/json'
        response.headers['Access-Control-Allow-Origin'] = '*'

        index = kwargs['index']
        xar = kwargs['xar']
        xar_list = kwargs['xar_list']
        note = kwargs['note']
        #print(kwargs)
        rubric_id = kwargs['rubric_id']
        chefdoeuvre = 1 if 'chefdoeuvre' in kwargs else 0

        xar = xar_list if xar_list else xar

        idx = '{}_{}'.format(rubric_id, index)

        with DB() as db:
            db.custom_put(
                'update id_rubric set characteristic=%s, note=%s, is_chefdoeuvre=%s where idx=%s',
                (xar, note, chefdoeuvre, idx))
            total = db.custom_get(
                'select count(*) from id_rubric where rubric_id=%s and is_deleted=0',
                (rubric_id, ))[0][0]
            done = db.custom_get(
                'select count(*) from id_rubric where rubric_id=%s and characteristic is not null',
                (rubric_id, ))[0][0]
            xars = db.custom_get(
                'select distinct characteristic from id_rubric where rubric_id=%s',
                (rubric_id, ))

            xars = [x[0] for x in xars if x[0]]

        resp = {'done': done, 'total': total, 'xars': xars}

        return json.dumps(resp).encode('utf-8')
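Every snippet on this page uses the same DB context manager from nlpdb, whose implementation is not shown. A minimal sketch of the interface the examples rely on, assuming PyMySQL and placeholder connection settings, might look like this:

import pymysql

class DB:
    """Hypothetical sketch of the nlpdb.DB wrapper; the real class may differ."""

    def __enter__(self):
        # Placeholder credentials; the real values are not shown anywhere here.
        self.conn = pymysql.connect(host='localhost', user='nlp',
                                    password='...', db='nlp_isam')
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is None:
            self.conn.commit()  # commit only on a clean exit
        self.conn.close()

    def custom_get(self, stmt, args):
        # Run a SELECT and return all rows as a tuple of tuples.
        with self.conn.cursor() as cur:
            cur.execute(stmt, args)
            return cur.fetchall()

    def custom_put(self, stmt, args):
        # Run a single INSERT/UPDATE.
        with self.conn.cursor() as cur:
            cur.execute(stmt, args)

    def custom_put_many(self, stmt, args):
        # Run the same INSERT/UPDATE once per argument tuple.
        with self.conn.cursor() as cur:
            cur.executemany(stmt, args)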
Example #3
def db2csv():
    with DB() as db:
        data = db.custom_get(
            '''select wtf.user,
                      wtf.bdate,
                      wtf.date,
                      scores.score_1 as sentiment,
                      #wtf.city,
                      scores.keywords_full as keywords,
                      wtf.text
               from scores
               join (select stalin.id as comment_id, stalin.date, stalin.text,
                            users.id as user, users.*
                     from stalin join users on stalin.user_id = users.id) as wtf
                 on scores.comment_id = wtf.comment_id
               where score_1 is not null
               order by wtf.date''', ())

    lines = []
    lines.append('{}\n'.format('$$$'.join([
        'user_id', 'birthday', 'datetime', 'sentiment score', 'keywords',
        'text'
    ])))
    for row in data:
        # Only the text column is written, one comment per line; the
        # commented line wrote the full $$$-joined row.
        #lines.append('{}\n'.format('$$$'.join([sub(r'\W', ' ', str(x)) for x in row])))
        lines.append('{}\n'.format(sub('\n', ' ', row[-1])))
    with open('output_text.txt', 'w', encoding='utf-8') as f:
        f.write(''.join(lines))
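Note that db2csv writes a six-column header but then emits only the text column per row. If all columns are needed, Python's csv module handles embedded newlines and delimiters safely; a small sketch, reusing the data rows fetched above:

import csv

def db2csv_full(data, path='output_full.csv'):
    # Write every column with proper quoting instead of only the text field.
    header = ['user_id', 'birthday', 'datetime',
              'sentiment score', 'keywords', 'text']
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for row in data:
            writer.writerow([str(x) for x in row])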
    def get_rubric(self, rubric_id=None):
        response.headers['Content-Type'] = 'application/json'
        response.headers['Access-Control-Allow-Origin'] = '*'
        rubric_id = int(rubric_id)
        #ddf = df[df.rubric_id==rubric_id].sort_values(by=['likes'], ascending=False).drop_duplicates()
        ddf = df[df.rubric_id == rubric_id].drop_duplicates()
        #print(len(ddf))
        ddf['age'] = (ddf.date - ddf.bdate).dt.days / 365
        ddf['current'] = [x + 1 for x in range(len(ddf))]
        group_name = ddf.group.unique()[0]
        max_likes = int(max(ddf.likes))
        mean_likes = int(np.mean(ddf.likes))

        total_raw = len(ddf)

        with DB() as db:
            deleted = db.custom_get(
                'select comment_id from id_rubric where rubric_id=%s and is_deleted=1',
                (rubric_id, ))
            deleted = [x[0] for x in deleted]
            done = db.custom_get(
                'select comment_id from id_rubric where rubric_id=%s and characteristic is not null',
                (rubric_id, ))
            done = [x[0] for x in done]

            xars = db.custom_get(
                'select distinct characteristic from id_rubric where rubric_id=%s',
                (rubric_id, ))

            xars = [x[0] for x in xars if x[0]]

        ddf = ddf.drop(pd.Series(deleted), axis=0)
        ddf = ddf.drop(pd.Series(done), axis=0)

        ddf = ddf.reset_index()

        ddf.date = ddf.date.dt.strftime('%Y-%m-%d')
        ddf = ddf.drop('bdate', axis=1)

        rubric_name = pd.read_pickle('rubric_names.pickle')[rubric_id]
        #group_name = ddf.group.unique()[0]
        #ddf_js = ddf.head().to_json(orient='values')

        #whole = json.loads(json.dumps(ddf.to_dict('records')[:300]).encode('utf-8').decode('utf-8', errors='ignore'))
        whole = ddf[:300].to_json(orient='records')

        obj = {
            'rubric_id': rubric_id,
            'rubric_name': rubric_name,
            'group_name': group_name,
            'done_count': len(done),
            'total_count': total_raw,
            'likes_max': max_likes,
            'likes_mean': mean_likes,
            'xars_list': xars,
            'whole_data': whole
        }

        return json.dumps(obj).encode('utf-8')
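One caveat in get_rubric: ddf.drop(pd.Series(deleted), axis=0) drops rows by index label and raises KeyError if any id is no longer present in the frame. A more tolerant equivalent filters with isin:

# Drops the same rows but silently skips ids missing from the index.
ddf = ddf[~ddf.index.isin(deleted)]
ddf = ddf[~ddf.index.isin(done)]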
Example #5
def main(year):

    stop_words = stopwords.words('russian')
    stop_words.extend([
        'что', 'че', 'чё',
        'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на', 'если',
        '""', "''", '``'
    ])

    # `category` is assumed to be a module-level global naming the table prefix.
    with DB() as db:
        comments = db.custom_get(
            '''select * from {0}_100k where date > {1}0101 and date < {1}1231'''
            .format(category, year), ())
    total_comments = len(comments)

    result = []
    for row in comments:
        #polarity = row[2]
        comment_id = row[0]
        comment = row[5].lower()
        comment = sub(r'\[id[0-9]*?\|.*?\]', '', comment)  # strip [id123|Name] mentions

        tokens = nltk.word_tokenize(comment)
        tokens = [i for i in tokens
                  if i not in string.punctuation]  # Eliminate punctuation
        tokens = [i for i in tokens
                  if i not in stop_words]  # Eliminate stop words
        tokens = [sub(r'\W', '', i) for i in tokens]  # Eliminate non-word chars
        tokens = [i for i in tokens if i]  # Eliminate empty tokens

        tokens_pos = nltk.pos_tag(
            tokens,
            lang='rus')  # Tag tokens, returning (token, POS) tuples
        tokens = [
            i[0] for i in tokens_pos
            if (i[1] in ['V', 'S'] or 'A=' in i[1] or 'NUM' in i[1])
        ]  # Keep verbs (V), nouns (S), adjectives (A=...) and numerals (NUM)
        #root_idxs = [tokens.index(i) for i in tokens if root_word in i]
        #tokens = [tokens[i-n:i+n] for i in root_idxs]

        # Flatten token lists of lists
        '''
        flatten = []
        for i in tokens:
            for x in i:
                flatten.append(x)
        tokens = flatten
        '''

        #tokens = [i for i in sublist for sublist in tokens]
        #normal_tokens = [morph.parse(i)[0].inflect({'sing', 'nomn'}).word for i in tokens]

        normal_tokens = [morph.normal_forms(i)[0]
                         for i in tokens]  # Lemmatize (take the normal form)
        result += normal_tokens
        #result.append({'id': comment_id, 'text': comment, 'tokens': normal_tokens, 'polarity': polarity})
    return result, total_comments
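The pipeline above lowercases each comment, strips VK mention markup ([id123|Name]), tokenizes, removes punctuation and stop words, keeps verbs, nouns, adjectives and numerals by POS tag, and lemmatizes. A condensed, self-contained sketch of the same steps for a single string, assuming morph is a pymorphy2 MorphAnalyzer and the NLTK Russian tagger data is installed:

import string
from re import sub

import nltk
import pymorphy2

morph = pymorphy2.MorphAnalyzer()

def preprocess(comment, stop_words=()):
    # Clean, tokenize, filter by POS, and lemmatize one comment.
    comment = sub(r'\[id[0-9]*?\|.*?\]', '', comment.lower())
    tokens = nltk.word_tokenize(comment)
    tokens = [t for t in tokens if t not in string.punctuation]
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [sub(r'\W', '', t) for t in tokens]
    tokens = [t for t in tokens if t]
    tagged = nltk.pos_tag(tokens, lang='rus')
    kept = [t for t, pos in tagged
            if pos in ('V', 'S') or 'A=' in pos or 'NUM' in pos]
    return [morph.normal_forms(t)[0] for t in kept]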
Example #6
def main(mode='keyPhrases'):
    comments = get_comments('сталин')

    seen = set()
    comms = []
    for comm in comments:
        if comm[0] not in seen:
            comms.append((comm[1], comm[2]))
            seen.add(comm[0])

    result = {}
    for n_sentence in ['1', '2', 'full']:

        comm_list = []
        req_list = []
        for comm in comms:
            text = sub(r'\[id[0-9]*?\|.*?\]', '', comm[1])
            if n_sentence != 'full':
                text = '.'.join(text.split('.')[:int(n_sentence)])
            comm_dict = {'language': 'ru', 'id': comm[0], 'text': text}
            if len(str(comm_list)) + len(
                    str(comm_dict)) < 524288 - 20 and len(comm_list) < 1000:
                comm_list.append(comm_dict)
            else:
                # Flush the full batch and start the next one with the
                # current comment so it is not dropped.
                req_list.append(json.dumps({'documents': comm_list}))
                comm_list = [comm_dict]

        req_list.append(json.dumps({'documents': comm_list}))

        resp_list = []
        for req in req_list:
            resp_list.append(
                requests.post(URL.format(mode), data=req,
                              headers=header).json())

        result[n_sentence] = resp_list

    with DB() as db:
        for key, resp_list in result.items():
            col = 'score_{}'.format(
                key) if 'sentiment' in mode else 'keywords_{}'.format(key)
            for resp in resp_list:
                db.custom_put_many(
                    'update scores set {}=%s where comment_id=%s'.format(col),
                    tuple(
                        zip([
                            x['score'] if 'sentiment' in mode else '; '.join(
                                x['keyPhrases']) for x in resp['documents']
                        ], [x['id'] for x in resp['documents']])))
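The size guards above (524288 bytes, 1000 documents) are the per-request limits this code batches against. A reusable sketch of the same batching, written so the document that overflows a batch is carried into the next one:

def batch_documents(docs, max_bytes=524288 - 20, max_docs=1000):
    # Yield lists of docs that respect both a byte budget and a count cap.
    batch = []
    for doc in docs:
        if batch and (len(str(batch)) + len(str(doc)) >= max_bytes
                      or len(batch) >= max_docs):
            yield batch
            batch = []
        batch.append(doc)
    if batch:
        yield batch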
def main(group):

    with DB() as db:
        done = db.get_done()
        doneset = set(x[0] for x in done)

    try:
        posts_count = wall_get(group)['count']
        #posts_count = 20
    except Exception as e:
        print(e)
        raise e

    for n in range(posts_count // 100 + 1):
        try:
            offset = n*100
            posts = wall_get(group, offset=offset)
        except Exception as e:
            print(e)
            raise e

        for post in [x for x in posts['items']
                        if x['comments']['count'] > 0
                        and '{}_{}'.format(group, x['id']) not in doneset]:
            try:
                comments_count = post['comments']['count']
            except Exception as e:
                print(e)
                raise e

            for m in range(comments_count // 100 + 1):
                try:
                    offset = m*100
                    comments = wall_get_comments(group, post['id'], offset=offset)
                except Exception as e:
                    print(e)
                    raise e
                with DB() as db:
                    db.add_comments(group, post['id'], comments['items'])
                    db.add_users(comments['profiles'])
                    db.add_done(group, post['id'], comments_count, offset+len(comments['items']))
def main_stored(group):

    with DB() as db:
        offset = db.get_next_offset(group)[0][0]
        if not offset:
            offset = db.get_done_by_group(group)[0][0]

    # NOTE: unconditionally restarts from 0, ignoring the stored offset.
    offset = 0
    #posts_count = wall_get(group)['count']
    #posts_count = 45000

    #while offset/posts_count < 0.90:
    while offset < 1000:
        response = execute_get_comments(group, offset=offset)
        if not response:
            offset += 1
            print('Skipped request..')
            continue
            #break
        with DB() as db:
            for item in response['items']:
                db.add_comments_json(abs(int(item['group'])), int(item['post']), item['comments']['items'])
                db.add_users_json(item['comments']['profiles'])
                db.add_done(abs(int(item['group'])), int(item['post']), item['comments']['count'], len(item['comments']['items']))
            offset = response['next_offset']
            #db.put_next_offset(offset, group)
        #diff = (int(response['items'][-1]['post']) - int(response['items'][0]['post'])) // len(response['items'])
        print('{} --- Acquired comments for posts up to: {} ({}). Next offset: {}. Group: {}'.format(datetime.now(), response['items'][-1]['post'], len(response['items']), response['next_offset'], response['items'][0]['group']))
Example #9
def populate_rubrics():
    for n in range(1, len(rubrics) + 1):
        key = 'rubrics/old/{}.json'.format(n)
        js = s3manager._s3_get_object(key)
        rubric_num = n
        rubric_name = js['rubric']
        ids = [x[0][0] for x in js['comments']]
        with DB() as db:
            db.custom_put_many(
                'insert ignore into id_rubric_old (idx, comment_id, rubric_id) values (%s, %s, %s)',
                tuple(
                    zip(['{}_{}'.format(n, x) for x in ids],
                        ids, [n for _ in ids])))
    def submit_feedback(self, **kwargs):
        response.headers['Content-Type'] = 'application/json'
        response.headers['Access-Control-Allow-Origin'] = '*'

        #payload = json.loads(request.body.read().decode('utf-8'))
        feedback = kwargs['feedback']
        rubric_id = kwargs['rubric_id']

        if feedback:
            with DB() as db:
                db.custom_put(
                    'insert ignore into rubric_feedback values (%s, %s)',
                    (rubric_id, feedback))
        return json.dumps({
            'ok': True,
            'feedback': feedback,
            'rubric_id': rubric_id
        }).encode('utf-8')
    def process_polarity(self, **kwargs):
        response.headers['Content-Type'] = 'application/json'
        response.headers['Access-Control-Allow-Origin'] = '*'
        index = kwargs['index']
        rubric_id = kwargs['rubric_id']
        idx = '{}_{}'.format(rubric_id, index)
        polarity = kwargs['polarity']
        with DB() as db:
            db.custom_put('update id_rubric set polarity=%s where idx=%s',
                          (polarity, idx))
            total = db.custom_get(
                'select count(*) from id_rubric where rubric_id=%s and is_deleted=0',
                (rubric_id, ))[0][0]
            done = db.custom_get(
                'select count(*) from id_rubric where rubric_id=%s and polarity is not null',
                (rubric_id, ))[0][0]

        resp = {'done': done, 'total': total}
        return json.dumps(resp).encode('utf-8')
Example #12
def get_comments(rubric):
    rubric_comments = []
    for expr in rubric:
        expr_lower = expr.lower()
        expr_upper = expr.upper()
        expr_cap = expr.capitalize()
        stmt = '''select * from nlp_isam.comments_isam_old_text where match(text) against(%s in boolean mode)'''
        #or
        #match(text) against(%s in boolean mode) or
        #match(text) against(%s in boolean mode) or
        #match(text) against(%s in boolean mode)'''
        args = (expr, )  #, expr_lower, expr_upper, expr_cap)
        with DB() as db:
            res = db.custom_get(stmt, args)

            def getmeta(comm_id):
                return db.custom_get(
                    '''select user_id, reply_to_user is not null as isreply from comments_isam_old join users_isam on comments_isam_old.user_id=users_isam.id where comments_isam_old.id=%s''',
                    (comm_id, ))[0]

            resmeta = [[x] + list(getmeta(x[0])) for x in res]

        def filter_unique(resmeta):
            # Keep at most one comment per user (x[-2] is the user_id).
            seen = set()
            result = []
            for x in resmeta:
                if x[-2] not in seen:
                    result.append(x)
                    seen.add(x[-2])
            return result

        def filter_primary(resmeta):
            # Keep only top-level comments (x[-1] is the isreply flag).
            return [x for x in resmeta if x[-1] == 0]

        resmeta = filter_primary(resmeta)
        rubric_comments += filter_unique(resmeta)
        print('Acquired comments for rubric: {}'.format(expr))
    return rubric_comments
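MySQL fulltext matching is case-insensitive under the usual *_ci collations, which is presumably why the lower/upper/capitalized variants above are computed but never used. Boolean mode also supports operators; a sketch of a query requiring one term, excluding another, and matching a phrase:

stmt = ('select * from nlp_isam.comments_isam_old_text '
        'where match(text) against(%s in boolean mode)')
args = ('+сталин -колхоз "культ личности"', )  # +require, -exclude, "phrase"
with DB() as db:
    rows = db.custom_get(stmt, args)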
Example #13
from nlpdb import DB
from acquire_comments import groups_getById as get_names
import json

db = DB()
groups_raw = db.custom_get('select * from last_offsets', ())
groups_total = [(x[0], x[-1]) for x in groups_raw]
group_ids = ','.join([str(x[0]) for x in groups_raw])
group_names = get_names(group_ids)
groups = [(y['id'], y['name'], [x[-1] for x in groups_raw
                                if x[0] == y['id']][0]) for y in group_names]
COUNTS = []
for group in groups:
    group_name = group[1]
    group_id = group[0]
    total = group[-1]

    count = db.custom_get(
        'select count(*) from nlp_isam.comments_isam where group_id=%s',
        (group_id, ))[0][0]
    posts = db.custom_get(
        'select count(distinct post_id) from nlp_isam.comments_isam where group_id=%s',
        (group_id, ))[0][0]

    count_2017 = db.custom_get(
        'select count(*) from nlp_isam.comments_isam where group_id=%s and date > "20170101" and date < "20171231"',
        (group_id, ))[0][0]
    count_2016 = db.custom_get(
        'select count(*) from nlp_isam.comments_isam where group_id=%s and date > "20160101" and date < "20161231"',
        (group_id, ))[0][0]
    count_2015 = db.custom_get(
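The per-year counts above repeat one near-identical query per year; a single grouped query, assuming the same schema, returns them all at once:

# Sketch: all per-year counts for a group in one round trip.
per_year = dict(db.custom_get(
    'select year(date), count(*) from nlp_isam.comments_isam '
    'where group_id=%s group by year(date)',
    (group_id, )))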
Example #14
def take_15_300():
    with DB() as db:
        first_15 = db.custom_get('select rubric_id from ')


if __name__ == '__main__':

    with DB() as db:
        groups = db.custom_get('select last_offsets.group from last_offsets where offset/total < 0.8 or offset is null', ())
    groups = [x[0] for x in groups]

    #groups = groups_get()['items']
    #done = {24199209, 26284064, 40316705}
    #groups = [x for x in groups if x not in done]
    #groups = [76982440]
    for group in groups:
        main_stored(group)

    '''
    from multiprocessing import Pool
    with Pool(2) as pool:
        pool.map(main_stored, groups)
    '''
Example #16
def main_isp():
    # NOTE: avoid hardcoding API keys; load them from configuration instead.
    KEY = '63e75cbccf7a1cbd88258c5217ca59331310d2f3'
    URL = 'http://api.ispras.ru/texterra/v1/nlp?'
    url = URL + 'targetType=polarity&tweet=true&apikey={}'.format(KEY)

    comments = get_comments()
    mode = 'polarity'
    seen = set()
    comms = []
    for comm in comments:
        if comm[0] not in seen:
            comms.append((comm[1], sub(r'\[id[0-9]*?\|.*?\]', '', comm[2])))
            seen.add(comm[0])

    result = {}
    for n_sentence in ['full']:

        comm_list = []
        req_list = []
        for comm in comms:
            comm_dict = {'text': comm[1], 'id': comm[0]}
            if len(str(comm_list)) + len(
                    str(comm_dict)) < 524288 - 20 and len(comm_list) < 50:
                comm_list.append(comm_dict)
            else:
                # Flush the full batch and start the next one with the
                # current comment so it is not dropped.
                req_list.append(json.dumps(comm_list))
                comm_list = [comm_dict]

        req_list.append(json.dumps(comm_list))

        resp_list = []
        for req in req_list:
            _req = json.loads(req)
            texts = [x['text'] for x in _req]
            resp = requests.post(url,
                                 data=req,
                                 headers={
                                     'Content-Type': 'application/json',
                                     'Accept': 'application/json'
                                 }).json()
            # Re-attach ids by matching response text back to the request
            # (fragile if two comments share identical text).
            r = [{
                'annotations': x['annotations'],
                'id': _req[texts.index(x['text'])]['id']
            } for x in resp]
            resp_list.append(r)

        result[n_sentence] = resp_list

    with DB() as db:
        for key, resp_list in result.items():
            #col = 'score_{}'.format(key) if 'sentiment' in mode else 'keywords_{}'.format(key)
            col = 'polarity' if 'polarity' in mode else 'keywords'

            for resp in resp_list:
                args = tuple(
                    zip([
                        decide_polarity(
                            x['annotations']['polarity'][0]['value'])
                        if len(x['annotations']) > 0 else 0 for x in resp
                    ], [x['id'] for x in resp]))
                db.custom_put_many(
                    'update scores_isp set {}=%s where comment_id=%s'.format(
                        col), args)
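decide_polarity is not shown in these examples. A hypothetical mapping from a Texterra polarity label to the numeric value stored in scores_isp (the exact label strings are an assumption):

def decide_polarity(value):
    # Hypothetical label-to-score mapping; unknown labels default to 0.
    return {'POSITIVE': 1, 'NEGATIVE': -1, 'NEUTRAL': 0}.get(value, 0)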
Example #17
def get_comments():
    with DB() as db:
        comments = db.custom_get(
            'select users.id, stalin.id, stalin.text from users join stalin on users.id=stalin.user_id',
            ())
    return comments