Example No. 1
def update_snippet_counts(file_id, count):
    con = get_engine()
    con.execute(
        text(
            "update file_uploads set reviews_uploaded = (select count(*) from source_reviews "
            "where file_id=file_uploads.id), "
            "review_fragments_generated = {};".format(
                count)).execution_options(autocommit=True))

    con.execute(
        text(
            "update account_subscription set reviews_uploaded = (select count(*) from source_reviews sr "
            "join file_uploads fp on fp.id=sr.file_id "
            "join customer cr on cr.id=fp.customer_id where cr.account_id=account_subscription.account_id), "
            "total_fragment_generated = (select sum(review_fragments_generated) from  file_uploads fp "
            "join customer cr on cr.id=fp.customer_id "
            "where cr.account_id=account_subscription.account_id), "
            "category_count=(select count(distinct sr.category_id) from source_reviews sr "
            "join file_uploads fp on fp.id=sr.file_id join customer cr on cr.id=fp.customer_id "
            "where cr.account_id=account_subscription.account_id);").
        execution_options(autocommit=True))

    # con.execute("update file_uploads set status='3' where id in ({});".format(''.join(file_ids)))

    con.dispose()
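A note on this example: the file_id argument is never used, and the review_fragments_generated assignment in the first update has no WHERE clause of its own, so the count is written to every row of file_uploads. Below is a minimal sketch of a scoped, bound-parameter variant (hypothetical helper name; it reuses the same get_engine and sqlalchemy text imports the example already relies on):

def update_fragment_count(file_id, count):
    # Bound parameters (:cnt, :fid) instead of str.format, scoped to one file.
    con = get_engine()
    con.execute(
        text(
            "update file_uploads set review_fragments_generated = :cnt "
            "where id = :fid").execution_options(autocommit=True),
        {'cnt': count, 'fid': file_id})
    con.dispose()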
Example No. 2
def get_file_token(file_id):
    is_id = check_record('file_uploads', 'id', file_id)
    if not is_id:
        return False
    con = get_engine()
    result = con.execute(
        "select file_token from file_uploads where id={};".format(file_id))
    file_token = result.fetchone()[0]
    con.dispose()
    return file_token
Example No. 3
def get_columns(table):
    con = get_engine()
    cols = con.execute("desc {};".format(table))
    cols = [
        x[0] for x in list(cols)
        if x and x[0] not in [u'id', u'created_date', u'modified_date']
    ]
    con.dispose()
    return cols
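The desc statement is MySQL-specific. A quick usage sketch (the table name is illustrative, reusing one that appears elsewhere in these examples):

cols = get_columns('source_reviews')
# e.g. ['product_id', 'review_id', 'review_text', ...] with the
# id / created_date / modified_date housekeeping columns filtered out.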
Example No. 4
def get_colmap(account_id):
    con = get_engine()
    col_maps = con.execute(
        "select source_column_name, target_column_name from source_review_column_map "
        "where account_id={};".format(account_id))
    col_maps = dict([(x[0].strip(), x[1].strip())
                     for x in col_maps.fetchall()])
    con.dispose()
    return col_maps
Example No. 5
def sentiment_upload(file_name, table, file_id):
    con = get_engine()
    data = pd.read_csv(file_name,
                       sep='~',
                       quoting=csv.QUOTE_ALL,
                       encoding='utf-8')
    data['file_id'] = int(file_id)
    data = data[[
        'category_id', 'file_id', 'source_review_id', 'source_product_id',
        'aspect', 'sentiment_type', 'sentiment_text', 'start_index',
        'end_index', 'confidence-score'
    ]]
    data.rename(columns={
        'source_review_id': 'review_id',
        'source_product_id': 'product_id',
        'aspect': 'aspect_id',
        'confidence-score': 'confidence_score'
    },
                inplace=True)
    con.execute("delete from {} where file_id in ({});".format(table, file_id))
    print "Upload is started"

    data = data.sort_values(['confidence_score'], ascending=False)
    data = data.drop_duplicates(
        ['product_id', 'review_id', 'aspect_id', 'sentiment_text'],
        keep='first')
    data.to_sql(table,
                if_exists='append',
                con=con,
                index=False,
                chunksize=1000)
    # con.execute("update file_uploads set status='3' where id={};".format(file_id))
    print "Upload is completed for file {}".format(file_id)
    con.execute(
        "update file_uploads set status=2 where id = {};".format(file_id))
    # Send notification mail to the user
    # user_mail = con.execute("select email, first_name, file_token from customer c join file_uploads f on "
    # "f.customer_id=c.id where f.id={};".format(file_id))
    # user_mail, user_name, file_token = user_mail.fetchone()

    # Close the connection
    con.dispose()
    # try:
    #     requests.get("http://192.168.2.112:8080/completeAnalysis?fileToken={}".format(file_token), timeout=5)
    #     # send_mail.send_mail('*****@*****.**', [user_mail], "Completed processing your file!", "Dear {},
    #     # <br/> Successfully processed your reviews file and extracted the sentiment snippets." "<br/>You can
    #     # download the file from the downloads section!".format(user_name))
    # except:
    #     pass
    return True
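The sort-then-drop_duplicates pair above keeps only the highest-confidence snippet per (product, review, aspect, text) key. A tiny self-contained illustration of that pattern:

import pandas as pd

# Toy frame: two rows share the dedup key but differ in confidence.
df = pd.DataFrame({'review_id': [1, 1], 'aspect_id': [5, 5],
                   'sentiment_text': ['great battery', 'great battery'],
                   'confidence_score': [0.70, 0.95]})
df = df.sort_values(['confidence_score'], ascending=False)
df = df.drop_duplicates(['review_id', 'aspect_id', 'sentiment_text'],
                        keep='first')
# One row survives: the one with confidence_score 0.95.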
Example No. 6
def update_column(file_id, file_url):
    con = get_engine()
    try:
        file_url = file_url.split("?")[0]
        query = """update file_uploads set download_url="{}" where id={};""".format(
            file_url, file_id)
        con.execute(query)
        time.sleep(1)
        con.dispose()
        return True
    except Exception as Ex:
        con.dispose()
        logging.error("Failed to update download url!\n{}".format(str(Ex)))
        return False
Example No. 7
def check_record(table, column, value):
    con = get_engine()
    recs = 0
    if str(value).isdigit():
        recs = con.execute("select * from {} where {}={};".format(
            table, column, value))
    else:
        recs = con.execute("select * from {} where {}='{}';".format(
            table, column, value))
    con.dispose()
    return bool(list(recs))
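Because the value is interpolated straight into the SQL string, the function needs the digit/string branching above. A hedged sketch of a bound-parameter variant (hypothetical name; identifiers still have to be interpolated, so table and column are assumed to come from trusted code):

def check_record_bound(table, column, value):
    con = get_engine()
    recs = con.execute(
        text("select 1 from {} where {} = :val limit 1".format(table, column)),
        {'val': value})
    found = recs.fetchone() is not None
    con.dispose()
    return found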
Example No. 8
def review_uploader(file_name, table, file_id, category_id):
    con = get_engine()
    data = pd.read_csv(file_name,
                       sep='~',
                       quoting=csv.QUOTE_ALL,
                       encoding='utf-8')
    data['file_id'] = int(file_id)
    data.rename(columns={
        'source_review_id': 'review_id',
        'source_product_id': 'product_id',
        'review_tag': 'review_title'
    },
                inplace=True)
    data['category_id'] = category_id
    data = data[[
        'product_id', 'review_id', 'product_name', 'reviewer_name',
        'star_rating', 'review_date', 'review_title', 'category_id',
        'review_url', 'review_text', 'file_id'
    ]]
    data.to_sql(table,
                if_exists='append',
                con=con,
                index=False,
                chunksize=1000)
    con.execute(
        "update file_uploads set reviews_uploaded = (select count(*) from source_reviews "
        "where file_id=file_uploads.id);")
    con.execute(
        "update account_subscription set reviews_uploaded = (select count(*) from source_reviews sr "
        "join file_uploads fp on fp.id=sr.file_id join customer cr on cr.id=fp.customer_id "
        "where cr.account_id=account_subscription.account_id), "
        "category_count=(select count(distinct sr.category_id) from source_reviews sr "
        "join file_uploads fp on fp.id=sr.file_id join customer cr on cr.id=fp.customer_id "
        "where cr.account_id=account_subscription.account_id);")
    # Close the connection
    con.dispose()
Example No. 9
def table_to_file(table, file_id, sep=','):
    con = get_engine()
    table_recs = pd.read_sql(
        "select o.*, sr.review_text from {} o join source_reviews sr on sr.file_id=o.file_id "
        "and sr.review_id=o.review_id where o.file_id={} "
        "and sr.file_id={};".format(table, file_id, file_id),
        con=con)
    reviews = pd.read_sql(
        "select review_id from source_reviews where file_id={};".format(
            file_id),
        con=con)

    aspect_names = pd.read_sql(
        "select aspect_id, aspect_name "
        "from client_category_aspect "
        "where (account_id, category_id) in (select c.account_id, f.category_id "
        "from file_uploads f join customer c on c.id=f.customer_id "
        "where f.id={});".format(file_id),
        con=con)

    cat_aspect_names = pd.read_sql(
        "select id as aspect_id, aspect_name from category_aspect where category_id in "
        "(select category_id from file_uploads where id={});".format(file_id),
        con=con)

    # Coerce aspect_id to int: the db column is varchar because it previously stored aspect names
    table_recs.aspect_id = table_recs.aspect_id.map(int)

    # separate the validate status 'Y' and 'N'
    error_log = table_recs[~(table_recs.validate_status == 'Y')]
    table_recs = table_recs[(table_recs.validate_status == 'Y')]

    # Merge the aspect names based on aspect_id
    table_recs = table_recs.merge(aspect_names, on=['aspect_id'], how='left')
    error_log = error_log.merge(cat_aspect_names, on=['aspect_id'], how='left')

    # Extracting counts
    snippet_count = table_recs.shape[0]
    review_count = reviews.shape[0]

    positive = table_recs[table_recs.sentiment_type ==
                          'positive'].confidence_score
    negative = table_recs[table_recs.sentiment_type ==
                          'negative'].confidence_score
    pmean, pstd = positive.mean(), positive.std()
    nmean, nstd = negative.mean(), negative.std()
    norm = abs(pmean - nmean) / (pmean + nmean)
    if norm > 0.2:
        bias_msg = "Positive and negative confidence_score distributions are non-matching."
    elif pmean < 0.5:
        bias_msg = "Positive confidence_score distribution is biased towards zero."
    elif nmean < 0.5:
        bias_msg = "Negative confidence_score distribution is biased towards zero."
    else:
        bias_msg = "Confidence Score distribution seems good."

    del table_recs['aspect_id']
    del table_recs['id']
    try:
        del table_recs['validate_status']
        del table_recs['validate_comment']
    except KeyError:
        pass

    category_name = con.execute(
        "select c.category_name, f.file_name from category c join file_uploads f "
        "on f.category_id=c.id where f.id={};".format(file_id))
    category_name, raw_file_name = category_name.fetchone()
    con.dispose()

    base_name = "{}_{}_output.csv".format(raw_file_name, file_id)
    base_name2 = "{}_{}_error_log.csv".format(raw_file_name, file_id)
    file_name = app.saas_config['downloads_dir'] + base_name
    file_name2 = app.saas_config['downloads_dir'] + base_name2
    table_recs.to_csv(file_name,
                      sep=sep,
                      quoting=csv.QUOTE_ALL,
                      encoding='utf8',
                      index=False)
    error_log.to_csv(file_name2,
                     sep=sep,
                     quoting=csv.QUOTE_ALL,
                     encoding='utf8',
                     index=False)
    zip_file_path = app.saas_config['downloads_dir'] + base_name.replace(
        '.csv', '.zip')
    zip_status = zip_files([file_name, file_name2], zip_file_path)
    if zip_status['errors']:
        raise ValueError("Failed in zipping the files!")
    try:
        os.remove(file_name)
        os.remove(file_name2)
    except OSError:
        pass
    return zip_file_path, review_count, snippet_count, pmean, nmean, bias_msg
Example No. 10
def file_upload(file_path, table, sep=',', req_ip="kiwi.enixta.com"):
    con = get_engine()
    try:
        records = pd.read_csv(file_path, sep=sep, quoting=csv.QUOTE_ALL)
    except parsers.ParserError:
        try:
            records = pd.read_csv(file_path,
                                  sep=sep,
                                  quotechar='"',
                                  escapechar='\\')
        except Exception as Ex:
            con.dispose()
            return {
                "errors": [
                    "There was some error reading the file! Please format the file with proper quoting and "
                    "delimiter as ','(comma)."
                ],
                "messages": [
                    "There was some error reading the file! Please format the file with proper quoting "
                    "and delimiter as ','(comma)."
                ]
            }
    records = records[~(records.file_id.isnull())]
    file_ids = map(int, map(float, map(str, list(records.file_id.unique()))))

    # Drop the unwanted columns
    records = records[[
        'category_id', 'file_id', 'review_id', 'product_id', 'aspect_name',
        'sentiment_type', 'sentiment_text', 'start_index', 'end_index',
        'confidence_score'
    ]]

    # Strip the starting and trailing spaces for text fields
    records = records.apply(
        lambda row:
        [x.strip() if isinstance(x, (str, unicode)) else x for x in row],
        axis=1)

    # Check the existence of file_id in db
    missing_file_ids = []
    for file_id in file_ids:
        is_file_id = check_record('file_uploads', 'id', file_id)
        if not is_file_id:
            missing_file_ids.append(file_id)
    if missing_file_ids:
        con.dispose()
        return {
            "errors":
            "Missing file_ids in database {}! ".format(','.join(
                map(str, missing_file_ids))),
            "messages":
            ""
        }

    # check for multiple file ids in the file
    if len(file_ids) > 1:
        con.dispose()
        return {
            "errors":
            "Multiple file ids found in the file {}.".format(file_path),
            "messages": ""
        }

    # Raise error if there are no file ids present
    if len(file_ids) == 0:
        con.dispose()
        return {
            "errors":
            "File id column is empty in the file {}.".format(file_path),
            "messages": ""
        }

    # get the unique aspect names from the file
    aspect_names = list(records.aspect_name.unique())
    # check for the missing / extra aspects present in the file
    extra_aspects = check_aspects(file_ids[0], aspect_names, con)
    if extra_aspects:
        # if extra aspects present in the file return an error
        con.dispose()
        return {
            "errors":
            "Not required aspect names are present in the file!\n{}".format(
                ','.join(map(str, extra_aspects))),
            "messages":
            ""
        }

    # Todo: Need to add the error return code to polarity also
    # remove the not required polarity snippets in the file
    records = records[records.sentiment_type.isin(['positive', 'negative'])]

    # Delete the entries from the db table if pre-existing
    if file_ids:
        # Todo: Try to suppress the reference check in db
        con.execute("delete from {} where file_id in ({});".format(
            table, ','.join(map(str, file_ids))))
    # Todo: make db upload async
    records.to_sql(table,
                   if_exists='append',
                   con=con,
                   index=False,
                   chunksize=1000)

    file_id = file_ids[0]
    user_mail = con.execute(
        "select email, first_name, file_token from customer c join file_uploads f on "
        "f.customer_id=c.id where f.id={};".format(file_id))
    user_mail, user_name, file_token = user_mail.fetchone()
    final_files = {'id': file_id, 'token': file_token}

    con.dispose()
    return {
        "data":
        final_files,
        "errors":
        "",
        "messages":
        "Successfully uploaded the file to {}! "
        "Valid record count is {}.".format(
            table.replace('_', ' ').title(), records.shape[0])
    }
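Callers are expected to check the returned dict's 'errors' key before touching 'data', as output_to_db does below. A minimal usage sketch (the file path is a placeholder):

result = file_upload('/tmp/final_output.csv', 'sentiment_ouput', sep=',')
if result.get('errors'):
    print(result['errors'])
else:
    print(result['data'])  # {'id': <file_id>, 'token': <file_token>}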
Example No. 11
def duplicate_merge(file_id, level=1):
    db = get_engine()
    file_name = "downloads/index_merged_{}_{}.csv".format(file_id, level)
    if level == 1:
        snippet_table = 'sentiment_output_temp'
    elif level == 2:
        # Note: the table name is spelled 'sentiment_ouput' in the schema.
        snippet_table = 'sentiment_ouput'
    else:
        raise ValueError("Unsupported level {}".format(level))
    reviews_without_text = pd.read_sql(
        "select * from {} where file_id={};".format(snippet_table, file_id),
        con=db)
    reviews_without_text.rename(columns={'aspect_name': 'aspect_id'},
                                inplace=True)
    reviews_without_text = reviews_without_text[[
        'id', 'product_id', 'review_id', 'aspect_id', 'sentiment_text'
    ]]
    time.sleep(1)
    review_text = pd.read_sql(
        "select review_id, review_text from source_reviews "
        "where file_id={};".format(file_id),
        con=db)

    reviews_with_text = reviews_without_text.merge(review_text,
                                                   on=['review_id'],
                                                   how='left')
    reviews_with_text = pd.DataFrame(
        list(reviews_with_text.apply(df_unicode, axis=1)))
    merged_text_dicts = {}
    for row in reviews_with_text.to_dict('records'):
        source_review_id = row['review_id']
        aspect = row['aspect_id']
        if source_review_id not in merged_text_dicts:
            merged_text_dicts[source_review_id] = {}
        if aspect not in merged_text_dicts[source_review_id]:
            merged_text_dicts[source_review_id][aspect] = []
        merged_text_dicts[source_review_id][aspect].append(row)
    merged_text_dicts = merged_text_dicts.values()

    del reviews_with_text

    outfile_path = file_name.split('.')[0] + '_output.csv'

    run_parallel(merged_text_dicts, outfile_path)

    out_reviews = pd.read_csv(outfile_path,
                              quoting=csv.QUOTE_ALL,
                              delimiter=',')
    # Rows where no snippet was found in the review text
    index_no_sentiment_text = (out_reviews.start_index == -1) | (
        out_reviews.end_index == -1)
    errors_snippet_not_found = out_reviews[index_no_sentiment_text]
    if level == 1 and not errors_snippet_not_found.empty:
        cant_find_snip_ids = ', '.join(
            map(str, map(int, list(errors_snippet_not_found.id.unique()))))
        db.execute(
            "update {} set validate_status='N', validate_comment='Not found in review' "
            "where id in ({});".format(snippet_table, cant_find_snip_ids))
        out_reviews = out_reviews[~index_no_sentiment_text]

    merged_reviews = out_reviews[out_reviews.overlap == 1]

    if level == 1 and not merged_reviews.empty:
        merged_ids = ', '.join(
            map(str, map(int, list(merged_reviews.id.unique()))))
        db.execute(
            "update {} set validate_status='N', validate_comment='Overlapping' "
            "where id in ({});".format(snippet_table, merged_ids))
    for row in out_reviews.to_dict('records'):
        rec_id = row['id']
        start_index = row['start_index']
        end_index = row['end_index']
        sentiment_text = row['sentiment_text']

        query = text("update {} set start_index=:sindx, end_index=:eindx, sentiment_text=:stext " \
                     "where id=:rid".format(snippet_table))
        db.execute(
            query, {
                'sindx': start_index,
                'eindx': end_index,
                'stext': sentiment_text,
                'rid': rec_id
            })
    try:
        os.remove(outfile_path)
    except:
        pass
    db.dispose()

    return True
Example No. 12
def output_to_db():
    # Parsing parameters from the request
    args = dict(request.args)
    file_url = args.get('file', [None])[0]
    req_ip = request.remote_addr
    if file_url:
        # download the file from the url
        file_name = download_file(file_url)

        # Get column names of the target table from db
        table_cols = get_columns('sentiment_ouput')
        # Check the required columns present in the file or not
        missing_cols = file_validation_checks.check_file_header(file_name,
                                                                table_cols,
                                                                sep=',')
        if not missing_cols:
            # Uploading final files to DB
            file_creation = file_upload(file_name, 'sentiment_ouput', sep=',')

            # if errors in review send alert to internal mail group
            if file_creation['errors']:
                msg = {'message': file_creation}
                app.logger.error(str(file_creation['errors']))
                send_mail(
                    alert_mails,
                    "Error in generating final output file for file {}".format(
                        file_name),
                    "Error: {}".format(file_creation['errors']))
                return return_err_response(msg, 298)

            file_status = file_creation['data']
            file_id = file_status.get('id')
            file_token = file_status.get('token')

            # Start index generation and duplicate snippet merging
            duplicates_merge.duplicate_merge(file_id, 2)
            db = get_engine()

            # create the final output file for s3 upload
            file_path = create_final_output_file(file_id, db)
            if 'errors' in file_path:
                db.dispose()
                msg = {'message': {'errors': 'Error in final file creation!'}}
                send_mail(
                    alert_mails,
                    "Error in generating final output file for file {}".format(
                        file_name), "Error: {}".format(file_path['errors']))
                return return_err_response(msg, 298)
            db.dispose()

            file_path = file_path['file']

            # Upload the final output to s3
            final_file_url = upload_to_s3(file_path)

            # Update the s3 file url in db
            update_column(file_id=file_id, file_url=final_file_url)

            # Call the Complete analysis api to send an alert to the customer
            try:
                requests.get(
                    "https://{}:8443/completeAnalysis?fileToken={}".format(
                        str(req_ip), file_token),
                    timeout=10,
                    verify=False)
            except Exception as apiex:
                app.logger.error(
                    "Failed to call the process completion email api!\n{}".
                    format(str(apiex)))

            return return_err_response({'message': file_creation}, 200)

        # For missing columns
        else:
            msg = {
                'message': {
                    "errors":
                    "Missing columns in the file!"
                    "Few columns are missing in the file! {}".format(
                        ','.join(missing_cols))
                }
            }
            return return_err_response(msg, 298)

    # If file url is missing in the request parameters
    else:
        msg = {
            'message': {
                "errors": "Missing file_url in the parameter list!"
            }
        }
        return return_err_response(msg, 298)
Example No. 13
def run_smartpulse():
    args = dict(request.args)
    file_id = args.get('file_id', [None])[0]
    category_id = int(args.get('category_id', [0])[0])
    category = args.get('category', [None])[0]
    file_url = args.get('file', [None])[0]
    classification = "sentiment"
    account_id = int(args.get('account_id', [0])[0])

    if all([file_id, category_id, account_id, classification]):
        file_name_db = get_file_name(file_id)
        # download the file from the url
        file_name = download_file(file_url)
        colmap = get_colmap(account_id)
        if not colmap:
            send_mail(
                alert_mails,
                "Error processing file {}! File configuration is missing "
                "for account id {}".format(file_name_db, account_id),
                "Column mapping is missing for account id {}!".format(
                    account_id))
            msg = {
                'message': {
                    "errors":
                    "Column mapping is missing for account id {}! ".format(
                        account_id)
                }
            }
            return return_err_response(msg, 299)

        # format the file as required for smartpulse module
        check_point = file_validation_checks.order_columns(
            file_name, [
                'source', 'source_review_id', 'source_product_id',
                'product_name', 'review_date', 'star_rating', 'verified_user',
                'reviewer_name', 'review_url', 'review_tag', 'review_text'
            ], colmap)
        if check_point['errors']:
            app.logger.error(str(check_point['errors']))
            msg = {'message': check_point}
            return return_err_response(msg, 299)
        time.sleep(10)

        # Check that the file_id exists in file_uploads
        file_exist = check_record('file_uploads', 'id', file_id)
        if not file_exist:
            send_mail(
                alert_mails,
                "Error occured while uploading reviews to database for file id {}!"
                .format(file_id),
                "Invalid file id {}. File id is missing in the file_uploads table!"
                .format(file_id))
            msg = {
                'message': {
                    "errors":
                    "No record for file_id {} in database! ".format(file_id)
                }
            }
            return return_err_response(msg, 299)

        # Todo (completed): Create a function to extract the lexicon data from db and create a config dir with that data
        # Creating config for smartpulse generation
        root_dir = os.path.join(BASE_DIR, 'smaartpulse/lexicons_db')
        final_dir = os.path.join(BASE_DIR, 'downloads')
        db = get_engine()
        pulse_config = LexiconMetadata(root_dir, file_id, account_id,
                                       category_id, db, file_name, final_dir,
                                       classification)
        db.dispose()
        if not pulse_config.status:
            error_msg = pulse_config.status_msg
            send_mail(
                alert_mails,
                "Lexicon config error for category {} with id {}.".format(
                    pulse_config.category, category_id), error_msg)
            return return_err_response({'errors': error_msg}, 299)

        # Start uploading the reviews to database
        try:
            review_uploader(pulse_config.input_file, 'source_reviews', file_id,
                            category_id)
        except sqlalchemy.exc.IntegrityError as err:
            trace = traceback.format_exc()
            app.logger.error(str(trace))
            if "Duplicate entry" in str(err):
                msg = {
                    'message': {
                        "errors":
                        "Some error has occured while uploading reviews to database! "
                        "Duplicate entry for the table Source Reviews."
                    }
                }
                send_mail(
                    alert_mails,
                    "Error occured while uploading reviews to database for file {} with "
                    "id {}!".format(file_name_db, file_id),
                    "Duplicate entry for the table Source Reviews.")
                return return_err_response(msg, 299)
        except Exception as Ex:
            trace = traceback.format_exc()
            app.logger.error(str(trace))
            send_mail(
                alert_mails,
                "Error occured while uploading reviews to database for file {} with "
                "id {}!".format(file_name_db, file_id),
                "There was error occured while uploading reviews!")
            msg = {
                'message': {
                    "errors":
                    "Some error has occured while uploading reviews to database! "
                    "Sent an internal alert for notification."
                }
            }
            return return_err_response(msg, 299)

        # Trigger the smartpulse module
        smartpulse.trigger_smartpulse(pulse_config)
        status = pulse_config.output

        result = status.get('status', None)
        if result:
            return upload_output(pulse_config, file_id=file_id)

        else:
            app.logger.error(str(status['error_stack']))
            send_mail(
                alert_mails,
                "Error in generating smartpulse for file {} with id {}".format(
                    file_name, file_id),
                str('\n'.join(status['errors'])).replace('\\n', '<br/>') +
                "<br/><br/><b>Error Stack:</b>" +
                str(status['error_stack']).replace('\\n', '<br/>'))
            return return_err_response(status, 299)
    else:
        missing_parameters = []
        for param, value in dict(
                zip(['file_id', 'category_id', 'account_id', 'classification'],
                    [file_id, category_id, account_id, classification
                     ])).items():
            if not value:
                missing_parameters.append(param)
        return parameter_missing_error(missing_parameters)
Example No. 14
def upload_output(pulse_config, file_id):
    result_dict = pulse_config.output

    # Read the treemap file from the result of smartpulse module
    treemap_file = result_dict['output_file']
    print "Upload output ", treemap_file
    # Verify the file structure and data whether it is as required
    checks = file_validation_checks.file_validator(outputfile=treemap_file)
    file_name = get_file_name(file_id)
    # If there are errors, send a mail to the internal group
    if checks['errors']:
        # Output file validation error send mail to ml-team
        app.logger.error(str(checks['errors']))
        msg = {'message': checks}
        send_mail(
            alert_mails,
            "Error in generating smaartpulse for file {} (id {})".format(
                file_name, file_id),
            str('\n'.join(checks['errors'])).replace('\n', '<br/>'))
        return return_err_response(msg, 299)

    else:
        # if checks['errors'] and False:
        #     send_mail(['*****@*****.**'],
        #                         "Error in generating smartpulse for file id {}".format(file_id),
        #                         str('\n'.join(checks['errors'])).replace('\n', '<br/>'))
        try:
            # Upload snippets to database
            upload = sentiment_upload(treemap_file, 'sentiment_output_temp',
                                      file_id)
            if upload:
                # Start validating the snippets

                # External lexicon validation implemented in backend
                lexicon_validation(file_id)

                # Update the not required snippets status to 'N'
                db = get_engine()
                db.execute(
                    "update sentiment_output_temp set validate_status='N', validate_comment='Aspect not required' "
                    "where aspect_id not in (select aspect_id from client_category_aspect "
                    "where (account_id, category_id) in (select c.account_id, f.category_id "
                    "from file_uploads f join customer c on c.id=f.customer_id where f.id={})) "
                    "and file_id={}".format(file_id, file_id))
                time.sleep(0.5)
                db.execute(
                    "update sentiment_output_temp set validate_status='N', validate_comment='Polarity not required' "
                    "where file_id={} "
                    "and LOWER(trim(sentiment_type)) not in ('positive','negative');"
                    .format(file_id))
                # Close the db connection
                db.dispose()

                app.index_merge_queue.append((file_id, 'index', 1))
                merged_files = app.index_merge_queue_output
                while file_id not in merged_files:
                    time.sleep(60)
                merged_files.remove(file_id)
                print "got merged file for nlp"
                gnlp = google_nlp_validation(file_id, file_name)
                if gnlp:
                    file_for_validation(file_id)
                else:
                    return return_err_response(
                        "Error in google nlp validation", 299)
            try:
                shutil.rmtree(pulse_config.client_dir)
            except:
                print "error in deleting client lexicon config dir"
            return return_response("successfully generated snippets.", 200)
        except Exception as Ex:
            trace = traceback.format_exc()
            app.logger.error(str(trace))
            msg = {'message': {'errors': [str(Ex)], 'error_stack': str(trace)}}
            send_mail(
                alert_mails,
                "Error in uploading smartpulse to db for file {} with id {}".
                format(file_name, file_id),
                str(Ex).replace('\\n', '<br/>') +
                "<br/><br/><b>Error Stack:</b>" +
                str(trace).replace('\\n', '<br/>'))
            return return_err_response(msg, 299)
Example No. 15
def google_nlp(file_id, output_path='downloads/google'):
    output_file = os.path.join(output_path,
                               '{}_google_nlp.csv'.format(file_id))
    cluster_size = 1000

    db = get_engine()
    reviews = pd.read_sql(
        "select product_id, review_id, review_text from source_reviews "
        "where file_id={} and (file_id, review_id) in "
        "(select distinct file_id, review_id from sentiment_output_temp "
        "where file_id={} and validate_status='Y');".format(file_id, file_id),
        con=db)
    reviews = reviews[['product_id', 'review_id', 'review_text'
                       ]].drop_duplicates(['review_id', 'review_text'])
    reviews = pd.DataFrame(list(reviews.apply(df_unicode, axis=1)))
    if reviews.empty:
        raise ValueError("No Reviews found for file id {}".format(file_id))
    reviews.review_text = reviews.review_text.apply(punctuation_correction)

    try:
        previous_run = pd.read_csv(output_file,
                                   delimiter=',',
                                   quotechar='"',
                                   names=[
                                       'product_id', 'review_id', 'sentence',
                                       'sentiment_score', 'sentiment_magnitude'
                                   ])
        reviews = reviews[~reviews.review_id.
                          isin(set(list(previous_run.review_id)))]
        del previous_run

    except:
        pass

    print "Review Count is ", reviews.shape[0]

    if not reviews.empty:
        credentials_list = get_creds()
        manager = mp.Manager()
        review_queue = manager.Queue()

        for product_id in set(list(reviews.product_id)):
            review_texts = reviews[reviews.product_id == product_id].to_dict(
                'records')
            row_count = len(review_texts)
            # Ceiling division: number of chunks of at most cluster_size rows
            clusters = (row_count + cluster_size - 1) // cluster_size
            for cl in range(clusters):
                cluster_range1 = cl * cluster_size
                cluster_range2 = min([(cl + 1) * cluster_size, row_count])
                cluster = review_texts[cluster_range1:cluster_range2]
                review_queue.put(cluster)

        output_queue = manager.Queue()

        # Start the writer process that drains the output queue
        write_process = mp.Process(target=to_file,
                                   args=(output_queue, 1, output_file))
        write_process.start()

        gcloud_client(review_queue,
                      output_queue,
                      credentials_list,
                      direct=True)

        # Wait for the writer process to finish, then terminate it
        write_process.join()
        write_process.terminate()

    logger.info(
        "Completed Extracting google nlp snippets for file_id {}".format(
            file_id))

    nlp_snippets = pd.read_csv(output_file,
                               sep=',',
                               quotechar='"',
                               quoting=csv.QUOTE_ALL,
                               names=[
                                   'product_id', 'review_id', 'sentence',
                                   'sentiment_score', 'sentiment_magnitude'
                               ]).drop_duplicates()
    nlp_snippets = nlp_snippets[~(nlp_snippets.sentence.isnull())]
    sent = []
    for row in nlp_snippets.to_dict("records"):
        if float(row["sentiment_score"]) > 0.0:
            row["sentiment"] = "positive"
        elif float(row["sentiment_score"]) < -0.25:
            row["sentiment"] = "negative"
        else:
            row["sentiment"] = "neutral"
        sent.append(row)
    nlp_snippets = pd.DataFrame.from_dict(sent)
    del sent
    enx_snippets = pd.read_sql(
        "select id, review_id, sentiment_text, sentiment_type, confidence_score "
        "from sentiment_output_temp "
        "where file_id={} and validate_status='Y';".format(file_id),
        con=db)

    review_ids = list(enx_snippets.review_id.unique())
    enx_snippets['sent_text_cleaned'] = enx_snippets.sentiment_text.apply(
        lambda x: ''.join([y for y in x.lower() if y.isalnum()]))
    enx_snippets_dict = dict()
    for row in enx_snippets.to_dict('records'):
        review_id = row['review_id']
        if review_id not in enx_snippets_dict:
            enx_snippets_dict[review_id] = list()
        enx_snippets_dict[review_id].append(row)
    del enx_snippets

    nlp_snippets['sent_text_cleaned'] = nlp_snippets.sentence.apply(
        lambda x: ''.join([y for y in x.lower() if y.isalnum()]))
    nlp_snippets_dict = dict()
    for row in nlp_snippets.to_dict('records'):
        review_id = row['review_id']
        if review_id not in nlp_snippets_dict:
            nlp_snippets_dict[review_id] = list()
        nlp_snippets_dict[review_id].append(row)
    del nlp_snippets

    match = []
    non_match = []
    print "Started validation against google nlp output!"
    for review_id in review_ids:
        reviews_google = nlp_snippets_dict.get(review_id, [])
        reviews_enixta = enx_snippets_dict[review_id]
        for row1 in reviews_enixta:
            row_id = row1['id']
            found = False
            for row2 in reviews_google:
                if row1['sent_text_cleaned'] in row2['sent_text_cleaned'] \
                        or row2['sent_text_cleaned'] in row1['sent_text_cleaned']:
                    found = True
                    row1['sentiment_type'] = row1['sentiment_type'].strip()
                    row1['sentiment_type'] = row1['sentiment_type'].replace(
                        'most-positive', 'positive')
                    row1['sentiment_type'] = row1['sentiment_type'].replace(
                        'most-negative', 'negative')
                    if row2['sentiment'] == row1['sentiment_type'] or row1[
                            'confidence_score'] > 0.95:
                        match.append(row_id)
                    else:
                        non_match.append(row_id)
                    break
            if not found:
                if row1['confidence_score'] > 0.95:
                    match.append(row_id)
                else:
                    non_match.append(row_id)
    non_match_ids = ', '.join(map(str, non_match))
    print "completed google nlp comparision"
    if non_match_ids:
        query = "update sentiment_output_temp set validate_status='N', validate_comment='Failed Google NLP Validation' " \
                "where id in ({})".format(non_match_ids)
        db.execute(query)

    db.dispose()
    return True
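The match test above lowercases both snippets, strips non-alphanumerics, and then checks containment in either direction. A tiny illustration:

clean = lambda x: ''.join([y for y in x.lower() if y.isalnum()])
a = clean("Great battery life!")   # 'greatbatterylife'
b = clean("great battery life")    # 'greatbatterylife'
print(a in b or b in a)            # True -> counted as a match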