Example #1
0
    def getTopKWS(self, ids):
        conf = getconf2()
        s = SqlFuncs(conf)

        # Get blogsites in tracker
        connection = s.get_connection(conf)
        with connection.cursor() as cursor:
            cursor.execute(f"""
                select n.term, sum(n.occurr) occurrence
                from blogpost_terms_api, 
                json_table(terms_test,
                '$[*]' columns(
                    term varchar(255) path '$.term',
                    occurr int(11) path '$.occurrence'
                    )
                ) 
                as n
                where blogsiteid in ({ids})
                group by n.term
                order by occurrence desc
                limit 100;
            """)

            records = cursor.fetchall()

            if records:
                data = [x['term'] for x in records]
            else:
                data = None

        connection.close()
        cursor.close()

        return data
Example #2
0
    def updateStatus(self, status, tid):
        conn = getconf2()
        s = SqlFuncs(conn)

        if int(status) < 100:
            data = 0, status, tid
            s.update_insert(
                "update tracker_keyword set status = %s, status_percentage = %s where tid = %s",
                data, conn)
        else:
            data = 1, status, tid
            s.update_insert(
                "update tracker_keyword set status = %s, status_percentage = %s where tid = %s",
                data, conn)
Example #3
0
    def testingKWT(self, tid, ip, parallel, update__status, num_processes):
        conf = getconf2()
        s = SqlFuncs(conf)

        # Get blogsites in tracker
        connection = s.get_connection(conf)
        with connection.cursor() as cursor:
            cursor.execute(f"""select * from trackers where tid = {tid}""")
            records = cursor.fetchall()
            if records:
                query = records[0]['query']
                if 'blogsite_id in (' in query:
                    blog_ids = query[query.find("(") + 1:query.find(")")]
                    if blog_ids and 'NaN' not in blog_ids:
                        blog_ids = blog_ids[:-1] if ',' == blog_ids[
                            -1] else blog_ids

                        cursor.execute(
                            f"""select post from blogposts where blogsite_id in ({blog_ids})"""
                        )
                        records = cursor.fetchall()

                        # Get top terms from blog_ids
                        try:
                            terms_result = getTopKWS(blog_ids)
                        except Exception as e:
                            print(e)
                            try:
                                print('Retrying...')
                                terms_result = getTopKWS(blog_ids)
                                print('success')
                            except Exception as e:
                                terms_result = []
                                print(e)

                        # Count terms and group by year
                        if terms_result:
                            data_ = []
                            for term in terms_result:
                                PARAMS = term, blog_ids, tid
                                data_.append(PARAMS)

                            if parallel:
                                print("starting multi-process")
                                process_pool = ProcessPool(num_processes)
                                pbar = tqdm(process_pool.imap(
                                    single_process, data_),
                                            desc="Terms",
                                            ascii=True,
                                            file=sys.stdout,
                                            total=len(data_))
                                for x in pbar:
                                    pbar.update(1)

                                    # Update status on DB
                                    if update__status:
                                        status = round(
                                            (pbar.last_print_n / len(data_)) *
                                            100)
                                        if status <= 99 and status >= 90:
                                            status = 100
                                        updateStatus(status, tid)

                                process_pool.close()
                                print("Joining pool")
                                process_pool.join()
                                print("Clearing pool")
                                process_pool.clear()
                                print("Finished!")

                            else:
                                for x in data_:
                                    single_process(x)
Example #4
0
    def single_process(self, parameters):
        from Utils.functions import clean_text, getconf2
        from Utils.sql import SqlFuncs
        import pandas as pd
        import ast

        term, blog_ids, tid = parameters
        # start = time.time()
        # print('here-2')

        conf = getconf2()
        s = SqlFuncs(conf)

        # Get blogsites in tracker
        connection = s.get_connection(conf)
        with connection.cursor() as cursor:
            cursor.execute(f"""
            select * from 
            (SELECT date, post,title, blogsite_id, blogpost_id,
            ROUND ((LENGTH(lower(post)) - LENGTH(REPLACE (lower(post), "{term}", ""))) / LENGTH("{term}")) AS count 
            from (select * from blogposts where match (title, post) against ('{term}' IN BOOLEAN MODE)) bp) a where blogsite_id in ({blog_ids})
            """)

            final_data = {}
            total_data = {}

            records = cursor.fetchall()

            if records:
                df = pd.DataFrame(records)
                df['count'] = df['count'].map(int)
                df['date'] = pd.to_datetime(df['date'], errors='coerce')
                year_count = df.groupby(
                    df.date.dt.year)['count'].sum().to_dict()

                final_data[term] = year_count
                total_data[term] = sum(list(year_count.values()))

                # check if key already exists in DB
                cursor.execute(
                    f'select terms, keyword_trend from tracker_keyword where tid = {tid}'
                )
                checked_result = cursor.fetchall()

                if checked_result:
                    terms_checked = ast.literal_eval(
                        checked_result[0]
                        ['terms']) if 'terms' in checked_result[0] else None
                    kwt_checked = ast.literal_eval(
                        checked_result[0]['keyword_trend']
                    ) if 'keyword_trend' in checked_result[0] else None

                else:
                    terms_checked = json.loads("{}")
                    kwt_checked = json.loads("{}")

                terms_checked[term] = total_data[term]
                kwt_checked[term] = final_data[term]

                # Insert if not yet created
                conn = getconf2()
                data = tid, str(
                    terms_checked), f'blogsite_id in ({blog_ids})', str(
                        kwt_checked)
                s.update_insert(
                    "insert into tracker_keyword (tid, terms, query, keyword_trend) values (%s, %s, %s, %s)",
                    data, conn)

                # Update if already created
                data = str(terms_checked), f'blogsite_id in ({blog_ids})', str(
                    kwt_checked), tid
                s.update_insert(
                    "update tracker_keyword set terms = %s, query = %s, keyword_trend = %s where tid = %s",
                    data, conn)

        connection.close()
Example #5
0
    parallel_main = False
    num_processes_main = 6

    # Process updates function
    def process_updates(x):
        from Utils.functions import clean_text, getconf2, updateStatus, getTopKWS, single_process, testingKWT
        num_processes = 24
        update__status = True
        parallel = False
        tid = x['tid']
        print(tid)
        # tid = 450
        testingKWT(tid, '144.167.35.89', parallel, update__status,
                   num_processes)

    conf = getconf2()
    s = SqlFuncs(conf)

    # Get tracker id
    connection = s.get_connection(conf)
    with connection.cursor() as cursor:
        # cursor.execute("select tid from trackers where userid = '*****@*****.**' or YEAR(date_created) in (2019,2020)")
        cursor.execute("select tid from trackers")
        # cursor.execute("SELECT tid FROM blogtrackers.tracker_keyword where DATE(last_modified_time) != CURDATE() or status != 1 or status_percentage != 100")
        # cursor.execute("select tid from tracker_keyword where tid in( select tid from trackers where YEAR(date_created) in (2019,2020) or userid = '*****@*****.**') and DATE(last_modified_time) != CURDATE()")
        # cursor.execute("select tid from trackers where tid not in (select tid from tracker_keyword)")
        records = cursor.fetchall()
    connection.close()

    if parallel_main:
        # pool = Pool(int(6))