def getTopKWS(self, ids):
    """Return the top-100 most frequent terms across the given blog sites.

    Aggregates per-post term occurrence counts stored as JSON in
    ``blogpost_terms_api.terms_test`` (via MySQL ``JSON_TABLE``).

    Args:
        ids: comma-separated string of blogsite ids, spliced into the
            SQL ``IN (...)`` clause.

    Returns:
        list[str] of up to 100 terms ordered by total occurrence
        (descending), or ``None`` when the query matches no rows.
    """
    conf = getconf2()
    s = SqlFuncs(conf)
    connection = s.get_connection(conf)
    # Fix: close the connection exactly once, even on error. The original
    # called cursor.close() AFTER connection.close() (invalid order) and
    # leaked the connection if execute()/fetchall() raised. The `with`
    # block already closes the cursor.
    try:
        with connection.cursor() as cursor:
            # NOTE(review): `ids` is interpolated directly into the SQL —
            # injection risk if it ever carries user input. Callers appear
            # to pass ids extracted from an internal `trackers.query`
            # string, but parameterizing would still be safer.
            cursor.execute(f"""
                select n.term, sum(n.occurr) occurrence
                from blogpost_terms_api,
                     json_table(terms_test, '$[*]'
                         columns(
                             term varchar(255) path '$.term',
                             occurr int(11) path '$.occurrence'
                         )
                     ) as n
                where blogsiteid in ({ids})
                group by n.term
                order by occurrence desc
                limit 100;
            """)
            records = cursor.fetchall()
    finally:
        connection.close()
    # Preserve original contract: None (not []) when nothing matched.
    return [row['term'] for row in records] if records else None
def updateStatus(self, status, tid):
    """Persist keyword-extraction progress for a tracker.

    Writes ``status`` (done flag: 1 when percentage reached 100, else 0)
    and ``status_percentage`` to ``tracker_keyword`` for the given tid.

    Args:
        status: progress percentage (int or numeric string, 0-100).
        tid: tracker id identifying the row to update.
    """
    conn = getconf2()
    s = SqlFuncs(conn)
    # Fix: both branches of the original ran the identical UPDATE and
    # differed only in the first tuple element — compute the flag once.
    done_flag = 0 if int(status) < 100 else 1
    data = done_flag, status, tid
    s.update_insert(
        "update tracker_keyword set status = %s, status_percentage = %s where tid = %s",
        data, conn)
def testingKWT(self, tid, ip, parallel, update__status, num_processes):
    """Recompute keyword trends for one tracker.

    Loads the tracker row, extracts the blogsite-id list from its stored
    query string, fetches top terms for those sites, then processes each
    term via ``single_process`` — either through a process pool (with a
    progress bar and periodic DB status updates) or serially.

    Args:
        tid: tracker id to process.
        ip: unused in this body — presumably a worker/host address; verify
            against callers.  # NOTE(review)
        parallel: when truthy, fan out over a ProcessPool.
        update__status: when truthy, write progress to the DB while pooling.
        num_processes: pool size used when ``parallel`` is set.
    """
    conf = getconf2()
    s = SqlFuncs(conf)
    # Get blogsites in tracker
    connection = s.get_connection(conf)
    with connection.cursor() as cursor:
        # NOTE(review): tid interpolated into SQL — injection risk if tid
        # ever comes from untrusted input.
        cursor.execute(f"""select * from trackers where tid = {tid}""")
        records = cursor.fetchall()
        if records:
            query = records[0]['query']
            if 'blogsite_id in (' in query:
                # Slice out the comma-separated id list between the first
                # "(" and ")" of the stored query text.
                blog_ids = query[query.find("(") + 1:query.find(")")]
                if blog_ids and 'NaN' not in blog_ids:
                    # Drop a trailing comma left by the query builder.
                    blog_ids = blog_ids[:-1] if ',' == blog_ids[-1] else blog_ids
                    # NOTE(review): this result is immediately overwritten
                    # below and never used — looks like dead code; confirm
                    # before removing.
                    cursor.execute(
                        f"""select post from blogposts where blogsite_id in ({blog_ids})"""
                    )
                    records = cursor.fetchall()
                    # Get top terms from blog_ids — one retry on failure.
                    try:
                        terms_result = getTopKWS(blog_ids)
                    except Exception as e:
                        print(e)
                        try:
                            print('Retrying...')
                            terms_result = getTopKWS(blog_ids)
                            print('success')
                        except Exception as e:
                            terms_result = []
                            print(e)
                    # Count terms and group by year
                    if terms_result:
                        data_ = []
                        for term in terms_result:
                            # Each work item: (term, blogsite-id list, tracker id)
                            PARAMS = term, blog_ids, tid
                            data_.append(PARAMS)
                        if parallel:
                            print("starting multi-process")
                            process_pool = ProcessPool(num_processes)
                            pbar = tqdm(process_pool.imap(
                                single_process, data_),
                                desc="Terms", ascii=True,
                                file=sys.stdout, total=len(data_))
                            for x in pbar:
                                pbar.update(1)
                                # Update status on DB
                                if update__status:
                                    status = round(
                                        (pbar.last_print_n / len(data_)) * 100)
                                    # Snap near-complete progress to 100 so the
                                    # tracker is marked done.
                                    if status <= 99 and status >= 90:
                                        status = 100
                                    updateStatus(status, tid)
                            process_pool.close()
                            print("Joining pool")
                            process_pool.join()
                            print("Clearing pool")
                            process_pool.clear()
                            print("Finished!")
                        else:
                            # Serial fallback — no progress/status updates.
                            for x in data_:
                                single_process(x)
def single_process(self, parameters):
    """Compute per-year occurrence counts of one term and upsert them.

    For a single ``(term, blog_ids, tid)`` work item: counts term
    occurrences per matching blog post via SQL string arithmetic, groups
    the counts by post year with pandas, merges the result into any
    existing ``tracker_keyword`` row, then writes it back (insert followed
    by update).

    Args:
        parameters: 3-tuple ``(term, blog_ids, tid)`` — term string,
            comma-separated blogsite-id list, tracker id.
    """
    # Local imports: presumably so the function is picklable/self-contained
    # for the process pool — TODO confirm.
    from Utils.functions import clean_text, getconf2
    from Utils.sql import SqlFuncs
    import pandas as pd
    import ast
    term, blog_ids, tid = parameters
    # start = time.time()
    # print('here-2')
    conf = getconf2()
    s = SqlFuncs(conf)
    # Get blogsites in tracker
    connection = s.get_connection(conf)
    with connection.cursor() as cursor:
        # Occurrence count per post = (len(post) - len(post with term
        # removed)) / len(term), over posts matching the term in BOOLEAN
        # MODE full-text search.
        # NOTE(review): term and blog_ids are interpolated into SQL —
        # injection risk; a term containing a double quote would also
        # break the REPLACE expression.
        cursor.execute(f"""
            select * from (SELECT date, post,title, blogsite_id, blogpost_id,
            ROUND ((LENGTH(lower(post)) - LENGTH(REPLACE (lower(post), "{term}", ""))) / LENGTH("{term}")) AS count
            from (select * from blogposts where match (title, post) against ('{term}' IN BOOLEAN MODE)) bp) a
            where blogsite_id in ({blog_ids})
        """)
        final_data = {}
        total_data = {}
        records = cursor.fetchall()
        if records:
            df = pd.DataFrame(records)
            df['count'] = df['count'].map(int)
            # coerce: unparseable dates become NaT and fall out of groupby
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            # {year: total occurrences of term in that year}
            year_count = df.groupby(
                df.date.dt.year)['count'].sum().to_dict()
            final_data[term] = year_count
            total_data[term] = sum(list(year_count.values()))
            # check if key already exists in DB
            cursor.execute(
                f'select terms, keyword_trend from tracker_keyword where tid = {tid}'
            )
            checked_result = cursor.fetchall()
            if checked_result:
                # Stored columns hold Python-dict repr strings; parse them.
                # NOTE(review): `'terms' in checked_result[0]` tests dict
                # *key* membership, which is always true for a selected
                # column — probably meant to test for a non-NULL value.
                terms_checked = ast.literal_eval(
                    checked_result[0]
                    ['terms']) if 'terms' in checked_result[0] else None
                kwt_checked = ast.literal_eval(
                    checked_result[0]['keyword_trend']
                ) if 'keyword_trend' in checked_result[0] else None
            else:
                terms_checked = json.loads("{}")
                kwt_checked = json.loads("{}")
            # Merge this term's totals into the accumulated dicts.
            terms_checked[term] = total_data[term]
            kwt_checked[term] = final_data[term]
            # Insert if not yet created
            # NOTE(review): insert and update are both attempted every
            # time — presumably the insert silently fails on duplicate
            # tid and the update then takes effect; confirm that
            # update_insert swallows duplicate-key errors.
            conn = getconf2()
            data = tid, str(
                terms_checked), f'blogsite_id in ({blog_ids})', str(
                    kwt_checked)
            s.update_insert(
                "insert into tracker_keyword (tid, terms, query, keyword_trend) values (%s, %s, %s, %s)",
                data, conn)
            # Update if already created
            data = str(terms_checked), f'blogsite_id in ({blog_ids})', str(
                kwt_checked), tid
            s.update_insert(
                "update tracker_keyword set terms = %s, query = %s, keyword_trend = %s where tid = %s",
                data, conn)
    connection.close()
# Script-level driver configuration.
parallel_main = False        # run process_updates over trackers in parallel
num_processes_main = 6       # pool size for the main driver (when parallel)


# Process updates function
def process_updates(x):
    """Recompute keyword trends for the tracker row ``x``.

    Args:
        x: mapping with at least a ``'tid'`` key (a row fetched from
           ``trackers``).
    """
    # Local import: presumably keeps the worker self-contained for
    # multiprocessing pickling — TODO confirm.
    from Utils.functions import clean_text, getconf2, updateStatus, getTopKWS, single_process, testingKWT
    num_processes = 24
    update__status = True
    parallel = False
    tid = x['tid']
    print(tid)
    # tid = 450
    testingKWT(tid, '144.167.35.89', parallel, update__status, num_processes)


conf = getconf2()
s = SqlFuncs(conf)
# Get tracker id
connection = s.get_connection(conf)
with connection.cursor() as cursor:
    # Alternative selection queries kept for reference:
    # cursor.execute("select tid from trackers where userid = '*****@*****.**' or YEAR(date_created) in (2019,2020)")
    cursor.execute("select tid from trackers")
    # cursor.execute("SELECT tid FROM blogtrackers.tracker_keyword where DATE(last_modified_time) != CURDATE() or status != 1 or status_percentage != 100")
    # cursor.execute("select tid from tracker_keyword where tid in( select tid from trackers where YEAR(date_created) in (2019,2020) or userid = '*****@*****.**') and DATE(last_modified_time) != CURDATE()")
    # cursor.execute("select tid from trackers where tid not in (select tid from tracker_keyword)")
    records = cursor.fetchall()
connection.close()
# NOTE(review): the body of this branch continues beyond this chunk —
# left as-is.
if parallel_main:
    # pool = Pool(int(6))