def RunSync(ts):
    """
    Pick wc-db's table mapped with `ts` and scrape (useful) "clean" Content & WeightedContent from each url.
        * NOTE:
            * If content is already present in the table, "clean" it too & append the newly scraped content to it.
        * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k

    Input: ts (format: 1598692058.887741)
    """
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(SYNC) ................... => FILENAME: {}\n'
        .format(datetime.fromtimestamp(ts), 'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'))

    csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(int(ts)) + '.csv'
    csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_sync.csv'
    index = 1
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
        'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
        'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_dest_file, headers)

    f = csv.writer(open(csv_dest_file, "w"))    # flush the old file & rewrite the header row
    f.writerow(headers)

    with open(csv_src_file, mode='r') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Headers are {", ".join(row)}')
                line_count += 1

            # CHECK 1 (pre-scraping): if (Content != NULL) => no scraping, just write the row as-is
            if (len(row["Content"]) != 0):
                pc.printWarn(
                    "\t <ID = {} > [NO SCRAPING] Content already exists....putting it in as-is............. NOW: {}"
                    .format(row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                entry = [
                    row["ID"], row["SourceSite"], row["ProcessingDate"],
                    row["ProcessingEpoch"], row["CreationDate"], row["Title"],
                    row["Url"], row["SourceTags"], row["ModelTags"],
                    row["NumUpvotes"], row["NumComments"], row["PopI"],
                    text_actions.clean_text(row["Title"] + row["WeightedContent"]) + text_actions.getUrlString(row["Content"]),    # add the url-words too
                    text_actions.clean_text(row["Content"]) + text_actions.getUrlString(row["Content"])
                ]
                global WRITTEN_ENTRIES_SYNC
                WRITTEN_ENTRIES_SYNC += 1
                f = csv.writer(open(csv_dest_file, "a"))
                f.writerow(entry)

            # CHECK 2 (pre-scraping): if (Url == NULL)   => discard
            # CHECK 3 (pre-scraping): if (Title == NULL) => discard
            elif ((len(row["Url"]) != 0) and (len(row["Title"]) != 0)):
                pc.printWarn(
                    "\t <ID = {} > [SCRAPING BEGIN] sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}"
                    .format(row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                time.sleep(0.0001)
                try:
                    # response = web_requests.hitGetWithRetry(url, TIMEOUT=10)
                    response = web_requests.hitGetWithRetry(row["Url"], '', False, 2, 0.5, 60)
                    # if response.status_code == 200:
                    if response != -1:
                        # content = text_actions.contentfromhtml(response)                 # NOTE: for sync
                        content = text_actions.contentfromhtml(response.text)              # NOTE: for async
                        urlstrings = text_actions.getUrlString(content)
                        content += urlstrings                                              # add the url-words too
                        weightedcontent = text_actions.weightedcontentfromhtml(response.text) + row["Title"] + urlstrings    # add the url-words too
                        line_count += 1

                        # CHECK 1 (post-scraping): if (content == NULL) && (Title != NULL, already checked above)
                        #   => use the Title for both Content & WeightedContent
                        if (len(content) == 0):
                            content = row["Title"]
                            weightedcontent = row["Title"]
                        entry = [
                            row["ID"], row["SourceSite"], row["ProcessingDate"],
                            row["ProcessingEpoch"], row["CreationDate"], row["Title"],
                            row["Url"], row["SourceTags"], row["ModelTags"],
                            row["NumUpvotes"], row["NumComments"], row["PopI"],
                            text_actions.clean_text(weightedcontent),
                            text_actions.clean_text(content)
                        ]
                        f = csv.writer(open(csv_dest_file, "a"))
                        f.writerow(entry)
                        pc.printMsg(
                            "\t\t <ID = {} > ============== Scraping Done....... \t NOW: {}"
                            .format(row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                    else:
                        global SKIPPED_SYNC
                        SKIPPED_SYNC += 1
                        pc.printErr(
                            "\t\txxxxx SKIPPING... for ID: {} Unable to hit url: {} , "
                            .format(row["ID"], row["Url"]))
                except Exception as e:
                    global FAILED_SYNC
                    FAILED_SYNC += 1
                    pc.printErr(
                        "\t======= XXXXXXXX ERROR XXXXXX ======>> ID= {} NOW = {} Skipping...Failed due to: \n \t\t ERROR {}"
                        .format(row["ID"], time.strftime("%H:%M:%S", time.localtime()), e))
                    pass

    pc.printMsg(
        "\n****************** Content Scraping is Complete , FILENAME: {} ********************\n"
        .format('dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'))
    pc.printMsg("\n----------------------------------------------------------------------------------\n")
    pc.printMsg("|\tWRITTEN_ENTRIES_SYNC \t | \t {} \t|".format(WRITTEN_ENTRIES_SYNC))
    pc.printMsg("|\tSKIPPED_SYNC \t | \t {} \t|".format(SKIPPED_SYNC))
    pc.printMsg("|\tFAILED_SYNC \t | \t {} \t|".format(FAILED_SYNC))
    pc.printMsg("\n----------------------------------------------------------------------------------\n")
def RunSync(ts):
    """
    NOTE: pdf pages take a lot of time. Is it right to still scrape them?
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Opened > ---------------------------------------------\n")

    blob_pages = ['.jpg', '.png', '.gif', '.mp3', '.mp4']

    q = "select * from " + wc_table + " where length(Content) = 0"
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    pc.printMsg(
        "\n\n \t ******************************* ITEMS FOR SYNC TO SCRAPE = {} ******************************\n\n"
        .format(len(rows)))
    conn.commit()

    for row in rows:
        t1 = time.time()
        if (len(row[13]) == 0):
            try:
                if row[6][-4:] not in blob_pages:
                    response = web_requests.hitGetWithRetry(row[6], '', False, 2, 0.5, 30)
                    if response != -1:
                        gw.CS_SYNC_ITEM_SCRAPED += 1
                        res = response.text
                        row_list = list(row)
                        row_list[13] = res
                        row = tuple(row_list)
                        pc.printWarn(
                            "\t <ID = {}><src= {} > [SYNCED SCRAPED] Done................ \t\t TimeTaken = {} \t NOW: {} "
                            .format(row[0], row[1], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime())))
                        q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?'
                        d = (row[13], row[0], row[1])
                        c.execute(q, d)
                        conn.commit()
                        # pc.printSucc(" \t\t ============== <ID= {} ><{}> [SYNCED SCRAPED] INSERTED INTO TABLE =============== ".format(row[0], row[1]))
                    else:
                        gw.CS_SYNC_URL_UNREACHABLE += 1
                        pc.printErr(
                            "\t\tXXXXXXXXX [SYNCED SCRAPED]\t SKIPPING... <ID: {}> Totally unable to hit url even in SYNC: {} \t\t TimeTaken = {} \t NOW: {} "
                            .format(row[0], row[6], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime())))
                else:
                    pc.printMsg(
                        "\t\txxxxx [SYNCED SCRAPED]\t... for ID: {} Found BLOB page in SYNC. Will use title. URL: {} \t\t TimeTaken = {} \t NOW: {} "
                        .format(row[0], row[6], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime())))
            except Exception as e:
                gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR += 1
                pc.printErr(
                    "\t XXXXXXXXXXXXXX [SYNC SCRAPING] XXXX ==>> <ID = {}><src= {} > NOW = {} , \t\t TimeTaken = {} ....Sync Scraping failed too. Will use Title for content... \n \t\t ERROR=> {}"
                    .format(row[0], row[1], time.strftime("%H:%M:%S", time.localtime()), round((time.time() - t1), 5), e))
                # logging.error(traceback.format_exc())
                pass

    endTime = time.time()
    conn.close()
    pc.printMsg("\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Closed > ---------------------------------------------\n")

    pc.printSucc(
        "\n\n***************************** Sync Content Scraping is Complete. TABLE: {} ******************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(['Success (Post Sync Content Scraping)', 'Notation(if any)', 'Value'])
    table.add_row(['IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['OUT : ITEMS SCRAPED WITH SYNC', '[C] (A+B+C=X)', gw.CS_SYNC_ITEM_SCRAPED])
    table.add_row(['TIME TAKEN - SYNC CONTENT SCRAPING (min)', '-', round((endTime - startTime) / 60, 5)])
    pc.printSucc(table)

    pc.printErr("------------------------------------------ ERRORS-SYNC (Written nonetheless, chill) ------------------------------------------------\n")
    table = PrettyTable(['Failures (Post Sync Content Scraping)', 'Value'])
    table.add_row(['COUNT. UNREACHABLE URLS - SYNC ', gw.CS_SYNC_URL_UNREACHABLE])
    table.add_row(['COUNT. TRY/CATCHED EXCEP. - SYNC ', gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR])
    pc.printErr(table)
    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for Sync Scraping (min) = {} ]\n'
        .format(round((endTime - startTime) / 60, 5)))
    print("\n\n")
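# --------------------------------------------------------------------------------------
# Illustrative sketch only: the blob check above (`row[6][-4:] not in blob_pages`) compares
# the last four characters of the raw url, so e.g. '.jpeg' or 'photo.png?size=large' would
# slip through. A more forgiving check could parse the url path first; `is_blob_url_sketch`
# below is a hypothetical helper, not part of the existing pipeline.
import os
from urllib.parse import urlparse

BLOB_EXTENSIONS_SKETCH = {'.jpg', '.jpeg', '.png', '.gif', '.mp3', '.mp4'}


def is_blob_url_sketch(url):
    """Return True when the url path ends in a known binary/media extension."""
    path = urlparse(url).path                   # strips query string & fragment
    ext = os.path.splitext(path)[1].lower()     # '/a/b.PNG' -> '.png', '/a/b' -> ''
    return ext in BLOB_EXTENSIONS_SKETCH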
def run(ts):
    """
    Scrapes the PH api for the last 7 days & puts the data in WP-DB.
        * The api supports day-wise queries only, so scrape one day at a time.
        * Link to documentation: https://api.producthunt.com/v1/docs/posts/posts_index_request_a_specific_day_with_the_%60day%60_parameter_(tech_category)
        * NOTE:
            * No threshold is set on upvotes or comments right now. Maybe later?
            * API rate limit: you can make up to 900 requests every 15 minutes, else the response comes back with `status 429`. If that happens, wait for 16 mins, then hit again.
            * Retry 2 times; if it still fails, skip!
            * Content = Tagline
            * URL: is the PH url only. Going to the product page & then finding the actual link is overkill
                * (this could also help later on getting their permission while monetizing)
            * Uses self-written retry logic, but check this package: read about requests' retries here:
              [doc](https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/#retry-on-failure),
              [stkofw](https://stackoverflow.com/questions/23267409/how-to-implement-retry-mechanism-into-python-requests-library?rq=1)

    Input: ts (format: 1598692058.887741)

    * ============= row is an array with indices:
        (ID(0), SourceSite(1), ProcessingDate(2), ProcessingEpoch(3), CreationDate(4), Title(5), Url(6), ThumbnailUrl(7), SourceTags(8), NumUpvotes(9), NumComments(10), PopI(11), Content(12))
    """
    wp_db = 'dbs/wp.db'
    wp_table = 'wp_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started PH-scraper ................... => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), wp_table))
    conn = sqlite3.connect(wp_db, timeout=10)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- < PH_SCRAPER: DB/wp Connection Opened > ---------------------------------------------\n")
    startTime = time.time()

    """
    here is how you add a day to `ts`:
        from datetime import datetime, timedelta
        newts = datetime.fromtimestamp(ts) + timedelta(days=1)   # 2020-08-30 16:02:34.352094
        newts.timestamp()                                        # 1598783633.284871
        datetime.fromtimestamp(ts)                               # 2020-08-29 17:15:32
        # get date from it:
        datetime.fromtimestamp(ts).date()                        # 2020-08-29
    """

    """ days_arr has the last 7 days' (including today's) (YYYY-MM-DD) date strings; just the way PH's API needs them """
    curr_date = str(int(ts))
    days_arr = [str(datetime.fromtimestamp(int(ts)).date())]     # '2020-08-29'
    for i in range(6):
        new_ts = datetime.fromtimestamp(int(curr_date)) + timedelta(days=-1)
        new_ts = new_ts.timestamp()
        curr_date = new_ts
        days_arr.append(str(datetime.fromtimestamp(int(new_ts)).date()))

    PH_REQ_HEADERS = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": "Bearer " + vault.PH_ACCESS_TOKEN,
        "Host": "api.producthunt.com"
    }

    # csv_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wp-db/wp_table_'+str(int(ts))+'.csv'
    index = gw.WP_TOTAL_ENTRIES_YET + 1

    for date in days_arr:
        pc.printMsg(" ................. scraping for date = {} .................\n".format(date))
        url = 'https://api.producthunt.com/v1/posts?day=' + date
        try:
            data = web_requests.hitGetWithRetry(url, PH_REQ_HEADERS, False, 2, 5, 10)
            if (data == -1):
                pc.printErr(
                    "\t\txxxxxx Unable to hit {} after 2 retries. Skipping this date( {} ) xxxxxx\n"
                    .format(url, date))
            else:
                items_arr = json.loads(data.content)["posts"]
                for item in items_arr:
                    # print(json.dumps(item, indent=4))
                    """ get all the tags attached to the item """
                    source_tags = []
                    for tag in item["topics"]:
                        source_tags.append(tag["name"])
                    entry = [
                        index,
                        "PH",
                        datetime.fromtimestamp(ts).date(),
                        int(ts),
                        date_conversion.PHDate(str(item["created_at"])),
                        item["name"],
                        item["discussion_url"],
                        item["thumbnail"]["image_url"],
                        json.dumps(source_tags),
                        item["votes_count"],
                        item["comments_count"],
                        '',
                        item["tagline"]
                    ]
                    # csv_functions.putToCsv(csv_file, entry)
                    c.execute('INSERT INTO ' + wp_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                    index = index + 1
                    gw.PH_TOTAL_ITEMS_GOT_YET += 1
        except Exception as e:
            pc.printErr(
                " \t xxxxxxxxxxxxx ERROR@PH_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                .format(index, e))
            logging.error(traceback.format_exc())
            pass
        pc.printMsg("\t\t\t ====>> TOTAL_ENTRIES_YET = {}".format(gw.PH_TOTAL_ITEMS_GOT_YET))

    gw.WP_TOTAL_ENTRIES_YET += gw.PH_TOTAL_ITEMS_GOT_YET

    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg("\t -------------------------------------- < PH_SCRAPER: DB/wp Connection Closed > ---------------------------------------------\n")

    pc.printSucc(
        "\n\n***************************** PH Url Scraping is Complete. TABLE: {} ******************"
        .format(wp_table))
    print("\n\n")
    table = PrettyTable(['Entity (Post PH URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by PH', gw.PH_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WP TABLE YET', gw.WP_TOTAL_ENTRIES_YET])
    table.add_row(['TIME TAKEN FOR URL SCRAPING-PH (sec) ', round((endTime - startTime), 5)])
    pc.printSucc(table)
    print("\n\n")
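# --------------------------------------------------------------------------------------
# Illustrative sketch only: the docstring above links to requests' built-in retry support
# as an alternative to the self-written hitGetWithRetry. One way to wire that up (handling
# PH's 429 rate-limit responses via urllib3's Retry) might look like this; the session name
# and parameter values are assumptions, not the project's actual configuration.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_ph_session_sketch(total_retries=2, backoff_factor=5):
    """Build a requests.Session that retries transient failures & 429 rate limits."""
    retry = Retry(
        total=total_retries,
        backoff_factor=backoff_factor,                  # exponential backoff between attempts
        status_forcelist=[429, 500, 502, 503, 504],     # retry on rate-limit & server errors
    )
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=retry))
    session.mount('http://', HTTPAdapter(max_retries=retry))
    return session

# usage sketch (PH_REQ_HEADERS being the headers dict built inside run()):
#   session = make_ph_session_sketch()
#   resp = session.get('https://api.producthunt.com/v1/posts?day=2020-08-29',
#                      headers=PH_REQ_HEADERS, timeout=10)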
def run(ts):
    """
    Scrapes Algolia's HN api for the last 7 days & puts the data in WC-DB.
        * Max number of entries in a single algolia api call = 1000, so scrape one day at a time.
        * Link to documentation: https://hn.algolia.com/api

    Note:
        1. For AskHN entries put the `prog_query` tag & a separate threshold
        2. For ShowHN entries put the `sideproj` tag & a separate threshold
        3. For Jobs@HN entries put a tag => later, as these entries don't have upvotes/comments

    Input: ts (format: 1598692058.887741)
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started HN-scraper ................... => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n")
    startTime = time.time()

    """
    here is how you add a day to `ts`:
        from datetime import datetime, timedelta
        newts = datetime.fromtimestamp(ts) + timedelta(days=1)   # 2020-08-30 16:02:34.352094
        newts.timestamp()                                        # 1598783633.284871
        datetime.fromtimestamp(ts)                               # 2020-08-29 17:15:32
    """

    """
    ts_arr has the last 7 days' (including today's) (non-decimal style) timestamp strings
    TIP: use `datetime.fromtimestamp(int(t))` to convert to a human readable format
    """
    ts_arr = [str(int(ts))]
    for i in range(6):
        new_ts = datetime.fromtimestamp(int(ts_arr[-1])) + timedelta(days=-1)
        new_ts = new_ts.timestamp()
        ts_arr.append(str(int(new_ts)))

    # for t in ts_arr:
    #     print("timestamp: {} \t date: {}".format(t, datetime.fromtimestamp(int(t))))

    index = gw.WC_TOTAL_URL_ENTRIES + 1

    for i in range(len(ts_arr) - 1):
        startepoch = ts_arr[i]
        endepoch = ts_arr[i + 1]
        pc.printMsg(
            " ................. scraping for interval: start= {} -> end = {} .................\n"
            .format(startepoch, endepoch))

        """
        getting stories(articles) with upvotes_count > upvotes_threshold
        Also including:
            1. TellHN   (<tech_discuss>)
            2. LaunchHN (<startup>)
        """
        pc.printWarn(" \t............. scraping stories .............")
        try:
            url_story = 'http://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=9999&numericFilters=created_at_i>' + str(endepoch) + ',created_at_i<' + str(startepoch) + ',points>' + str(gw.HN_STORY_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_story)
            res_size = json.loads(data.content)["nbHits"]
            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size

            items_arr = json.loads(data.content)["hits"]
            for item in items_arr:
                url = 'https://news.ycombinator.com/item?id=' + str(item["objectID"])
                sourceTag = ''
                content = ''
                sourceSite = 'HN'
                if (item["url"] is None):    # as some stories may not have an url ...hihi...
                    # print('------------------------- found null urled value ---------------------\n-----[STORY]url: {}'.format(url))
                    # print(json.dumps(item, indent=4))
                    if (item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                    if ("Launch HN:" in item["title"]):    # 1. LaunchHN
                        sourceTag = 'startup'
                        sourceSite += '/launch'
                    if ("Tell HN:" in item["title"]):      # 2. TellHN
                        sourceTag = 'tech_discuss'
                        sourceSite += '/tell'
                else:
                    url = item["url"]
                entry = [
                    index,
                    sourceSite,
                    datetime.fromtimestamp(ts).date(),
                    int(ts),
                    date_conversion.HNDate(str(item["created_at"])),
                    item["title"],
                    url,
                    sourceTag,
                    '',
                    item["points"],
                    item["num_comments"],
                    '',
                    '',
                    text_actions.clean_text(content)
                ]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index = index + 1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(
                " \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                .format(index, e))
            logging.error(traceback.format_exc())
            pass

        """ getting ShowHNs """
        pc.printWarn("\t............. scraping showHNs .............")
        try:
            url_show = 'http://hn.algolia.com/api/v1/search_by_date?tags=show_hn&hitsPerPage=9999&numericFilters=created_at_i>' + str(endepoch) + ',created_at_i<' + str(startepoch) + ',points>' + str(gw.HN_SHOWHN_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_show)
            res_size = json.loads(data.content)["nbHits"]
            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size

            items_arr = json.loads(data.content)["hits"]
            for item in items_arr:
                content = ''
                sourceSite = 'HN/show'
                if (item["url"] is None):    # as all ShowHNs may not have an url ...hihi...
                    url = 'https://news.ycombinator.com/item?id=' + str(item["objectID"])
                    # print('-------------------------- found null urled value ---------------------\n-----[SHOW]url: {}'.format(url))
                    # print(json.dumps(item, indent=4))
                    if (item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"]
                entry = [
                    index,
                    sourceSite,
                    datetime.fromtimestamp(ts).date(),
                    int(ts),
                    date_conversion.HNDate(str(item["created_at"])),
                    item["title"],
                    url,
                    'sideproj',
                    '',
                    item["points"],
                    item["num_comments"],
                    '',
                    '',
                    text_actions.clean_text(content)
                ]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index = index + 1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(
                " \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                .format(index, e))
            logging.error(traceback.format_exc())
            pass

        """ getting AskHNs """
        pc.printWarn("\t............. scraping askHNs .............")
        try:
            url_ask = 'http://hn.algolia.com/api/v1/search_by_date?tags=ask_hn&hitsPerPage=9999&numericFilters=created_at_i>' + str(endepoch) + ',created_at_i<' + str(startepoch) + ',points>' + str(gw.HN_ASKHN_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_ask)
            res_size = json.loads(data.content)["nbHits"]
            pc.printWarn("\t\t\t\t====> Item count: {}".format(res_size))
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size

            items_arr = json.loads(data.content)["hits"]
            for item in items_arr:
                content = ''
                sourceSite = 'HN/ask'
                if (item["url"] is None):    # as AskHNs don't have any url ...hihi...
                    url = 'https://news.ycombinator.com/item?id=' + str(item["objectID"])
                    # print('-------------------------- found null urled value ---------------------\n-----[ASK]url: {}'.format(url))
                    # print(json.dumps(item, indent=4))
                    if (item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"]
                entry = [
                    index,
                    sourceSite,
                    datetime.fromtimestamp(ts).date(),
                    int(ts),
                    date_conversion.HNDate(str(item["created_at"])),
                    item["title"],
                    url,
                    'prog_query',
                    '',
                    item["points"],
                    item["num_comments"],
                    '',
                    '',
                    text_actions.clean_text(content)
                ]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index = index + 1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(
                " \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                .format(index, e))
            logging.error(traceback.format_exc())
            pass

    endTime = time.time()
    conn.commit()
    conn.close()
    gw.WC_TOTAL_URL_ENTRIES += gw.HN_TOTAL_ITEMS_GOT_YET
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n")

    pc.printSucc(
        "\n\n***************************** HN Url Scraping is Complete. TABLE: {} ******************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(['Entity (Post HN URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by HN', gw.HN_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['TIME TAKEN FOR URL SCRAPING-HN (sec) ', round((endTime - startTime), 5)])
    pc.printSucc(table)
    print("\n\n")
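# --------------------------------------------------------------------------------------
# Illustrative sketch only: the docstring above notes that a single Algolia call is capped
# at 1000 entries, while the queries request hitsPerPage=9999. If a day/interval ever
# overflows that cap, the API's `page`/`nbPages` fields can be walked to collect every hit.
# `fetch_all_hits_sketch` is a hypothetical helper built on the same search_by_date
# endpoint, not part of the existing scraper.
import json
import requests


def fetch_all_hits_sketch(tags, endepoch, startepoch, points_threshold, per_page=1000):
    """Yield every hit for the interval, following Algolia's pagination."""
    base = ('http://hn.algolia.com/api/v1/search_by_date?tags=' + tags +
            '&hitsPerPage=' + str(per_page) +
            '&numericFilters=created_at_i>' + str(endepoch) +
            ',created_at_i<' + str(startepoch) +
            ',points>' + str(points_threshold))
    page = 0
    while True:
        data = json.loads(requests.get(base + '&page=' + str(page), timeout=10).content)
        for hit in data["hits"]:
            yield hit
        page += 1
        if page >= data["nbPages"]:    # nbPages = total pages available for this query
            break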