async def semaphoreSafeFetch(sem, row, session, csv_out):
    """ Wraps fetchWithRetry in the semaphore so no more than `sem` fetches run concurrently """
    async with sem:
        try:
            return await fetchWithRetry(row, session, csv_out)
        except Exception as e:
            global FAILED_ASYNC
            FAILED_ASYNC += 1
            # This error is mainly because of:
            ## 1. [nodename nor servname provided, or not known]
            ## 2. [Too many open files]
            pc.printErr(
                "\t======= XXXXXXXX ERROR XXXXXX ======>> <ID = {}><src= {} > NOW = {} Scraping failed. Using Title for Content.... \n \t\t ERROR {}"
                .format(row["ID"], row["SourceSite"],
                        time.strftime("%H:%M:%S", time.localtime()), e))
            if len(row["Content"]) == 0:
                row["WeightedContent"] = row["Title"]
                row["Content"] = row["Title"]
            await write_result(csv_out, row)
            global WRITTEN_ENTRIES_ASYNC_TRIED_ERR
            WRITTEN_ENTRIES_ASYNC_TRIED_ERR += 1
            pc.printMsg(
                " \t\t\t============== [Tried Catch] Done Writing into csv for <ID = {}><src= {} > =============== "
                .format(row["ID"], row["SourceSite"]))
            return row
def run(ts):
    startTime = time.time()
    try:
        run_wc(ts)
    except Exception as e:
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running PopICalculator for wc table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}".format(str(e)))
        logging.error(traceback.format_exc())
        pass
    try:
        run_wp(ts)
    except Exception as e:
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running PopICalculator for wp table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}".format(str(e)))
        logging.error(traceback.format_exc())
        pass
    endTime = time.time()
    pc.printSucc("**************************** PopI Calculation is Done for wc & wp ********************************\n\n")
    pc.printWarn("| \t\t TIME TAKEN FOR PopICalculators-both \t\t | \t\t {} \t\t |".format(round((endTime - startTime), 5)))
    pc.printSucc("*************************************************************************************************\n\n")
    pc.printSucc("\n\n***************************** PopI Calculation is Complete.************************")
    print("\n\n")
    table = PrettyTable(['Entity (Post PopI Calculation)', 'Value'])
    table.add_row(['TIME TAKEN FOR PopICalculators(wc & wp) (min)', round((endTime - startTime) / 60, 2)])
    pc.printSucc(table)
    print("\n\n")
def print_tree_horizontally(current_node, balanced_branches, name_getter, indent='', last='updown'):
    up, down = balanced_branches(current_node)
    item_len = len(current_node.name) + len(str(current_node.popi)) + len(str(current_node.count)) - 10
    if not current_node.isTag:
        item_len += 2

    """ Printing of "up" branch. """
    for child in up:
        next_last = 'up' if up.index(child) == 0 else ''
        # next_indent = '{0}{1}{2}'.format(indent, ' ' if 'up' in last else '│', ' ' * (len(current_node.name)))
        # next_indent = '{0}{1}{2}'.format(indent, ' ' if 'up' in last else '│', ' ' * (item_len))
        next_indent = '{0}{1}{2}'.format(indent, ' ' * (item_len) if 'up' in last else '│', ' ' * (item_len))
        print_tree_horizontally(child, balanced_branches, name_getter, next_indent, next_last)

    """ Printing of current node. """
    if last == 'up':
        start_shape = '┌'
    elif last == 'down':
        start_shape = '└'
    elif last == 'updown':
        start_shape = ' '
    else:
        start_shape = '├'

    if up:
        end_shape = '┤'
    elif down:
        end_shape = '┐'
    else:
        end_shape = ''

    # print('{0}{1}{2}{3}'.format(indent, start_shape, name_getter(current_node), end_shape))
    if current_node.isTag:
        pc.printMsg('{0}{1}<{2}>(c: {3},p: {4}){5}'.format(
            indent, start_shape, current_node.name, current_node.count, current_node.popi, end_shape))
    else:
        pc.printErr('{0}{1}[<{2}>](c: {3},p: {4}){5}'.format(
            indent, start_shape, current_node.name, current_node.count, current_node.popi, end_shape))

    """ Printing of "down" branch. """
    for child in down:
        next_last = 'down' if down.index(child) == len(down) - 1 else ''
        # next_indent = '{0}{1}{2}'.format(indent, ' ' if 'down' in last else '│', ' ' * (len(current_node.name)))
        # next_indent = '{0}{1}{2}'.format(indent, ' ' if 'down' in last else '│', ' ' * (item_len))
        next_indent = '{0}{1}{2}'.format(indent, ' ' * (item_len) if 'down' in last else '│', ' ' * (item_len))
        print_tree_horizontally(child, balanced_branches, name_getter, next_indent, next_last)
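# NOTE: illustrative sketch (assumption, not part of the repo): `balanced_branches` is passed in
# by the caller and is expected to split a node's children into the half printed above ("up") and
# the half printed below ("down") the node itself. A hypothetical splitter with that contract,
# assuming the tag-tree node exposes a `children` list, could look like this:
def _balanced_branches_example(node):
    kids = list(node.children)           # hypothetical attribute; the real node API may differ
    half = len(kids) // 2
    return kids[:half], kids[half:]      # (up, down)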
async def semaphoreSafeFetch(sem, row, session):
    """ Wraps fetchWithRetry in the semaphore so no more than `sem` fetches run concurrently """
    async with sem:
        try:
            return await fetchWithRetry(row, session)
        except Exception as e:
            global FAILED_ASYNC
            FAILED_ASYNC += 1
            pc.printErr(
                "\t======= XXXXXXXX ERROR XXXXXX ======>> <ID = {}><src= {} > NOW = {} Skipping...Failed due to: \n \t\t ERROR {}"
                .format(row["ID"], row["SourceSite"],
                        time.strftime("%H:%M:%S", time.localtime()), e))
            return row
def return_all_descendents(ts, root):
    """ Returns all the descendents of the node in the tag-tree where node.NodeName = root """
    pc.printMsg(
        " \t\t???????????????????????????????????? Query for All Descendents of NodeName = {}"
        .format(root))
    descendents = []
    th_db = 'dbs/th.db'
    th_table = 'th_' + str(int(ts))
    conn = sqlite3.connect(th_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < query_children_th: DB Connection Opened > ---------------------------------------------\n"
    )
    q = 'select LeftMptt, RightMptt from ' + th_table + ' where NodeName = ?'
    root_mptt_values = c.execute(q, ('{}'.format(root), ))
    root_mptt_values = c.fetchone()
    if root_mptt_values is None:
        pc.printErr(
            " \t\tXXXXXXXXXXXXX-> Asked node with name = {} not found in table = {} \t...... returning NULL as descendents"
            .format(root, th_table))
        conn.close()
        return descendents
    pc.printMsg(" root.LeftMptt = {} , root.RightMptt = {} \n".format(
        root_mptt_values[0], root_mptt_values[1]))
    q = 'select * from ' + th_table + ' where LeftMptt > ? AND RightMptt < ?'
    d = (root_mptt_values[0], root_mptt_values[1])
    rows_head = c.execute(q, d)
    rows = rows_head.fetchall()
    for row in rows:
        pc.printWarn(" \t\t * DESCENDENT of {} :: {}".format(root, row))
        descendents.append(row)
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < query_children_th: DB Connection Closed > ---------------------------------------------\n"
    )
    return descendents
def return_imm_children(ts, root):
    """
        INPUT: ts, root.name (string)
        OUTPUT: Returns the full row (ID, NodeName, LeftMptt, RightMptt, DepthLevel, ItemCount, AvgPopI, HN_IDs, R_IDs)
                of just the immediate children of the node in the tag-tree where node.NodeName = root
    """
    # pc.printMsg(" \t\t ???????????????????????????????????? Query for Immediate Children of NodeName = {}".format(root))
    children = []
    th_db = 'dbs/th.db'
    th_table = 'th_' + str(int(ts))
    conn = sqlite3.connect(th_db, timeout=10)
    c = conn.cursor()
    # pc.printMsg("\t -------------------------------------- < query_children_th: DB Connection Opened > ---------------------------------------------\n")
    q = 'select LeftMptt, RightMptt, DepthLevel from ' + th_table + ' where NodeName = ? ;'
    root_mptt_values = c.execute(q, ('{}'.format(root), ))
    root_mptt_values = c.fetchone()
    if root_mptt_values is None:
        pc.printErr(
            " \t\tXXXXXXXXXXXXX-> Asked node with name = {} not found in table = {} \t...... returning NULL as children"
            .format(root, th_table))
        conn.close()
        return children
    pc.printMsg(
        " \t ROOT: {} \troot.LeftMptt = {} , root.RightMptt = {} , root.DepthLevel = {}\n"
        .format(root, root_mptt_values[0], root_mptt_values[1], root_mptt_values[2]))
    q = 'select * from ' + th_table + ' where LeftMptt > ? AND RightMptt < ? And DepthLevel = ? '
    d = (root_mptt_values[0], root_mptt_values[1], root_mptt_values[2] + 1)
    rows_head = c.execute(q, d)
    rows = rows_head.fetchall()
    for row in rows:
        pc.printWarn(" \t\t * CHILD of {} :: {}".format(root, row[1]))
        children.append(row)
    conn.commit()
    conn.close()
    # pc.printMsg("\t -------------------------------------- < query_children_th: DB Connection Closed > ---------------------------------------------\n")
    return children
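# NOTE: illustrative sketch (not used by the pipeline). It demonstrates the nested-set (MPTT)
# invariant the two queries above rely on: every descendant's LeftMptt/RightMptt pair nests
# strictly inside its ancestor's pair, and immediate children additionally sit exactly one
# DepthLevel below the root. Table/column names mirror the th_<ts> schema; the data is made up.
def _mptt_descendants_example():
    import sqlite3
    conn = sqlite3.connect(':memory:')
    c = conn.cursor()
    c.execute("CREATE TABLE th_demo (ID, NodeName, LeftMptt, RightMptt, DepthLevel)")
    # tree:  root(1,8) -> a(2,5) -> b(3,4)
    #                  -> c(6,7)
    c.executemany("INSERT INTO th_demo VALUES (?,?,?,?,?)", [
        (1, 'root', 1, 8, 0),
        (2, 'a',    2, 5, 1),
        (3, 'b',    3, 4, 2),
        (4, 'c',    6, 7, 1),
    ])
    # all descendants of 'root': LeftMptt/RightMptt strictly inside (1, 8)
    c.execute("select NodeName from th_demo where LeftMptt > ? and RightMptt < ?", (1, 8))
    print(c.fetchall())   # [('a',), ('b',), ('c',)]
    # immediate children only: same range check plus DepthLevel = root.DepthLevel + 1
    c.execute("select NodeName from th_demo where LeftMptt > ? and RightMptt < ? and DepthLevel = ?", (1, 8, 1))
    print(c.fetchall())   # [('a',), ('c',)]
    conn.close()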
async def fetchWithRetry(row, session):
    status = 400
    retry_cnt = 3
    sleep_time = 10
    TIMEOUT = 60
    while retry_cnt > 0 and status != 200:
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}"
                    .format(row["ID"], row["SourceSite"],
                            time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.clean_text(
                    text_actions.weightedcontentfromhtml(res) + row["Title"] + urlstrings)
                row["Content"] = text_actions.clean_text(
                    text_actions.contentfromhtml(res) + urlstrings)
                if (len(row["Content"]) == 0):
                    row["WeightedContent"] = text_actions.clean_text(row["Title"])
                    row["Content"] = text_actions.clean_text(row["Title"])
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row["ID"],row["SourceSite"],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- Unable to hit URL(ERR_CODE={}): {} Sleeping for {} Retries remaining = {} -------------x"
                    .format(status, row["Url"], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)
    pc.printErr(
        "\t\txxxxx SKIPPING... for <ID = {}><src= {} > Unable to hit url: {} , "
        .format(row["ID"], row["SourceSite"], row["Url"]))
    global SKIPPED_ASYNC
    SKIPPED_ASYNC += 1
    return row
def run(ts):
    startTime = time.time()
    try:
        update_modelTags(ts)
    except Exception as e:
        pc.printErr(
            " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Tagger Simulator for wc table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}"
            .format(str(e)))
        logging.error(traceback.format_exc())
        pass
    endTime = time.time()
    pc.printSucc(
        "**************************** Tagger(Simulator) Run is Complete for wc **********************************************"
    )
    pc.printWarn(
        "| \t\t TIME TAKEN FOR Tagger(Simulator) Run(sec) \t\t | \t\t {} \t\t |"
        .format(round((endTime - startTime), 5)))
    pc.printSucc(
        "***********************************************************************************************************************\n\n"
    )
def run(ts): """ Scrapes PH api for last 7 days & puts data in WP-DB. * Api supports daywaise only. So scrape for one day at a time * Link to documentation: https://api.producthunt.com/v1/docs/posts/posts_index_request_a_specific_day_with_the_%60day%60_parameter_(tech_category) * NOTE: * No threshold set on upvotes or comments rn.Maybe later? * API-Ratelimit: You can make up to 900 requests every 15 minutes, else gives `status 429` in response.If that happens, wait for 16 mins, then hit again. * Retry 2 times; if failed nonetheless, skip! * Content = Tagline * URL: is the PH url only. Going to the product page & then finding the actual link is overkill * (this could also help later on getting their permission while monetizing) * Used self-retry logic. but check this package:: Read about requests.retries here: [doc](https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/#retry-on-failure), [stkofw](https://stackoverflow.com/questions/23267409/how-to-implement-retry-mechanism-into-python-requests-library?rq=1) Input: ts (format: 1598692058.887741) * ============= row is an array with indices: (ID(0), SourceSite(1), ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5), Url(6),ThumbnailUrl(7),SourceTags(8),NumUpvotes(9),NumComments(10),PopI(11),Content(12)) """ wp_db = 'dbs/wp.db' wp_table = 'wp_' + str(int(ts)) pc.printSucc( '@[{}] >>>>>> Started PH-scraper ................... => TABLE: {}\n'. format(datetime.fromtimestamp(ts), wp_table)) conn = sqlite3.connect(wp_db, timeout=10) c = conn.cursor() pc.printMsg( "\t -------------------------------------- < PH_SCRAPER: DB/wp Connection Opened > ---------------------------------------------\n" ) startTime = time.time() """ here is how you add day to `ts`: from datetime import datetime, timedelta newts = datetime.fromtimestamp(ts) + timedelta(days=1) # 2020-08-30 16:02:34.352094 newts.timestamp() # 1598783633.284871 datetime.fromtimestamp(ts) #2020-08-29 17:15:32 # get date from it: datetime.fromtimestamp(ts).date() #2020-08-29 """ """ days_arr has last 7 days(including today's) (YYYY-MM-DD)date strings ; just the way PH's API needs """ curr_date = str(int(ts)) days_arr = [str(datetime.fromtimestamp(int(ts)).date())] # '2020-08-29' for i in range(6): new_ts = datetime.fromtimestamp(int(curr_date)) + timedelta(days=-1) new_ts = new_ts.timestamp() curr_date = new_ts days_arr.append(str(datetime.fromtimestamp(int(new_ts)).date())) PH_REQ_HEADERS = { "Accept": "application/json", "Content-Type": "application/json", "Authorization": "Bearer " + vault.PH_ACCESS_TOKEN, "Host": "api.producthunt.com" } # csv_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wp-db/wp_table_'+str(int(ts))+'.csv' index = gw.WP_TOTAL_ENTRIES_YET + 1 for date in days_arr: pc.printMsg( " ................. scraping for date = {} .................\n". 
format(date)) url = 'https://api.producthunt.com/v1/posts?day=' + date try: data = web_requests.hitGetWithRetry(url, PH_REQ_HEADERS, False, 2, 5, 10) if (data == -1): pc.printErr( "\t\txxxxxx Unable to hit {} after 2 retries.Skipping this date( {} ) xxxxxx\n" .format(url, date)) else: items_arr = json.loads(data.content)["posts"] for item in items_arr: # print(json.dumps(item, indent = 4)) """ get all the tags attached along with the item """ source_tags = [] for tag in item["topics"]: source_tags.append(tag["name"]) entry = [ index, "PH", datetime.fromtimestamp(ts).date(), int(ts), date_conversion.PHDate(str(item["created_at"])), item["name"], item["discussion_url"], item["thumbnail"]["image_url"], json.dumps(source_tags), item["votes_count"], item["comments_count"], '', item["tagline"] ] # csv_functions.putToCsv(csv_file,entry) c.execute( 'INSERT INTO ' + wp_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) index = index + 1 gw.PH_TOTAL_ITEMS_GOT_YET += 1 except Exception as e: pc.printErr( " \t xxxxxxxxxxxxx ERROR@PH_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n" .format(index, e)) logging.error(traceback.format_exc()) pass pc.printMsg("\t\t\t ====>> TOTAL_ENTRIES_YET = {}".format( gw.PH_TOTAL_ITEMS_GOT_YET)) gw.WP_TOTAL_ENTRIES_YET += gw.PH_TOTAL_ITEMS_GOT_YET endTime = time.time() conn.commit() conn.close() pc.printMsg( "\t -------------------------------------- < PH_SCRAPER: DB/wp Connection Closed > ---------------------------------------------\n" ) pc.printSucc( "\n\n***************************** PH Url Scraping is Complete. TABLE: {} ******************" .format(wp_table)) print("\n\n") table = PrettyTable(['Entity (Post PH URL Scraping)', 'Value']) table.add_row(['TOTAL URLS FETCHED by PH', gw.PH_TOTAL_ITEMS_GOT_YET]) table.add_row(['TOTAL ITEMS IN WP TABLE YET', gw.WP_TOTAL_ENTRIES_YET]) table.add_row([ 'TIME TAKEN FOR URL SCRAPING-PH (sec) ', round((endTime - startTime), 5) ]) pc.printSucc(table) print("\n\n")
def run(ts): """ I. Creates wc_table(in wc.db) & wp_table(in wp.dp) for the week II. Runs following scrapers serially and updates them in WC-DB: 1. hn_scraper.py 2. r_scraper.py 4. ph_scraper.py => Api exists, Scraping not allowed(doint it anyway) 3. ih_scraper.py => No Api, Scraping not allowed(postponed for later) Input: float(timestamp) - set when the main.py run is triggered * float because o/w `datetime.fromtimestamp(ts)` wont run on int Outpu: None, just put data in WC-DB """ startTime = time.time() """ Initialize the weekly content tables in wc.db and wp.db""" wc_db = 'dbs/wc.db' wc_table = 'wc_' + str(int(ts)) conn = sqlite3.connect(wc_db, timeout=10) c = conn.cursor() c.execute( "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{}'" .format(wc_table)) if c.fetchone()[0] == 1: # table exists, flush away! c.execute("delete from {}".format(wc_table)) else: # creting new table c.execute( "CREATE TABLE {} (ID, SourceSite, ProcessingDate,ProcessingEpoch,CreationDate, Title, Url, SourceTags,ModelTags,NumUpvotes, NumComments, PopI,WeightedContent,Content)" .format(wc_table)) pc.printSucc( "\n**************************************************** wc_table created => {} **************************************************** \n" .format(wc_table)) wp_db = 'dbs/wp.db' wp_table = 'wp_' + str(int(ts)) conn = sqlite3.connect(wp_db, timeout=10) c = conn.cursor() c.execute( "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{}'" .format(wp_table)) if c.fetchone()[0] == 1: # table exists, flush away! c.execute("delete from {}".format(wc_table)) else: # creting new table c.execute('''CREATE TABLE {} (ID, SourceSite, ProcessingDate,ProcessingEpoch,CreationDate, Title, Url, ThumbnailUrl,SourceTags,NumUpvotes, NumComments, PopI,Content)''' .format(wp_table)) pc.printSucc( "\n**************************************************** wp_table created => {} **************************************************** \n" .format(wp_table)) """ Run the scrapers sequentially """ pc.printWarn( ". . . . . . . . . . . . . . . ...... Started Running all the scrapers ...... . . . . . . . . . . . . . . 
.\n" ) try: hn_scraper.run(ts) pc.printSucc( "\n================ HH url scraper run: Complete ================\n" ) except Exception as e: pc.printErr( " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-HN xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}" .format(str(e))) logging.error(traceback.format_exc()) pass try: r_scraper.run(ts) pc.printSucc( " \n================ Reddit url scraper run: Complete ================\n" ) except Exception as e: pc.printErr( " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-Reddit xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\tError = {}" .format(str(e))) logging.error(traceback.format_exc()) pass try: ph_scraper.run(ts) pc.printSucc( " \n================ PH url scraper run: Complete ================\n" ) except Exception as e: pc.printErr( " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-PH xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\tError = {}" .format(str(e))) logging.error(traceback.format_exc()) pass # try: # ih_scraper.run(ts) # print(" \n====== IH url scraper run: Complete ======\n") # except Exception as e: # print(" XXXXXXXXXXXX Error in scraping IH for url XXXXXXXXXXXXXXXXX \n \t\tError = {}".format(str(e))) # pass #TODO: add Lobsters here endTime = time.time() pc.printSucc( " ********************************************** URL Scraping(HN,r,PH) is complete *******************************************\n" ) print("\n\n") table = PrettyTable(['Entity (Post all URL Scraping)', 'Value']) table.add_row(['TOTAL URL ITEMS IN WC TABLE ', gw.WC_TOTAL_URL_ENTRIES]) table.add_row([ 'TIME TAKEN FOR URL SCRAPING-All (min) ', round((endTime - startTime) / 60, 2) ]) pc.printSucc(table) print("\n\n")
async def fetchWithRetry(conn, row, session, series_count, ts):
    """
        Hits url (with retries):
            * if status == 200: return response ((raw)Content & (raw)WeightedContent in row)
            * if still unable to hit after retries: Content = Title , WeightedContent = Title
        INPUT: `row` is an array with indices:
            ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
            SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
    """
    status = 400
    retry_cnt = 2
    sleep_time = 0.1
    t1 = time.time()
    while retry_cnt > 0 and status != 200:
        async with session.get(row[6],
                               ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=gw.CS_ASYNC_REQ_TIMEOUT) as response:
            # res = await response.content.read()  # returns blob which gives error while ContentFormatter; hence discarded
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                gw.CS_ASYNC_ITEM_SCRAPED += 1
                gw.CS_BOYS_STILL_PLAYING -= 1
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== [ASYNCED SCRAPED#{}] Done ....... \t\t TimeTaken = {} \t NOW: {}"
                    .format(row[0], row[1], series_count,
                            round((time.time() - t1), 5),
                            time.strftime("%H:%M:%S", time.localtime())))
                row_list = list(row)
                row_list[13] = res
                row = tuple(row_list)
                wc_table = 'wc_' + str(int(ts))
                try:
                    c = conn.cursor()
                    q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?'
                    d = (row[13], row[0], row[1])
                    c.execute(q, d)
                    pc.printWarn(
                        " \t\t ============== <ID= {} ><{}> [ASYNC ContentScraped] \t INSERTED INTO TABLE :: gw.SQL_CONN_OPEN = {} =============== "
                        .format(row[0], row[1], gw.SQL_CONN_OPEN))
                    conn.commit()
                except Exception as e:
                    pc.printMsg(
                        " \t\t === XXXX ====== <ID= {} ><{}> [ASYNC ContentScraped] \t ERRR in INSERTED INTO TABLE :: gw.SQL_CONN_OPEN = {} =============== "
                        .format(row[0], row[1], gw.SQL_CONN_OPEN))
                    logging.error(traceback.format_exc())
                    pass
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x"
                    .format(row[0], row[1], status, row[6][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)
    if series_count == gw.ASYNC_SERIES_CONNECTION:
        gw.CS_ASYNC_URL_UNREACHABLE += 1
        pc.printErr(
            "\t\txxxxx For <ID = {}><src= {} >Totally unable to hit url.... Will try sync later: {} \t\t TimeTaken = {} \t NOW: {}"
            .format(row[0], row[1], row[6], round((time.time() - t1), 5),
                    time.strftime("%H:%M:%S", time.localtime())))
    return []
def run(ts):
    nest_asyncio.apply()  # to be able to run an async loop from within another async loop
    wc_table = 'wc_' + str(int(ts))
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = {}, conn_lim ={}]............ => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), gw.SEMAPHORE_COUNT, gw.CONNECTION_COUNT, wc_table))
    startTime = time.time()

    """ scrape content in async """
    asyncio.get_event_loop().run_until_complete(asyncio.ensure_future(RunAsync(ts)))
    time.sleep(10)

    """ scrape remaining items with sync """
    RunSync(ts)

    """ formatting everything in the end - done in sync """
    time.sleep(10)
    ContentFormatting(ts)

    endTime = time.time()
    pc.printSucc(
        "\n\n\n\n\n****************** Content Scraping is Complete , TABLE: {} ********************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(['Entities (Post Content Scraping-all)', 'Notation(if any)', 'Value'])
    table.add_row(['IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['CS_OUT : ITEMS SCRAPED WITH ASYNC', '[A] (A+B+C=X)', gw.CS_ASYNC_ITEM_SCRAPED])
    table.add_row(['CS_OUT : ITEMS WRITTEN DIRECT(no scraping needed) ', '[B] (A+B+C=X)', gw.CS_ITEMS_WRITTEN_DIRECT])
    table.add_row(['CS_OUT : ITEMS SCRAPED WITH SYNC', '[C] (A+B+C=X)', gw.CS_SYNC_ITEM_SCRAPED])
    table.add_row(['CF_OUT : ITEMS PUT IN WITH SCRAPED CONTENT', '[P] (P+Q=X)', gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK])
    table.add_row(['CF_OUT : x--ITEMS PUT IN WITH TITLE AS CONTENT--x', '[Q] (P+Q=X)', gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT])
    pc.printSucc(table)

    pc.printErr(
        "\n\n------------------------------------------ ERRORS (Written nonetheless, chill) ------------------------------------------------\n"
    )
    table = PrettyTable(['Failures (Post Content Scraping-all)', 'Value'])
    table.add_row(['COUNT. UNREACHABLE URLS - ASYNC ', gw.CS_ASYNC_URL_UNREACHABLE])
    table.add_row(['COUNT. TRY/CATCHED SEMA EXCEP. - ASYNC ', gw.CS_ASYNC_SEMA_EXCEPTION_ERR])
    table.add_row(['COUNT. UNREACHABLE URLS - SYNC ', gw.CS_SYNC_URL_UNREACHABLE])
    table.add_row(['COUNT. TRY/CATCHED EXCEP. - SYNC ', gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR])
    pc.printErr(table)
    print("\n")
    pc.printWarn(
        '\t\t\t\t------------------------->>>>>> [ Time Taken(min) = {} ]\n\n\n\n\n\n'
        .format(round((endTime - startTime) / 60, 5)))
    print("\n\n\n\n")
def ContentFormatting(ts): """ Do: 0. Update Content & WeightedContent column for each row 1. get url_strings_content = getUrlString(row[13]) -> add it in weighted_content 2. do clean_text(row[13]) 2. do clean_text(row[12]) 3. clean text clean_text(row[5]) -> add it in weighted_content :: clean_text(row[12]) + " " + clean_title + " " + url_strings_content 4. if content col is still null; put title into it & in weightedContent too """ wc_db = 'dbs/wc.db' wc_table = 'wc_' + str(int(ts)) conn = sqlite3.connect(wc_db) c = conn.cursor() pc.printMsg( "\t -------------------------------------- < Content Formatter: DB/wc Connection Opened > ---------------------------------------------\n" ) startTime = time.time() pc.printWarn("\tRunning ContentFormatter for wc ....... \t NOW: {}".format( time.strftime("%H:%M:%S", time.localtime()))) pc.printWarn( "\t\t. . . . . . . . . . . .......... Content Formatting Started @Content_Scraper ........... . . . . . . . . . . ." ) signal.signal(signal.SIGALRM, timeout_handler) # timeouts on few function calls, see below q = "select * from " + wc_table rows_head = c.execute(q) rows = rows_head.fetchall() conn.commit() for row in rows: t1 = time.time() row_list = list(row) if (len(row[13]) != 0): gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK += 1 clean_title = clean_text(row_list[5]) if len(row_list[13]) == 0: pc.printWarn( "\t\t\t\t --------- No content found on cleaning, using Title as Content :(" ) row_list[13] = clean_title row_list[12] = clean_title else: raw_content = row_list[13] signal.alarm(200) # Timeout of 200 sec on function call content = clean_title # if timeout happens, this will be the value of content try: content = text_actions.contentfromhtml(raw_content) except Exception as exc: pc.printErr( "\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@ContentFromHtml ! ....using Title as content " .format(row[0], row[1])) # pc.printWarn(exc) pass signal.alarm(200) # Timeout of 200 sec on function call clean_content = clean_title # if timeout happens, this will be the value of content try: clean_content = clean_text(content) except Exception as exc: pc.printErr( "\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@CleanText ! ....using Title as content " .format(row[0], row[1])) # pc.printWarn(exc) pass signal.alarm(200) # Timeout of 200 sec on function call weighted_content = clean_title # if timeout happens, this will be the value of content try: weighted_content = text_actions.weightedcontentfromhtml( raw_content) except Exception as exc: pc.printErr( "\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@WeightedContentFromHtml ! ....using Title as weightedcontent " .format(row[0], row[1])) # pc.printWarn(exc) pass signal.alarm(200) # Timeout of 200 sec on function call clean_weighted_content = clean_title # if timeout happens, this will be the value of content try: clean_weighted_content = clean_text(weighted_content) except Exception as exc: pc.printErr( "\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@CleanText ! ....using Title as weightedcontent " .format(row[0], row[1])) # pc.printWarn(exc) pass signal.alarm(200) # Timeout of 200 sec on function call url_string_text = '' # if timeout happens, this will be the value of content try: url_string_text = getUrlString(raw_content) except Exception as exc: pc.printErr( "\t <ID = {}><src= {} > Timeout of 200 sec happened on URL_STRING@getUrlString ! 
....using empty str as url_string_text " .format(row[0], row[1])) # pc.printWarn(exc) pass row_list[13] = clean_content row_list[ 12] = clean_weighted_content + " " + url_string_text + " " + clean_title row = tuple(row_list) pc.printWarn( "\t <ID = {}><src= {} > [Content Formatting] Done................ \t\t TimeTaken = {} \t NOW: {}" .format(row[0], row[1], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime()))) content = row[13] q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ? where ID = ? and SourceSite = ?' d = (row[13], row[12], row[0], row[1]) c.execute(q, d) conn.commit() # pc.printSucc(" \t\t ============== <ID= {} ><{}> [Content Formatting]-with content INSERTED INTO TABLE =============== ".format(row[0],row[1])) else: #No content gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT += 1 pc.printMsg( "\t <ID = {}><src= {} > [Content Formatting] No content.Using title finally................ \t\t TimeTaken = {} \t NOW: {}" .format(row[0], row[1], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime()))) clean_title = clean_text(row_list[5]) content = clean_title q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ? where ID = ? and SourceSite = ?' d = (content, content, row[0], row[1]) c.execute(q, d) conn.commit() # pc.printSucc(" \t\t ============== <ID= {} ><{}> [Content Formatting]-without content INSERTED INTO TABLE =============== ".format(row[0],row[1])) endTime = time.time() conn.close() pc.printMsg( "\t -------------------------------------- < Content Formatter: DB/wc Connection Closed > ---------------------------------------------\n" ) pc.printSucc( "\n\n***************************** Content Formatting is Complete. TABLE: {} ******************" .format(wc_table)) print("\n\n") table = PrettyTable( ['Success (Post Content Formatting)', 'Notation(if any)', 'Value']) table.add_row([ 'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES ]) table.add_row([ 'OUT : ITEMS PUT IN WITH SCRAPED CONTENT', '[P] (P+Q=X)', gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK ]) table.add_row([ 'OUT : x--ITEMS PUT IN WITH TITLE AS CONTENT--x', '[Q] (P+Q=X)', gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT ]) table.add_row([ 'TIME TAKEN - CONTENT FORMATTING (min)', '-', round((endTime - startTime) / 60, 5) ]) pc.printSucc(table) print("\n") pc.printWarn( '\t\t\t------------------------->>>>>> [ TimeTaken for Content Formatting (min) = {} ]\n' .format(round((endTime - startTime), 5) / 60)) print("\n\n")
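# NOTE: illustrative sketch (assumption): ContentFormatting() above registers `timeout_handler`
# for SIGALRM, but the handler itself is not shown in this section. A handler like the one below,
# which simply raises, is enough to make the guarded contentfromhtml/clean_text/getUrlString calls
# fall through to their title/empty-string fallbacks. The real handler may live elsewhere in this
# module and differ; note also that signal.alarm(0) would cancel a pending alarm once a guarded
# call returns in time.
def timeout_handler(signum, frame):
    raise TimeoutError("SIGALRM fired: call exceeded the allotted time")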
def RunSync(ts): """ NOTE: pdf pages taking a lot of time.Is it right to scrape them still? """ startTime = time.time() wc_db = 'dbs/wc.db' wc_table = 'wc_' + str(int(ts)) conn = sqlite3.connect(wc_db) c = conn.cursor() pc.printMsg( "\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Opened > ---------------------------------------------\n" ) blob_pages = ['.jpg', '.png', '.gif', '.mp3', '.mp4'] q = "select * from " + wc_table + " where length(Content) = 0" rows_head = c.execute(q) rows = rows_head.fetchall() pc.printMsg( "\n\n \t ******************************* ITEMS FOR SYNC TO SCRAPE = {} ******************************\n\n" .format(len(rows))) conn.commit() for row in rows: t1 = time.time() if (len(row[13]) == 0): try: if row[6][-4:] not in blob_pages: response = web_requests.hitGetWithRetry( row[6], '', False, 2, 0.5, 30) if response != -1: gw.CS_SYNC_ITEM_SCRAPED += 1 res = response.text row_list = list(row) row_list[13] = res row = tuple(row_list) pc.printWarn( "\t <ID = {}><src= {} > [SYNCED SCRAPED] Done................ \t\t TimeTaken = {} \t NOW: {} " .format( row[0], row[1], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime()))) q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?' d = (row[13], row[0], row[1]) c.execute(q, d) conn.commit() # pc.printSucc(" \t\t ============== <ID= {} ><{}> [SYNCED SCRAPED] INSERTED INTO TABLE =============== ".format(row[0],row[1])) else: gw.CS_SYNC_URL_UNREACHABLE += 1 pc.printErr( "\t\tXXXXXXXXX [SYNCED SCRAPED]\t SKIPPING... <ID: {}> Totally unable to hit url even in SYNC: {} \t\t TimeTaken = {} \t NOW: {} " .format( row[0], row[6], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime()))) else: pc.printMsg( "\t\txxxxx [SYNCED SCRAPED]\t... for ID: {} Found BLOB page SYNC. Will use title. URL: {} \t\t TimeTaken = {} \t NOW: {} " .format(row[0], row[6], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime()))) except Exception as e: gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR += 1 pc.printErr( "\t XXXXXXXXXXXXXX [SYNC SCRAPING] XXXX ==>> <ID = {}><src= {} > NOW = {} , \t\t TimeTaken = {} ....Sync Scraping failed too.Will use Title for content... \n \t\t ERROR=> {}" .format(row[0], row[1], time.strftime("%H:%M:%S", time.localtime()), round((time.time() - t1), 5), e)) # logging.error(traceback.format_exc()) pass endTime = time.time() conn.close() pc.printMsg( "\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Closed > ---------------------------------------------\n" ) pc.printSucc( "\n\n***************************** Sync Content Scraping is Complete. TABLE: {} ******************" .format(wc_table)) print("\n\n") table = PrettyTable( ['Success (Post Sync Content Scraping)', 'Notation(if any)', 'Value']) table.add_row([ 'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES ]) table.add_row([ 'OUT : ITEMS SCRAPED WITH SYNC', '[C] (A+B+C=X)', gw.CS_SYNC_ITEM_SCRAPED ]) table.add_row([ 'TIME TAKEN - SYNC CONTENT SCRAPING (min)', '-', round((endTime - startTime) / 60, 5) ]) pc.printSucc(table) pc.printErr( "------------------------------------------ ERRORS-SYNC (Written nonetheless, chill) ------------------------------------------------\n" ) table = PrettyTable(['Failures (Post Sync Content Scraping)', 'Value']) table.add_row( ['COUNT. UNREACHABLE URLS - SYNC ', gw.CS_SYNC_URL_UNREACHABLE]) table.add_row([ 'COUNT. TRY/CATCHED EXCEP. 
- SYNC ', gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR ]) pc.printErr(table) print("\n") pc.printWarn( '\t\t\t------------------------->>>>>> [ TimeTaken for Sync Scraping (min) = {} ]\n' .format(round((endTime - startTime), 5) / 60)) print("\n\n")
async def RunAsync(ts):
    """ Runs the async fetch ASYNC_SERIES_CONNECTION times in series; each run fetches its batch in parallel """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)

    """ get rows with content already present & put the count in gw.CS_ITEMS_WRITTEN_DIRECT. Will work just for the 1st iteration """
    c = conn.cursor()
    q = "select count(*) from " + wc_table + " where length(Content) != 0"
    no_scraping_needed_item_count = c.execute(q)
    no_scraping_needed_item_count = c.fetchone()[0]
    gw.CS_ITEMS_WRITTEN_DIRECT = no_scraping_needed_item_count
    conn.commit()

    for i in range(1, gw.ASYNC_SERIES_CONNECTION + 1):
        gw.CS_BOYS_STILL_PLAYING = 0
        pc.printMsg(
            "\n\n..........-------------\/\/\/------\/\/\/------\/\/\/---------------............ Running Async for {} -th time - \t Number of Async-runs remaining: {} \t\t NOW: {}\n\n"
            .format(i, (gw.ASYNC_SERIES_CONNECTION - i), time.strftime("%H:%M:%S", time.localtime())))
        await asyncFetchAll(conn, ts, i)
        pc.printMsg(
            "\t\t..........-------------\/\/\/------............ {} -th Async Running is done. Sleeping for 10 sec now......ZZZZZZZzzzzzzzzz\t\t NOW: {}\n\n"
            .format(i, time.strftime("%H:%M:%S", time.localtime())))
        time.sleep(10)
    conn.close()

    endTime = time.time()
    pc.printSucc(
        "\n\n***************************** All {} Async Content Scraping is Complete. TABLE: {} ******************"
        .format(gw.ASYNC_SERIES_CONNECTION, wc_table))
    print("\n\n")
    table = PrettyTable(['Success (Post ALL series Async Content Scraping)', 'Notation(if any)', 'Value'])
    table.add_row(['IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['OUT : ITEMS WRITTEN DIRECT(no scraping needed) ', '[A] (A+B+C=X)', gw.CS_ITEMS_WRITTEN_DIRECT])
    table.add_row(['OUT : ITEMS SCRAPED WITH ASYNC', '[B] (A+B+C=X)', gw.CS_ASYNC_ITEM_SCRAPED])
    table.add_row(['TIME TAKEN - ASYNC CONTENT SCRAPING (min)', '-', round((endTime - startTime) / 60, 2)])
    pc.printSucc(table)

    pc.printErr(
        "------------------------------------------ ERRORS-ASYNC (Written nonetheless, chill) ------------------------------------------------\n"
    )
    table = PrettyTable(['Failures (Counted as-in last run of Async Content Scraping)', 'Value'])
    table.add_row(['COUNT. UNREACHABLE URLS in ASYNC ', gw.CS_ASYNC_URL_UNREACHABLE])
    table.add_row(['COUNT. TRY/CATCHED SEMA EXCEP. in ASYNC ', gw.CS_ASYNC_SEMA_EXCEPTION_ERR])
    pc.printErr(table)
    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for All {} Async Scraping (min) = {} ]\n'
        .format(gw.ASYNC_SERIES_CONNECTION, round((endTime - startTime) / 60, 5)))
    print("\n\n")
def run(ts): """ Scrapes Algolia's HN api for last 7 days & puts data in WC-DB. * max number of entries in algolia's single api call = 1000. So scrape for one day at a time * Link to documentation: https://hn.algolia.com/api Note: 1. For AskHN entries put `` tag & separate threshold 1. For ShowHN entries put `` tag & separate threshold 1. For Jobs@HN entries put `` tag => later as these entries dont have upvotes/comments Input: ts (format: 1598692058.887741) """ wc_db = 'dbs/wc.db' wc_table = 'wc_' + str(int(ts)) pc.printSucc('@[{}] >>>>>> Started HN-scraper ................... => TABLE: {}\n'.format(datetime.fromtimestamp(ts),wc_table)) conn = sqlite3.connect(wc_db, timeout=10) c = conn.cursor() pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n") startTime = time.time() """ here is how you add day to `ts`: from datetime import datetime, timedelta newts = datetime.fromtimestamp(ts) + timedelta(days=1) # 2020-08-30 16:02:34.352094 newts.timestamp() # 1598783633.284871 datetime.fromtimestamp(ts) #2020-08-29 17:15:32 """ """ ts_arr has last 7 days(including today's) (non-decimal stype)timestamps strings TIP: use `datetime.fromtimestamp(int(t))` to convert to human readable format """ ts_arr = [str(int(ts))] for i in range(6): new_ts = datetime.fromtimestamp(int(ts_arr[-1])) + timedelta(days=-1) new_ts = new_ts.timestamp() ts_arr.append(str(int(new_ts))) # for t in ts_arr: # print("timestamp: {} \t date: {}".format(t,datetime.fromtimestamp(int(t)))) index = gw.WC_TOTAL_URL_ENTRIES + 1 for i in range(len(ts_arr)-1): startepoch = ts_arr[i] endepoch = ts_arr[i+1] pc.printMsg(" ................. scraping for interval: start= {} -> end = {} .................\n".format(startepoch,endepoch)) """ getting stories(articles) with upvotes_count > upvotes_threshold Also including: 1. TellHN (<tech_discuss>) 2. LaunchHN (<startup>) """ pc.printWarn(" \t............. scraping stories .............") try: url_story = 'http://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_STORY_UPVOTE_TH) data = web_requests.hitGetWithRetry(url_story) res_size = json.loads(data.content)["nbHits"] pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size)) gw.HN_TOTAL_ITEMS_GOT_YET += res_size items_arr = json.loads(data.content)["hits"] for item in items_arr: url = 'https://news.ycombinator.com/item?id='+str(item["objectID"]) sourceTag = '' content = '' sourceSite = 'HN' if(item["url"] is None): #as all ShowHNs may not have an url ...hihi... # print( '------------------------- found null urled value ---------------------\n-----[STORY]url: {}'.format(url)) # print(json.dumps(item, indent = 4)) if(item["story_text"] is not None): content = text_actions.getTextFromHtml(item["story_text"]) if("Launch HN:" in item["title"]): # 1. LaunchHN sourceTag = 'startup' sourceSite += '/launch' if("Tell HN:" in item["title"]): # 2. 
TellHN sourceTag = 'tech_discuss' sourceSite += '/tell' else: url = item["url"] entry = [ index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts), date_conversion.HNDate(str(item["created_at"])), item["title"], url, sourceTag, '', item["points"], item["num_comments"], '', '', text_actions.clean_text(content) ] c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) index=index+1 pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET)) except Exception as e: pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e)) logging.error(traceback.format_exc()) pass """ getting ShowHNs """ pc.printWarn("\t............. scraping showHNs .............") try: url_show = 'http://hn.algolia.com/api/v1/search_by_date?tags=show_hn&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_SHOWHN_UPVOTE_TH) data = web_requests.hitGetWithRetry(url_show) res_size = json.loads(data.content)["nbHits"] pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size)) gw.HN_TOTAL_ITEMS_GOT_YET += res_size items_arr = json.loads(data.content)["hits"] for item in items_arr: content = '' sourceSite = 'HN/show' if(item["url"] is None): #as all ShowHNs may not have an url ...hihi... url = 'https://news.ycombinator.com/item?id='+str(item["objectID"]) # print( '-------------------------- found null urled value ---------------------\n-----[SHOW]url: {}'.format(url)) # print(json.dumps(item, indent = 4)) if(item["story_text"] is not None): content = text_actions.getTextFromHtml(item["story_text"]) else: url = item["url"] entry = [ index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts), date_conversion.HNDate(str(item["created_at"])), item["title"], url, 'sideproj', '', item["points"], item["num_comments"], '', '', text_actions.clean_text(content) ] c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) index=index+1 pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET)) except Exception as e: pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e)) logging.error(traceback.format_exc()) pass """ getting AskHNs """ pc.printWarn("\t............. scraping askHNs .............") try: url_ask = 'http://hn.algolia.com/api/v1/search_by_date?tags=ask_hn&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_ASKHN_UPVOTE_TH) data = web_requests.hitGetWithRetry(url_ask) res_size = json.loads(data.content)["nbHits"] pc.printWarn("\t\t\t\t====> Item count: {}".format(res_size)) gw.HN_TOTAL_ITEMS_GOT_YET += res_size items_arr = json.loads(data.content)["hits"] for item in items_arr: content = '' sourceSite = 'HN/ask' if(item["url"] is None): #as AskHNs dont have any url ...hihi... 
url = 'https://news.ycombinator.com/item?id='+str(item["objectID"]) # print( '-------------------------- found null urled value ---------------------\n-----[ASK]url: {}'.format(url)) # print(json.dumps(item, indent = 4)) if(item["story_text"] is not None): content = text_actions.getTextFromHtml(item["story_text"]) else: url = item["url"] entry = [ index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts), date_conversion.HNDate(str(item["created_at"])), item["title"], url, 'prog_query', '', item["points"], item["num_comments"], '', '', text_actions.clean_text(content) ] c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) index=index+1 pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET)) except Exception as e: pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e)) logging.error(traceback.format_exc()) pass endTime = time.time() conn.commit() conn.close() gw.WC_TOTAL_URL_ENTRIES += gw.HN_TOTAL_ITEMS_GOT_YET pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n") pc.printSucc("\n\n***************************** HN Url Scraping is Complete. TABLE: {} ******************".format(wc_table)) print("\n\n") table = PrettyTable(['Entity (Post HN URL Scraping)', 'Value']) table.add_row(['TOTAL URLS FETCHED by HN', gw.HN_TOTAL_ITEMS_GOT_YET]) table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES]) table.add_row(['TIME TAKEN FOR URL SCRAPING-HN (sec) ', round((endTime - startTime),5)]) pc.printSucc(table) print("\n\n")
def run(ts): """ Get top 1000 submissions of the listed subreddits (max_limit is 1000; should be enough) Hence no use of `ts` here """ startTime = time.time() wc_db = 'dbs/wc.db' wc_table = 'wc_' + str(int(ts)) pc.printSucc( '@[{}] >>>>>> Started r-scraper ................... => TABLE: {}\n'. format(datetime.fromtimestamp(ts), wc_table)) pc.printMsg( "\t -------------------------------------- < r_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n" ) conn = sqlite3.connect(wc_db, timeout=10) c = conn.cursor() blob_pages = ['.jpg', '.png', '.gif', '.mp3', '.mp4'] # these give blob data; no point in scraping them index = gw.WC_TOTAL_URL_ENTRIES + 1 # Setup Client reddit = praw.Reddit( client_id=vault.R_CLIENT_ID, # PERSONAL_USE_SCRIPT_14_CHARS client_secret=vault.R_CLIENT_SECRET, # SECRET_KEY_27_CHARS user_agent=vault.R_USER_AGENT, # YOUR_APP_NAME username=vault.R_USERNAME, # YOUR_REDDIT_USER_NAME password=vault.R_PASSWORD) # YOUR_REDDIT_LOGIN_PASSWORD for subreddit, tag_arr in LIST.items(): try: pc.printWarn( "\t ............ Subreddit@R_UrlScraping : {} .............". format(subreddit)) sr = reddit.subreddit(subreddit) # for submission in sr.top('day',limit=10): # For testing.... # for submission in sr.top('year',limit=1000): #remove this & uncomemnt below line ENTRIES_IN_THIS_SUBRDDIT = 0 for submission in sr.top('week', limit=gw.R_ITEM_LIMIT_PER_SUBREDDIT ): #NOTE: max limit is 1000 #Check1: if the post is unlocked by mods content = '' """ Fixing permalink type urls """ url = submission.url if (url[:2] == '/r'): url = "https://www.reddit.com" + url if (submission.locked == False): #Check2: if post is just an image, discard it if submission.url[ -4:] not in blob_pages: #as reddit currentluy hosts .png & .gif only # if permalink is a substring of url OR submission is a selfpost (text-only) => no need to scrape # NOTE: I know there might be links in post with some discription+link to other article he's reffering; but not worth wasting precious processing time if ((submission.permalink in submission.url) or (submission.is_self == True)): content = submission.selftext entry = [ index, "r/" + subreddit, datetime.fromtimestamp(ts).date(), int(ts), date_conversion.RedditDate( str(datetime.fromtimestamp( submission.created))), submission.title, url, json.dumps(tag_arr), '', submission.score, submission.num_comments, '', '', text_actions.clean_text(content) ] # csv_functions.putToCsv(csv_file,entry) c.execute( 'INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) index += 1 ENTRIES_IN_THIS_SUBRDDIT += 1 gw.R_TOTAL_ITEMS_GOT_YET += ENTRIES_IN_THIS_SUBRDDIT pc.printMsg( "\t\t\t\t\t ====> ENTRIES_IN_THIS_SUBRDDIT = {} \t\t | \t gw.R_TOTAL_ITEMS_GOT_YET = {}" .format(ENTRIES_IN_THIS_SUBRDDIT, gw.R_TOTAL_ITEMS_GOT_YET)) except Exception as e: pc.printErr( " \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n" .format(index, e)) logging.error(traceback.format_exc()) pass endTime = time.time() gw.WC_TOTAL_URL_ENTRIES += gw.R_TOTAL_ITEMS_GOT_YET conn.commit() conn.close() pc.printMsg( "\t -------------------------------------- < r_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n" ) pc.printSucc( "\n\n***************************** Reddit Url Scraping is Complete. 
TABLE: {} ******************" .format(wc_table)) print("\n\n") table = PrettyTable(['Entity (Post r URL Scraping)', 'Value']) table.add_row(['TOTAL URLS FETCHED by HN', gw.R_TOTAL_ITEMS_GOT_YET]) table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES]) table.add_row([ 'TIME TAKEN FOR URL SCRAPING-r (min) ', round((endTime - startTime) / 60, 2) ]) pc.printSucc(table) print("\n\n")
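# NOTE: shape-only sketch (assumption): the `LIST` iterated in run() above is expected to map each
# subreddit name to the array of model tags that gets json.dumps()-ed into the SourceTags column.
# The real mapping is defined elsewhere in the repo and will differ; `_LIST_EXAMPLE` below is
# hypothetical and only documents the expected structure.
_LIST_EXAMPLE = {
    'programming': ['prog_discuss'],
    'startups': ['startup'],
}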
def RunSync(ts): """ Pick wc-db's table mapped with `ts` and scrapes (useful) "clean" Content & WeightedContent from url. * NOTE: * If conent is already present in the table, "clean" it too & append the newly scraped content to it. * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k Input: ts (format: 1598692058.887741) """ pc.printMsg( '@[{}] >>>>>> Started Content-scraper(SYNC) ................... => FILENAME: {}\n' .format(datetime.fromtimestamp(ts), 'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv')) csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str( int(ts)) + '.csv' csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str( int(ts)) + '_wc_sync.csv' index = 1 headers = [ 'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content' ] csv_functions.creteCsvFile(csv_dest_file, headers) f = csv.writer(open(csv_dest_file, "w")) # Flush the old file f.writerow([ 'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content' ]) with open(csv_src_file, mode='r') as csvfile: csv_reader = csv.DictReader(csvfile) line_count = 0 for row in csv_reader: if line_count == 0: print(f'Headers are {", ".join(row)}') line_count += 1 #CHECK1(pre scraping): if (content != NULL) => no scraping, just put it in as is if (len(row["Content"]) != 0): pc.printWarn( "\t <ID = {} > [NO SCRAPING] Content already exists....putting as it is............. NOW: {}" .format(row["ID"], time.strftime("%H:%M:%S", time.localtime()))) entry = [ row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"], row["CreationDate"], row["Title"], row["Url"], row["SourceTags"], row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"], text_actions.clean_text(row["Title"] + row["WeightedContent"]) + text_actions.getUrlString( row["Content"]), #add the url-words too text_actions.clean_text(row["Content"]) + text_actions.getUrlString(row["Content"]) ] global WRITTEN_ENTRIES_SYNC WRITTEN_ENTRIES_SYNC += 1 f = csv.writer(open(csv_dest_file, "a")) f.writerow(entry) #CHECK2(pre scraping): if(url == NULL)=>discard #CHECK3(pre scraping): if (row["title"]==NULL)=>discard elif ((len(row["Url"]) != 0) and (len(row["Title"]) != 0)): pc.printWarn( "\t <ID = {} > [SCRAPING BEGIN] sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. 
NOW: {}" .format(row["ID"], time.strftime("%H:%M:%S", time.localtime()))) time.sleep(0.0001) try: # response = web_requests.hitGetWithRetry(url,TIMEOUT=10) response = web_requests.hitGetWithRetry( row["Url"], '', False, 2, 0.5, 60) # if response.status_code == 200: if response != -1: # content = text_actions.contentfromhtml(response) #NOTE: for sync content = text_actions.contentfromhtml( response.text) #NOTE: for Async urlstrings = text_actions.getUrlString(content) content += urlstrings #add the url-words too # weightedcontent = text_actions.weightedcontentfromhtml(response.text) + row["Title"] + urlstrings #add the url-words too #NOTE: for sync weightedcontent = text_actions.weightedcontentfromhtml( response.text ) + row[ "Title"] + urlstrings #add the url-words too #NOTE: for async line_count += 1 #CHECK1(post scraping): if (content == null)&&(row["Title"] != null)<already checked abouve>=> row["Content"] = clean_text(row["title"]) AND row["weightedContent"] = clean_text(row["title"]) if (len(content) == 0): content = row["Title"] weightedcontent = row["Title"] else: entry = [ row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"], row["CreationDate"], row["Title"], row["Url"], row["SourceTags"], row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"], text_actions.clean_text(weightedcontent), text_actions.clean_text(content) ] f = csv.writer(open(csv_dest_file, "a")) f.writerow(entry) pc.printMsg( "\t\t <ID = {} > ============== Scraping Done....... \t NOW: {}" .format( row["ID"], time.strftime("%H:%M:%S", time.localtime()))) else: global SKIPPED_SYNC SKIPPED_SYNC += 1 pc.printErr( "\t\txxxxx SKIPPING... for ID: {} Unable to hit url: {} , " .format(row["ID"], row["Url"])) except Exception as e: global FAILED_SYNC FAILED_SYNC += 1 pc.printErr( "\t======= XXXXXXXX ERROR XXXXXX ======>> ID= {} NOW = {} Skipping...Failed due to: \n \t\t ERROR {}" .format(row["ID"], time.strftime("%H:%M:%S", time.localtime()), e)) pass pc.printMsg( "\n****************** Content Scraping is Complete , FILENAME: {} ********************\n" .format('dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv')) pc.printMsg( "\n----------------------------------------------------------------------------------\n" ) pc.printMsg( "|\tWRITTEN_ENTRIES_SYNC \t | \t {} \t|".format(WRITTEN_ENTRIES_SYNC)) pc.printMsg("|\tSKIPPED_SYNC \t | \t {} \t|".format(SKIPPED_SYNC)) pc.printMsg("|\tFAILED_SYNC \t | \t {} \t|".format(FAILED_SYNC)) pc.printMsg( "\n----------------------------------------------------------------------------------\n" )
async def semaphoreSafeFetch(sem, row, session):
    """ Wraps fetchWithRetry in the semaphore so no more than `sem` fetches run concurrently """
    global BOYS_RETURNED_HOME_ALIVE
    global BOYS_RETURNED_HOME_DEAD
    async with sem:
        try:
            row = await fetchWithRetry(row, session)
            BOYS_RETURNED_HOME_ALIVE += 1
            print(" \t\t\t\t\t\t\t\t\t\t\t\t BOYS_RETURNED_HOME_ALIVE = {}".format(BOYS_RETURNED_HOME_ALIVE))
            return row
        except Exception as e:
            BOYS_RETURNED_HOME_DEAD += 1
            print(" \t\t\t\t\t\t\t\t\t\t\t\t BOYS_RETURNED_HOME_DEAD = {}".format(BOYS_RETURNED_HOME_DEAD))
            # This error is mainly because of:
            ## 1. [nodename nor servname provided, or not known]
            ## 2. [Too many open files] => UPDATE: got fixed with using sqlite
            pc.printErr(
                "\t======= XXXXXXXX ERROR XXXXXX ======>> <ID = {}><src= {} > NOW = {} Scraping failed. Using Title for Content.... \n \t\t ERROR=> {}"
                .format(row[0], row[1], time.strftime("%H:%M:%S", time.localtime()), e))
            logging.error(traceback.format_exc())
            if len(row[13]) == 0:
                row_list = list(row)
                row_list[12] = row_list[5]
                row_list[13] = row_list[5]
                row = tuple(row_list)
            global ERR_ASYNC_TRIED_ERR
            ERR_ASYNC_TRIED_ERR += 1
            pc.printMsg(
                " \t\t\t============== [Tried Catch] Done Writing into csv for <ID = {}><src= {} > =============== "
                .format(row[0], row[1]))
            return row
async def asyncFetchAll(csv_in, csv_out):
    """
        INPUT: csv_src_file & csv_dest_file(to be written)
        NOTE:
            * Semaphore limit is set below (currently 1000)
            * While writing the response to csv_dest_file, it is done in chunks of `N` entries at a time
    """
    tasks = []
    sem = asyncio.Semaphore(1000)

    """ Initialize the output file """
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
        'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
        'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_out, headers)

    connector = TCPConnector(limit=0)
    async with ClientSession(headers={'Connection': 'keep-alive'}, connector=connector) as session:
        with open(csv_in, mode='r') as csvfile:
            csv_reader = csv.DictReader(csvfile)
            line_count = 0
            global ENTRIES_TO_BE_WRITTEN
            for row in csv_reader:
                ENTRIES_TO_BE_WRITTEN += 1
                if (len(row["Content"]) != 0):
                    pc.printWarn(
                        "\t <ID = {}><src= {} > [NO SCRAPING] Content already exists............... NOW: {}"
                        .format(row["ID"], row["SourceSite"], time.strftime("%H:%M:%S", time.localtime())))
                    row["WeightedContent"] = text_actions.clean_text(
                        row["Title"] + row["WeightedContent"]) + text_actions.getUrlString(row["Content"])
                    row["Content"] = text_actions.clean_text(
                        row["Content"]) + text_actions.getUrlString(row["Content"])
                    entry = [
                        row["ID"], row["SourceSite"], row["ProcessingDate"],
                        row["ProcessingEpoch"], row["CreationDate"], row["Title"],
                        row["Url"], row["SourceTags"], row["ModelTags"],
                        row["NumUpvotes"], row["NumComments"], row["PopI"],
                        row["WeightedContent"], row["Content"],
                    ]
                    csv_functions.putToCsv(csv_out, entry)
                    global WRITTEN_ENTRIES_ASYNC_DIRECT
                    WRITTEN_ENTRIES_ASYNC_DIRECT += 1
                    pc.printMsg(
                        " \t\t ============== Done Writing into csv for <ID = {}><src= {} >=============== "
                        .format(row["ID"], row["SourceSite"]))
                elif (row["Url"] and row["Title"]):
                    task = asyncio.ensure_future(semaphoreSafeFetch(sem, row, session))
                    tasks.append(task)

            responses = await asyncio.gather(*tasks)
            pc.printMsg(
                "\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ len(responses):: to be scraped = {} @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"
                .format(len(responses)))

            for row in responses:
                if row["Content"] or row["Title"]:
                    if len(row["Content"]) == 0:  # scraping produced no content, so fall back to the title
                        row["Content"] = row["Title"]
                    entry = [
                        row["ID"], row["SourceSite"], row["ProcessingDate"],
                        row["ProcessingEpoch"], row["CreationDate"], row["Title"],
                        row["Url"], row["SourceTags"], row["ModelTags"],
                        row["NumUpvotes"], row["NumComments"], row["PopI"],
                        row["WeightedContent"], row["Content"],
                    ]
                    await write_result(csv_out, entry)
                    # csv_functions.putToCsv(csv_out, entry)
                    global WRITTEN_ENTRIES_ASYNC_SCRAPED
                    WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                    pc.printMsg(
                        " \t\t ============== Done Writing into csv for <ID = {}><src= {} > =============== "
                        .format(row["ID"], row["SourceSite"]))
                else:
                    pc.printErr(
                        "\t\t xxxxxxxxxxxxxxxxxxx Skipping for <ID = {}><src= {} > As No Content & Title xxxxxxxxxxxxxxxxxxxxxxxx\n"
                        .format(row["ID"], row["SourceSite"]))
async def fetchWithRetry(row, session, csv_out):
    """
        Hits url (with retries):
            * if status == 200: puts content into csv
            * if still unable to hit after retries: Content = Title, WeightedContent = Title
    """
    status = 400
    retry_cnt = 2
    sleep_time = 10
    TIMEOUT = 10

    while retry_cnt > 0 and status != 200:
        async with session.get(row["Url"], ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH), timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc("\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}".format(row["ID"], row["SourceSite"], time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.weightedcontentfromhtml(res) + row["Title"] + urlstrings
                row["Content"] = text_actions.contentfromhtml(res) + urlstrings
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row["ID"],row["SourceSite"],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                if (len(row["Title"]) != 0):
                    if len(row["Content"]) == 0:
                        row["WeightedContent"] = row["Title"]
                        row["Content"] = row["Title"]
                    await write_result(csv_out, row)
                    global WRITTEN_ENTRIES_ASYNC_SCRAPED
                    WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                    pc.printMsg(" \t\t ============== [Scraped] Done Writing into csv for <ID = {}><src= {} > =============== ".format(row["ID"], row["SourceSite"]))
                else:
                    global WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING
                    WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr("\t\t xxxxxxxxxxxxxxxxxxx SKIPPING for <ID = {}><src= {} > As No Title xxxxxxxxxxxxxxxxxxxxxxxx\n".format(row["ID"], row["SourceSite"]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn("\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x".format(row["ID"], row["SourceSite"], status, row["Url"][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr("\t\txxxxx For <ID = {}><src= {} > Totally unable to hit url.... using Title for Content & WeightedContent : {} ".format(row["ID"], row["SourceSite"], row["Url"]))
    if len(row["Content"]) == 0:
        row["WeightedContent"] = row["Title"]
        row["Content"] = row["Title"]
    await write_result(csv_out, row)
    global WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR
    WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(" \t\t\t ============== [Unreachable URL] Done Writing into csv for <ID = {}><src= {} > =============== ".format(row["ID"], row["SourceSite"]))
    return row
async def fetchWithRetry(row, session):
    """
        Hits url (with retries):
            * if status == 200: returns response ((raw) Content & (raw) WeightedContent in row)
            * if still unable to hit after retries: Content = Title, WeightedContent = Title
        INPUT: `row` is a tuple with indices:
            ID(0), SourceSite(1), ProcessingDate(2), ProcessingEpoch(3), CreationDate(4), Title(5), Url(6),
            SourceTags(7), ModelTags(8), NumUpvotes(9), NumComments(10), PopI(11), WeightedContent(12), Content(13)
    """
    status = 400
    retry_cnt = 2
    sleep_time = 5
    # TIMEOUT = ClientTimeout(total=20)
    TIMEOUT = 20

    while retry_cnt > 0 and status != 200:
        async with session.get(row[6], ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH), timeout=TIMEOUT) as response:
            res = await response.text()
            # res = await response.content.read()
            # res = await text_actions.clean_text(str(response.content.read()))
            res = text_actions.clean_text(str(res))
            # res = res.encode('utf8', 'ignore').decode('utf8', 'ignore')   #FIXME: not working
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc("\t\t <ID = {}><src= {} > ============== #Scraped ....... \t NOW: {}".format(row[0], row[1], time.strftime("%H:%M:%S", time.localtime())))
                row_list = list(row)
                row_list[12] = text_actions.weightedcontentfromhtml(res)
                row_list[13] = text_actions.contentfromhtml(res)
                # for i in range(len(row_list)):
                #     row_list[i] = row_list[i].decode("utf-8", "ignore")
                row = tuple(row_list)
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row[0],row[1],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                if (len(row[13]) == 0):
                    global ERR_ASYNC_NO_CONTENT_IN_SCRAPING
                    ERR_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr("\t\t xxxxxxxxxxxxxxxxxxx SKIPPING for <ID = {}><src= {} > As No Content even after scraping xxxxxxxxxxxxxxxxxxxxxxxx\n".format(row[0], row[1]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn("\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x".format(row[0], row[1], status, row[6][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr("\t\txxxxx For <ID = {}><src= {} > Totally unable to hit url.... using Title for Content & WeightedContent : {} ".format(row[0], row[1], row[6]))
    global ERR_ASYNC_ON_URL_ERROR
    ERR_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(" \t\t\t ============== [Unreachable URL] Will write anyways. <ID = {}><src= {} > =============== ".format(row[0], row[1]))
    return row
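# ----------------------------------------------------------------------------------
# Illustration only: a sketch of how the tuple-row contract documented above could be
# exercised in isolation. The field values are made up; demo_single_fetch is not part
# of the pipeline and is never called here.
async def demo_single_fetch():
    row = (
        1, 'HN', '2020-08-29', '1598692058', '2020-08-28',   # ID(0)..CreationDate(4)
        'Example title', 'https://example.com',              # Title(5), Url(6)
        '', '', '10', '2', '0.5',                            # SourceTags(7)..PopI(11)
        '', '',                                               # WeightedContent(12), Content(13)
    )
    async with ClientSession() as session:
        return await fetchWithRetry(row, session)

# asyncio.get_event_loop().run_until_complete(demo_single_fetch())
# ----------------------------------------------------------------------------------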
def RunAsync(ts):
    """
        Picks the wc-db table mapped to `ts` and scrapes (useful) "clean" Content & WeightedContent from each Url - ASYNCHRONOUSLY
            * NOTE:
                * If content is already present in the table, "clean" it too & append the newly scraped content to it.
                * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
        Input: ts (format: 1598692058.887741)
    """
    global CONNTECTION_COUNT, SEMAPHORE_COUNT
    wc_table = 'wc_' + str(int(ts))
    pc.printMsg('@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = {}, conn_lim ={}]............ => TABLE: {}\n'.format(datetime.fromtimestamp(ts), SEMAPHORE_COUNT, CONNTECTION_COUNT, wc_table))
    startTime = time.time()

    # csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_'+str(int(ts))+'.csv'
    # csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_'+str(int(ts))+'_wc.csv'

    # Run the async job
    asyncio.get_event_loop().run_until_complete(asyncio.ensure_future(asyncFetchAll(ts)))

    endTime = time.time()

    pc.printSucc("\n****************** (Async)Content Scraping is Complete , TABLE: {} ********************".format(wc_table))
    pc.printMsg("\n--------------------------------------------------------------------------------------------------------------------------------")
    pc.printMsg("|\t\t IN : Total Entries in Url-Scraped Output Table \t\t | \t\t {} \t\t|".format(ENTRIES_TO_BE_WRITTEN))
    pc.printMsg("|\t\t OUT: WRITTEN_ENTRIES_ASYNC_DIRECT(content exists) \t\t | \t\t {} \t\t|".format(WRITTEN_ENTRIES_ASYNC_DIRECT))
    pc.printMsg("|\t\t OUT: WRITTEN_ENTRIES_ASYNC_SCRAPED(scraped entries) \t\t | \t\t {} \t\t|".format(WRITTEN_ENTRIES_ASYNC_SCRAPED))
    pc.printErr("\n\n------------------ ERRORS In Scraping (Written nonetheless; counted in WRITTEN_ENTRIES_ASYNC_SCRAPED) --------------------------\n")
    pc.printMsg("=================================================================================================================================")
    pc.printErr("|\t\t ERR_ASYNC_NO_CONTENT_IN_SCRAPING(url hit; no content extracted) \t\t | \t\t {} \t\t|".format(ERR_ASYNC_NO_CONTENT_IN_SCRAPING))
    pc.printErr("|\t\t ERR_ASYNC_ON_URL_ERROR(url not hit) \t\t | \t\t {} \t\t|".format(ERR_ASYNC_ON_URL_ERROR))
    pc.printErr("|\t\t ERR_ASYNC_TRIED_ERR(other try/catch errs) \t\t | \t\t {} \t\t|".format(ERR_ASYNC_TRIED_ERR))
    pc.printMsg("---------------------------------------------------------------------------------------------------------------------------------\n")
    pc.printWarn('\t\t\t\t------------------------->>>>>> [ Semaphore Count = {}, Tcp connector limit ={} ]\n'.format(SEMAPHORE_COUNT, CONNTECTION_COUNT))
    pc.printWarn('\t\t\t\t------------------------->>>>>> [ Time Taken(sec) = {} ]\n'.format(int(endTime - startTime)))
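# ----------------------------------------------------------------------------------
# Illustration only: one way this entry point might be driven. The timestamp format
# matches the docstring above (e.g. 1598692058.887741); the wc_<ts> table for that
# timestamp must already exist, so in practice `ts` would be the run timestamp shared
# by the rest of the pipeline rather than a fresh time.time(). Kept commented so the
# module's import behaviour is unchanged.
#
# if __name__ == '__main__':
#     ts = 1598692058.887741   # hypothetical run timestamp
#     RunAsync(ts)
# ----------------------------------------------------------------------------------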