async def fetchWithRetry(row, session):
    """
    Hits row["Url"] with retries.
        * on status 200: scrapes the page and fills row["Content"] & row["WeightedContent"]
        * if the scraped content comes back empty: falls back to the Title
        * if all retries fail: bumps SKIPPED_ASYNC and returns the row unchanged
    """
    status = 400
    retry_cnt = 3
    sleep_time = 10
    TIMEOUT = 60

    while retry_cnt > 0 and status != 200:
        # NOTE: Purpose.CLIENT_AUTH builds a context without server-cert/hostname
        # verification, so bad certificates won't abort the scrape.
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if status == 200 and len(res) != 0:
                pc.printSucc("\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}"
                             .format(row["ID"], row["SourceSite"],
                                     time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.clean_text(
                    text_actions.weightedcontentfromhtml(res) + row["Title"] + urlstrings)
                row["Content"] = text_actions.clean_text(
                    text_actions.contentfromhtml(res) + urlstrings)
                if len(row["Content"]) == 0:
                    # Nothing useful scraped: fall back to the title.
                    row["WeightedContent"] = text_actions.clean_text(row["Title"])
                    row["Content"] = text_actions.clean_text(row["Title"])
                return row
            else:
                retry_cnt -= 1
                pc.printWarn("\t x---------------- Unable to hit URL(ERR_CODE={}): {} Sleeping for {} Retries remaining = {} -------------x"
                             .format(status, row["Url"], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr("\t\txxxxx SKIPPING... for <ID = {}><src= {} > Unable to hit url: {}"
                .format(row["ID"], row["SourceSite"], row["Url"]))
    global SKIPPED_ASYNC
    SKIPPED_ASYNC += 1
    return row
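# ---------------------------------------------------------------------------
# The functions in this module lean on a set of imports, project helpers
# (pc, text_actions, csv_functions, web_requests, date_conversion, gw, vault)
# and module-level counters defined elsewhere in the repo. A rough sketch of
# what that preamble presumably looks like (names inferred from the call sites;
# the exact module names are an assumption):
#
#   import asyncio, csv, json, logging, os, sqlite3, ssl, time, traceback
#   from datetime import datetime, timedelta
#   from aiohttp import ClientSession, TCPConnector
#   import praw
#   from prettytable import PrettyTable
#
#   import printColors as pc            # hypothetical module name
#   import text_actions, csv_functions, web_requests, date_conversion
#   import global_vars as gw            # hypothetical module name
#   import vault                        # reddit credentials
#
#   SKIPPED_ASYNC = 0
#   ENTRIES_TO_BE_WRITTEN = 0
#   WRITTEN_ENTRIES_ASYNC_DIRECT = 0
#   WRITTEN_ENTRIES_ASYNC_SCRAPED = 0
# ---------------------------------------------------------------------------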
async def asyncFetchAll(csv_in, csv_out):
    """
    INPUT: csv_src_file & csv_dest_file (to be written)
    NOTE:
        * Semaphore limit is: 1000
        * Rows that already have Content are cleaned & written to csv_dest_file directly;
          the rest are scraped concurrently and written as the responses come back
    """
    tasks = []
    sem = asyncio.Semaphore(1000)
    global ENTRIES_TO_BE_WRITTEN, WRITTEN_ENTRIES_ASYNC_DIRECT, WRITTEN_ENTRIES_ASYNC_SCRAPED

    """ Initialize the output file """
    headers = ['ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
               'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
               'PopI', 'WeightedContent', 'Content']
    csv_functions.creteCsvFile(csv_out, headers)

    connector = TCPConnector(limit=0)
    async with ClientSession(headers={'Connection': 'keep-alive'}, connector=connector) as session:
        with open(csv_in, mode='r') as csvfile:
            csv_reader = csv.DictReader(csvfile)
            for row in csv_reader:
                ENTRIES_TO_BE_WRITTEN += 1
                if len(row["Content"]) != 0:
                    # Content already present: clean it and write it out directly, no scraping.
                    pc.printWarn("\t <ID = {}><src= {} > [NO SCRAPING] Content already exists............... NOW: {}"
                                 .format(row["ID"], row["SourceSite"],
                                         time.strftime("%H:%M:%S", time.localtime())))
                    row["WeightedContent"] = text_actions.clean_text(
                        row["Title"] + row["WeightedContent"]) + text_actions.getUrlString(row["Content"])
                    row["Content"] = text_actions.clean_text(
                        row["Content"]) + text_actions.getUrlString(row["Content"])
                    # Keep the column order in sync with `headers`: WeightedContent before Content.
                    entry = [row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                             row["CreationDate"], row["Title"], row["Url"], row["SourceTags"],
                             row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"],
                             row["WeightedContent"], row["Content"]]
                    csv_functions.putToCsv(csv_out, entry)
                    WRITTEN_ENTRIES_ASYNC_DIRECT += 1
                    pc.printMsg(" \t\t ============== Done Writing into csv for <ID = {}><src= {} > =============== "
                                .format(row["ID"], row["SourceSite"]))
                elif row["Url"] and row["Title"]:
                    # No content yet, but we have a url & title: queue it for scraping.
                    task = asyncio.ensure_future(semaphoreSafeFetch(sem, row, session))
                    tasks.append(task)

            responses = await asyncio.gather(*tasks)
            pc.printMsg("\n@@@@@@@@@@@@@@@@@@@@ len(responses):: to be scraped = {} @@@@@@@@@@@@@@@@@@@@\n"
                        .format(len(responses)))

            for row in responses:
                if row["Content"] or row["Title"]:
                    if len(row["Content"]) == 0:
                        # Scraping produced nothing: fall back to the title as content.
                        row["Content"] = row["Title"]
                    entry = [row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                             row["CreationDate"], row["Title"], row["Url"], row["SourceTags"],
                             row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"],
                             row["WeightedContent"], row["Content"]]
                    await write_result(csv_out, entry)
                    WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                    pc.printMsg(" \t\t ============== Done Writing into csv for <ID = {}><src= {} > =============== "
                                .format(row["ID"], row["SourceSite"]))
                else:
                    pc.printErr("\t\t xxxxxxxxxxxxxxxxxxx Skipping for <ID = {}><src= {} > As No Content & Title xxxxxxxxxxxxxxxxxxxxxxxx\n"
                                .format(row["ID"], row["SourceSite"]))
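# `semaphoreSafeFetch` and `write_result` are called above but defined elsewhere in
# the repo; this is only a minimal sketch of what they presumably do (behaviour
# inferred from the call sites, not the actual implementation).
async def semaphoreSafeFetch(sem, row, session):
    # Gate fetchWithRetry behind the shared semaphore so only a bounded number
    # of requests are in flight at once.
    async with sem:
        return await fetchWithRetry(row, session)


async def write_result(csv_out, entry):
    # Append one finished entry to the output csv. (A real implementation might
    # buffer rows or hand the write off to a thread to avoid blocking the loop.)
    with open(csv_out, 'a', newline='') as f:
        csv.writer(f).writerow(entry)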
def RunSync(ts):
    """
    Picks wc-db's table mapped to `ts` and scrapes (useful) "clean" Content & WeightedContent from each Url.
        * NOTE:
            * If Content is already present in the table, "clean" it too & append the url-words to it.
            * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
    Input: ts (format: 1598692058.887741)
    """
    pc.printMsg('@[{}] >>>>>> Started Content-scraper(SYNC) ................... => FILENAME: {}\n'
                .format(datetime.fromtimestamp(ts), 'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'))

    csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(int(ts)) + '.csv'
    csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_sync.csv'

    headers = ['ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
               'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
               'PopI', 'WeightedContent', 'Content']
    csv_functions.creteCsvFile(csv_dest_file, headers)   # flushes the old file and writes the header row

    global WRITTEN_ENTRIES_SYNC, SKIPPED_SYNC, FAILED_SYNC

    with open(csv_src_file, mode='r') as csvfile, open(csv_dest_file, 'a', newline='') as destfile:
        csv_reader = csv.DictReader(csvfile)
        writer = csv.writer(destfile)
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Headers are {", ".join(row)}')
                line_count += 1

            # CHECK1 (pre scraping): if Content is already non-empty => no scraping, clean & write as-is
            if len(row["Content"]) != 0:
                pc.printWarn("\t <ID = {} > [NO SCRAPING] Content already exists....putting as it is............. NOW: {}"
                             .format(row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                entry = [row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                         row["CreationDate"], row["Title"], row["Url"], row["SourceTags"],
                         row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"],
                         text_actions.clean_text(row["Title"] + row["WeightedContent"])
                         + text_actions.getUrlString(row["Content"]),      # add the url-words too
                         text_actions.clean_text(row["Content"])
                         + text_actions.getUrlString(row["Content"])]
                WRITTEN_ENTRIES_SYNC += 1
                writer.writerow(entry)

            # CHECK2 (pre scraping): if Url is empty => discard
            # CHECK3 (pre scraping): if Title is empty => discard
            elif len(row["Url"]) != 0 and len(row["Title"]) != 0:
                pc.printWarn("\t <ID = {} > [SCRAPING BEGIN] sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}"
                             .format(row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                time.sleep(0.0001)
                try:
                    response = web_requests.hitGetWithRetry(row["Url"], '', False, 2, 0.5, 60)
                    if response != -1:
                        content = text_actions.contentfromhtml(response.text)
                        urlstrings = text_actions.getUrlString(content)
                        content += urlstrings                                       # add the url-words too
                        weightedcontent = (text_actions.weightedcontentfromhtml(response.text)
                                           + row["Title"] + urlstrings)             # add the url-words too
                        line_count += 1

                        # CHECK1 (post scraping): if content is still empty, fall back to the Title.
                        # (The earlier version only wrote the row when content was non-empty,
                        # which silently dropped these fallback rows.)
                        if len(content) == 0:
                            content = row["Title"]
                            weightedcontent = row["Title"]

                        entry = [row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                                 row["CreationDate"], row["Title"], row["Url"], row["SourceTags"],
                                 row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"],
                                 text_actions.clean_text(weightedcontent),
                                 text_actions.clean_text(content)]
                        writer.writerow(entry)
                        pc.printMsg("\t\t <ID = {} > ============== Scraping Done....... \t NOW: {}"
                                    .format(row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                    else:
                        SKIPPED_SYNC += 1
                        pc.printErr("\t\txxxxx SKIPPING... for ID: {} Unable to hit url: {}"
                                    .format(row["ID"], row["Url"]))
                except Exception as e:
                    FAILED_SYNC += 1
                    pc.printErr("\t======= XXXXXXXX ERROR XXXXXX ======>> ID= {} NOW = {} Skipping...Failed due to: \n \t\t ERROR {}"
                                .format(row["ID"], time.strftime("%H:%M:%S", time.localtime()), e))

    pc.printMsg("\n****************** Content Scraping is Complete , FILENAME: {} ********************\n"
                .format('dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'))
    pc.printMsg("\n----------------------------------------------------------------------------------\n")
    pc.printMsg("|\tWRITTEN_ENTRIES_SYNC \t | \t {} \t|".format(WRITTEN_ENTRIES_SYNC))
    pc.printMsg("|\tSKIPPED_SYNC \t | \t {} \t|".format(SKIPPED_SYNC))
    pc.printMsg("|\tFAILED_SYNC \t | \t {} \t|".format(FAILED_SYNC))
    pc.printMsg("\n----------------------------------------------------------------------------------\n")
def cleanNcheckAsyncOutput(csv_in, csv_out):
    """
    Analyses the input & the freshly created output files.
    Also cleans Content & WeightedContent -> puts them in a new file (the old one can then be deleted).
    Variables:
        * NO_LINES_IN_INPUT_CSV
        * NO_LINES_IN_OUTPUT_CSV
        * NO_LINES_IN_OUTPUT_WITHOUT_TITLE
        * NO_LINES_IN_OUTPUT_WITHOUT_URL
        * NO_LINES_IN_OUTPUT_WITHOUT_CONTENT
    """
    with open(csv_in, "r") as f:
        reader = csv.reader(f)
        NO_LINES_IN_INPUT_CSV = len(list(reader))

    """ Now check and create the new "cleaned" file """
    headers = ['ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
               'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
               'PopI', 'WeightedContent', 'Content']
    csv_final_out = os.path.join("F", csv_out)
    csv_functions.creteCsvFile(csv_final_out, headers)
    pc.prCyan(" ========================== NOW CREATING FINAL OUTPUT FILE: {} ==========================="
              .format(csv_final_out))

    NO_LINES_IN_OUTPUT_CSV = 0
    NO_LINES_IN_OUTPUT_WITHOUT_TITLE = 0
    NO_LINES_IN_OUTPUT_WITHOUT_URL = 0
    NO_LINES_IN_OUTPUT_WITHOUT_CONTENT = 0

    with open(csv_out, mode='r') as r, open(csv_final_out, 'a+', newline='') as f:
        reader = csv.DictReader(r)   # DictReader already consumes the header row
        writer = csv.writer(f)
        for row in reader:
            url_string_content = text_actions.getUrlString(row["Content"])
            content = text_actions.clean_text(row["Content"])
            weighted_content = text_actions.clean_text(row["WeightedContent"])
            entry = [row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                     row["CreationDate"], row["Title"], row["Url"], row["SourceTags"],
                     row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"],
                     weighted_content + url_string_content,
                     content]
            writer.writerow(entry)
            NO_LINES_IN_OUTPUT_CSV += 1
            if len(row["Title"]) == 0:
                NO_LINES_IN_OUTPUT_WITHOUT_TITLE += 1
            if len(row["Url"]) == 0:
                NO_LINES_IN_OUTPUT_WITHOUT_URL += 1
            if len(row["Content"]) == 0:
                NO_LINES_IN_OUTPUT_WITHOUT_CONTENT += 1

    # TODO: os.remove(csv_in) %% rename

    pc.printWarn("\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~ Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
    pc.printWarn("|\t NO_LINES_IN_INPUT_CSV \t | \t {} \t|".format(NO_LINES_IN_INPUT_CSV))
    pc.printWarn("|\t NO_LINES_IN_OUTPUT_CSV \t | \t {} \t|".format(NO_LINES_IN_OUTPUT_CSV))
    pc.printWarn("|\t NO_LINES_IN_OUTPUT_WITHOUT_TITLE \t | \t {} \t|".format(NO_LINES_IN_OUTPUT_WITHOUT_TITLE))
    pc.printWarn("|\t NO_LINES_IN_OUTPUT_WITHOUT_URL \t | \t {} \t|".format(NO_LINES_IN_OUTPUT_WITHOUT_URL))
    pc.printWarn("|\t NO_LINES_IN_OUTPUT_WITHOUT_CONTENT \t | \t {} \t|".format(NO_LINES_IN_OUTPUT_WITHOUT_CONTENT))
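# A minimal end-to-end driver for the async path, assuming file names shaped like the
# ones RunSync() uses (the exact paths here are only illustrative):
if __name__ == "__main__":
    ts = time.time()
    csv_in = 'dbs/wc-db/wc_table_' + str(int(ts)) + '.csv'
    csv_out = 'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'
    asyncio.get_event_loop().run_until_complete(asyncFetchAll(csv_in, csv_out))
    cleanNcheckAsyncOutput(csv_in, csv_out)   # writes the final file at os.path.join("F", csv_out)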
def run(ts):
    """
    Gets top submissions of the listed subreddits for the past week
    (limit = gw.R_ITEM_LIMIT_PER_SUBREDDIT; the API max of 1000 should be enough).
    `ts` is only used for naming the table & stamping the processing date.
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc('@[{}] >>>>>> Started r-scraper ................... => TABLE: {}\n'
                 .format(datetime.fromtimestamp(ts), wc_table))
    pc.printMsg("\t -------------------------------------- < r_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n")

    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()

    blob_pages = ['.jpg', '.png', '.gif', '.mp3', '.mp4']   # these give blob data; no point in scraping them
    index = gw.WC_TOTAL_URL_ENTRIES + 1

    # Setup client
    reddit = praw.Reddit(
        client_id=vault.R_CLIENT_ID,          # PERSONAL_USE_SCRIPT_14_CHARS
        client_secret=vault.R_CLIENT_SECRET,  # SECRET_KEY_27_CHARS
        user_agent=vault.R_USER_AGENT,        # YOUR_APP_NAME
        username=vault.R_USERNAME,            # YOUR_REDDIT_USER_NAME
        password=vault.R_PASSWORD)            # YOUR_REDDIT_LOGIN_PASSWORD

    for subreddit, tag_arr in LIST.items():
        try:
            pc.printWarn("\t ............ Subreddit@R_UrlScraping : {} .............".format(subreddit))
            sr = reddit.subreddit(subreddit)
            # for submission in sr.top('day', limit=10):   # for testing
            ENTRIES_IN_THIS_SUBREDDIT = 0
            for submission in sr.top('week', limit=gw.R_ITEM_LIMIT_PER_SUBREDDIT):   # NOTE: max limit is 1000
                content = ''

                """ Fixing permalink-type urls """
                url = submission.url
                if url[:2] == '/r':
                    url = "https://www.reddit.com" + url

                # Check1: skip posts locked by the mods
                if submission.locked == False:
                    # Check2: if the post is just an image/video, discard it
                    if submission.url[-4:] not in blob_pages:   # reddit currently hosts .png & .gif only
                        # If permalink is a substring of url OR the submission is a selfpost (text-only)
                        # => nothing external to scrape, take the selftext.
                        # NOTE: there might be links in a post with some description + a link to another
                        # article the author is referring to, but it's not worth the processing time.
                        if (submission.permalink in submission.url) or (submission.is_self == True):
                            content = submission.selftext
                        entry = [index, "r/" + subreddit, datetime.fromtimestamp(ts).date(), int(ts),
                                 date_conversion.RedditDate(str(datetime.fromtimestamp(submission.created))),
                                 submission.title, url, json.dumps(tag_arr), '',
                                 submission.score, submission.num_comments, '', '',
                                 text_actions.clean_text(content)]
                        c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                        index += 1
                        ENTRIES_IN_THIS_SUBREDDIT += 1
            gw.R_TOTAL_ITEMS_GOT_YET += ENTRIES_IN_THIS_SUBREDDIT
            pc.printMsg("\t\t\t\t\t ====> ENTRIES_IN_THIS_SUBREDDIT = {} \t\t | \t gw.R_TOTAL_ITEMS_GOT_YET = {}"
                        .format(ENTRIES_IN_THIS_SUBREDDIT, gw.R_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                        .format(index, e))
            logging.error(traceback.format_exc())

    endTime = time.time()
    gw.WC_TOTAL_URL_ENTRIES += gw.R_TOTAL_ITEMS_GOT_YET
    conn.commit()
    conn.close()
    pc.printMsg("\t -------------------------------------- < r_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n")
    pc.printSucc("\n\n***************************** Reddit Url Scraping is Complete. TABLE: {} ******************"
                 .format(wc_table))
    print("\n\n")

    table = PrettyTable(['Entity (Post r URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by r', gw.R_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['TIME TAKEN FOR URL SCRAPING-r (min) ', round((endTime - startTime) / 60, 2)])
    pc.printSucc(table)
    print("\n\n")
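# `LIST` (iterated above) maps each subreddit name to its tag array. Its real
# contents live elsewhere in the repo; the entries below are purely illustrative.
LIST = {
    "programming": ["prog_news"],
    "MachineLearning": ["ml"],
    "startups": ["startup"],
}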
def run(ts):
    """
    Scrapes Algolia's HN api for the last 7 days & puts the data in WC-DB.
        * max number of entries in a single algolia api call = 1000, so scrape one day at a time
        * Link to documentation: https://hn.algolia.com/api
    Note:
        1. For AskHN entries put `prog_query` tag & a separate threshold
        2. For ShowHN entries put `sideproj` tag & a separate threshold
        3. For Jobs@HN entries put `` tag => later, as these entries don't have upvotes/comments
    Input: ts (format: 1598692058.887741)
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc('@[{}] >>>>>> Started HN-scraper ................... => TABLE: {}\n'
                 .format(datetime.fromtimestamp(ts), wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n")
    startTime = time.time()

    """
    Here is how you add a day to `ts`:
        from datetime import datetime, timedelta
        newts = datetime.fromtimestamp(ts) + timedelta(days=1)   # 2020-08-30 16:02:34.352094
        newts.timestamp()                                        # 1598783633.284871
        datetime.fromtimestamp(ts)                               # 2020-08-29 17:15:32
    """

    """
    ts_arr holds the last 7 days' (including today's) integer timestamp strings.
    TIP: use `datetime.fromtimestamp(int(t))` to convert to human readable format
    """
    ts_arr = [str(int(ts))]
    for i in range(6):
        new_ts = datetime.fromtimestamp(int(ts_arr[-1])) + timedelta(days=-1)
        new_ts = new_ts.timestamp()
        ts_arr.append(str(int(new_ts)))

    index = gw.WC_TOTAL_URL_ENTRIES + 1

    for i in range(len(ts_arr) - 1):
        startepoch = ts_arr[i]
        endepoch = ts_arr[i + 1]
        pc.printMsg(" ................. scraping for interval: start= {} -> end = {} .................\n"
                    .format(startepoch, endepoch))

        """
        Getting stories (articles) with upvote count > upvote threshold.
        Also includes:
            1. TellHN   (<tech_discuss>)
            2. LaunchHN (<startup>)
        """
        pc.printWarn(" \t............. scraping stories .............")
        try:
            url_story = ('http://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=9999'
                         '&numericFilters=created_at_i>' + str(endepoch)
                         + ',created_at_i<' + str(startepoch)
                         + ',points>' + str(gw.HN_STORY_UPVOTE_TH))
            data = web_requests.hitGetWithRetry(url_story)
            res_size = json.loads(data.content)["nbHits"]
            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]
            for item in items_arr:
                url = 'https://news.ycombinator.com/item?id=' + str(item["objectID"])
                sourceTag = ''
                content = ''
                sourceSite = 'HN'
                if item["url"] is None:   # not every story has an outbound url
                    if item["story_text"] is not None:
                        content = text_actions.getTextFromHtml(item["story_text"])
                    if "Launch HN:" in item["title"]:   # 1. LaunchHN
                        sourceTag = 'startup'
                        sourceSite += '/launch'
                    if "Tell HN:" in item["title"]:     # 2. TellHN
                        sourceTag = 'tech_discuss'
                        sourceSite += '/tell'
                else:
                    url = item["url"]
                entry = [index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts),
                         date_conversion.HNDate(str(item["created_at"])), item["title"], url,
                         sourceTag, '', item["points"], item["num_comments"], '', '',
                         text_actions.clean_text(content)]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index = index + 1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@HN_UrlScraping(stories) xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                        .format(index, e))
            logging.error(traceback.format_exc())

        """ Getting ShowHNs """
        pc.printWarn("\t............. scraping showHNs .............")
        try:
            url_show = ('http://hn.algolia.com/api/v1/search_by_date?tags=show_hn&hitsPerPage=9999'
                        '&numericFilters=created_at_i>' + str(endepoch)
                        + ',created_at_i<' + str(startepoch)
                        + ',points>' + str(gw.HN_SHOWHN_UPVOTE_TH))
            data = web_requests.hitGetWithRetry(url_show)
            res_size = json.loads(data.content)["nbHits"]
            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]
            for item in items_arr:
                content = ''
                sourceSite = 'HN/show'
                if item["url"] is None:   # not every ShowHN has an outbound url
                    url = 'https://news.ycombinator.com/item?id=' + str(item["objectID"])
                    if item["story_text"] is not None:
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"]
                entry = [index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts),
                         date_conversion.HNDate(str(item["created_at"])), item["title"], url,
                         'sideproj', '', item["points"], item["num_comments"], '', '',
                         text_actions.clean_text(content)]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index = index + 1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@HN_UrlScraping(showHN) xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                        .format(index, e))
            logging.error(traceback.format_exc())

        """ Getting AskHNs """
        pc.printWarn("\t............. scraping askHNs .............")
        try:
            url_ask = ('http://hn.algolia.com/api/v1/search_by_date?tags=ask_hn&hitsPerPage=9999'
                       '&numericFilters=created_at_i>' + str(endepoch)
                       + ',created_at_i<' + str(startepoch)
                       + ',points>' + str(gw.HN_ASKHN_UPVOTE_TH))
            data = web_requests.hitGetWithRetry(url_ask)
            res_size = json.loads(data.content)["nbHits"]
            pc.printWarn("\t\t\t\t====> Item count: {}".format(res_size))
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]
            for item in items_arr:
                content = ''
                sourceSite = 'HN/ask'
                if item["url"] is None:   # AskHNs usually don't have an outbound url
                    url = 'https://news.ycombinator.com/item?id=' + str(item["objectID"])
                    if item["story_text"] is not None:
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"]
                entry = [index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts),
                         date_conversion.HNDate(str(item["created_at"])), item["title"], url,
                         'prog_query', '', item["points"], item["num_comments"], '', '',
                         text_actions.clean_text(content)]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index = index + 1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@HN_UrlScraping(askHN) xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                        .format(index, e))
            logging.error(traceback.format_exc())

    endTime = time.time()
    conn.commit()
    conn.close()
    gw.WC_TOTAL_URL_ENTRIES += gw.HN_TOTAL_ITEMS_GOT_YET
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n")
    pc.printSucc("\n\n***************************** HN Url Scraping is Complete. TABLE: {} ******************"
                 .format(wc_table))
    print("\n\n")

    table = PrettyTable(['Entity (Post HN URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by HN', gw.HN_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['TIME TAKEN FOR URL SCRAPING-HN (sec) ', round((endTime - startTime), 5)])
    pc.printSucc(table)
    print("\n\n")
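# Quick standalone sanity check of the kind of search_by_date query the scraper
# builds above (the points>100 threshold here is only an illustrative value, not
# the real gw.HN_STORY_UPVOTE_TH setting). Requires the `requests` package.
import requests

def peek_hn_stories():
    end = int(time.time()) - 24 * 3600      # one day ago
    start = int(time.time())                # now
    url = ('http://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=1000'
           '&numericFilters=created_at_i>{},created_at_i<{},points>100'.format(end, start))
    resp = requests.get(url, timeout=30)
    body = resp.json()
    print("nbHits:", body["nbHits"])
    for item in body["hits"][:5]:
        print(item["points"], "-", item["title"])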
async def fetchWithRetry(row, session):
    """
    Hits the url (with retries):
        * if status == 200: returns the row with (raw) Content & (raw) WeightedContent filled in
        * if still unable to hit after retries: the row is returned unchanged and the caller
          writes it anyway (Content = Title, WeightedContent = Title)
    INPUT: `row` is a tuple with indices:
        ID(0), SourceSite(1), ProcessingDate(2), ProcessingEpoch(3), CreationDate(4), Title(5), Url(6),
        SourceTags(7), ModelTags(8), NumUpvotes(9), NumComments(10), PopI(11), WeightedContent(12), Content(13)
    """
    status = 400
    retry_cnt = 2
    sleep_time = 5
    # TIMEOUT = ClientTimeout(total=20)
    TIMEOUT = 20

    while retry_cnt > 0 and status != 200:
        async with session.get(row[6],
                               ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            res = text_actions.clean_text(str(res))
            # res = res.encode('utf8', 'ignore').decode('utf8', 'ignore')   # FIXME: not working
            status = response.status
            if status == 200 and len(res) != 0:
                pc.printSucc("\t\t <ID = {}><src= {} > ============== #Scraped ....... \t NOW: {}"
                             .format(row[0], row[1], time.strftime("%H:%M:%S", time.localtime())))
                # Tuples are immutable, so rebuild the row with the scraped fields filled in.
                row_list = list(row)
                row_list[12] = text_actions.weightedcontentfromhtml(res)
                row_list[13] = text_actions.contentfromhtml(res)
                row = tuple(row_list)
                if len(row[13]) == 0:
                    global ERR_ASYNC_NO_CONTENT_IN_SCRAPING
                    ERR_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr("\t\t xxxxxxxxxxxxxxxxxxx SKIPPING for <ID = {}><src= {} > As No Content even after scraping xxxxxxxxxxxxxxxxxxxxxxxx\n"
                                .format(row[0], row[1]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn("\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x"
                             .format(row[0], row[1], status, row[6][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr("\t\txxxxx For <ID = {}><src= {} > Totally unable to hit url.... using Title for Content & WeightedContent : {} "
                .format(row[0], row[1], row[6]))
    global ERR_ASYNC_ON_URL_ERROR
    ERR_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(" \t\t\t ============== [Unreachable URL] Will write anyways. <ID = {}><src= {} > =============== "
                .format(row[0], row[1]))
    return row
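# Hypothetical driver for this tuple-row variant: pull rows straight out of the wc
# table (column order assumed from the docstring above) and fetch them concurrently
# behind a semaphore. Table name and semaphore size are illustrative only.
from aiohttp import ClientSession, TCPConnector

async def scrapeWcTable(wc_db, wc_table):
    conn = sqlite3.connect(wc_db, timeout=10)
    rows = conn.execute('SELECT * FROM ' + wc_table).fetchall()
    conn.close()

    sem = asyncio.Semaphore(100)   # cap the number of concurrent fetches
    async with ClientSession(headers={'Connection': 'keep-alive'},
                             connector=TCPConnector(limit=0)) as session:

        async def guarded(row):
            async with sem:
                return await fetchWithRetry(row, session)

        return await asyncio.gather(*(guarded(r) for r in rows))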