import asyncio
import csv
import os
import time
from datetime import datetime

from aiohttp import ClientSession, TCPConnector

# Project-local helpers (import paths assumed from the call sites below)
import csv_functions
import text_actions
import web_requests
import pc  # colored console printing

# Module-level run counters, referenced via `global` inside the functions below
ENTRIES_TO_BE_WRITTEN = 0
WRITTEN_ENTRIES_ASYNC_DIRECT = 0
WRITTEN_ENTRIES_ASYNC_SCRAPED = 0
WRITTEN_ENTRIES_SYNC = 0
SKIPPED_SYNC = 0
FAILED_SYNC = 0


async def asyncFetchAll(csv_in, csv_out):
    """
    INPUT: csv_src_file & csv_dest_file (to be written)
    NOTE:
        * Semaphore limit is: 1000
        * Rows that already have Content are written straight through; the rest
          are scraped concurrently, gathered, then written via write_result
    """
    tasks = []
    sem = asyncio.Semaphore(1000)
    global ENTRIES_TO_BE_WRITTEN, WRITTEN_ENTRIES_ASYNC_DIRECT, WRITTEN_ENTRIES_ASYNC_SCRAPED

    """ Initialize the output file """
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
        'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
        'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_out, headers)

    connector = TCPConnector(limit=0)  # limit=0 -> no cap on simultaneous connections
    async with ClientSession(headers={'Connection': 'keep-alive'}, connector=connector) as session:
        with open(csv_in, mode='r') as csvfile:
            csv_reader = csv.DictReader(csvfile)
            for row in csv_reader:
                ENTRIES_TO_BE_WRITTEN += 1
                # CHECK (pre scraping): Content already exists => no scraping, clean it and write it as is
                if len(row["Content"]) != 0:
                    pc.printWarn("\t <ID = {}><src= {} > [NO SCRAPING] Content already exists............... NOW: {}".format(
                        row["ID"], row["SourceSite"], time.strftime("%H:%M:%S", time.localtime())))
                    row["WeightedContent"] = text_actions.clean_text(
                        row["Title"] + row["WeightedContent"]) + text_actions.getUrlString(row["Content"])
                    row["Content"] = text_actions.clean_text(
                        row["Content"]) + text_actions.getUrlString(row["Content"])
                    # Column order must match `headers`: WeightedContent before Content
                    entry = [
                        row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                        row["CreationDate"], row["Title"], row["Url"], row["SourceTags"], row["ModelTags"],
                        row["NumUpvotes"], row["NumComments"], row["PopI"],
                        row["WeightedContent"], row["Content"],
                    ]
                    csv_functions.putToCsv(csv_out, entry)
                    WRITTEN_ENTRIES_ASYNC_DIRECT += 1
                    pc.printMsg(" \t\t ============== Done Writing into csv for <ID = {}><src= {} > ===============".format(
                        row["ID"], row["SourceSite"]))
                # Otherwise scrape, but only if both Url and Title are present
                elif row["Url"] and row["Title"]:
                    task = asyncio.ensure_future(semaphoreSafeFetch(sem, row, session))
                    tasks.append(task)

        responses = await asyncio.gather(*tasks)
        pc.printMsg("\n@@@@@@@@@@ Total items to actually scrape (found w/o Content) = {} @@@@@@@@@@\n".format(
            len(responses)))

        for row in responses:
            if row["Content"] or row["Title"]:
                if len(row["Content"]) == 0:
                    # url was hit successfully but yielded no content -> fall back to the title
                    row["Content"] = row["Title"]
                # Column order must match `headers`: WeightedContent before Content
                entry = [
                    row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                    row["CreationDate"], row["Title"], row["Url"], row["SourceTags"], row["ModelTags"],
                    row["NumUpvotes"], row["NumComments"], row["PopI"],
                    row["WeightedContent"], row["Content"],
                ]
                await write_result(csv_out, entry)
                WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                pc.printMsg(" \t\t ============== Done Writing into csv for <ID = {}><src= {} > ===============".format(
                    row["ID"], row["SourceSite"]))
            else:
                pc.printErr("\t\t xxxxxxxxxx Skipping <ID = {}><src= {} > as it has no Content & no Title xxxxxxxxxx\n".format(
                    row["ID"], row["SourceSite"]))
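
# --- Sketch (assumption): `semaphoreSafeFetch` is referenced above but not defined
# in this section. A minimal version consistent with its call site (sem, row, session)
# and with how asyncFetchAll consumes the returned rows might look like this; the
# real implementation may differ.
async def semaphoreSafeFetch(sem, row, session):
    async with sem:  # cap the number of in-flight requests at the Semaphore's limit
        try:
            async with session.get(row["Url"]) as resp:
                if resp.status == 200:
                    html = await resp.text()
                    content = text_actions.contentfromhtml(html)
                    urlstrings = text_actions.getUrlString(content)
                    row["Content"] = text_actions.clean_text(content) + urlstrings
                    row["WeightedContent"] = text_actions.clean_text(
                        text_actions.weightedcontentfromhtml(html) + row["Title"]) + urlstrings
        except Exception as e:
            pc.printErr("\t xxxxx <ID = {} > fetch failed for url: {} error: {}".format(
                row["ID"], row["Url"], e))
        return row


# --- Sketch (assumption): `write_result` is awaited above but is also not part of
# this section. A minimal version appends one row, serialised through an
# asyncio.Lock so concurrent tasks cannot interleave writes.
_WRITE_LOCK = asyncio.Lock()

async def write_result(csv_out, entry):
    async with _WRITE_LOCK:
        csv_functions.putToCsv(csv_out, entry)
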
def RunSync(ts):
    """
    Picks wc-db's table mapped to `ts` and scrapes (useful) "clean" Content &
    WeightedContent from each row's url.
    NOTE:
        * If Content is already present in the table, it is "cleaned" too and the
          newly scraped content is appended to it.
        * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
    Input: ts (format: 1598692058.887741)
    """
    global WRITTEN_ENTRIES_SYNC, SKIPPED_SYNC, FAILED_SYNC

    csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(int(ts)) + '.csv'
    csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_sync.csv'
    pc.printMsg('@[{}] >>>>>> Started Content-scraper(SYNC) ................... => FILENAME: {}\n'.format(
        datetime.fromtimestamp(ts), csv_dest_file))

    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
        'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
        'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_dest_file, headers)  # flushes any old file and writes the header row

    with open(csv_src_file, mode='r') as csvfile, open(csv_dest_file, 'a', newline='') as outfile:
        csv_reader = csv.DictReader(csvfile)
        writer = csv.writer(outfile)
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Headers are {", ".join(row)}')
                line_count += 1
            # CHECK1 (pre scraping): if (Content != NULL) => no scraping, clean it and put it in as is
            if len(row["Content"]) != 0:
                pc.printWarn("\t <ID = {} > [NO SCRAPING] Content already exists....putting as it is............. NOW: {}".format(
                    row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                entry = [
                    row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                    row["CreationDate"], row["Title"], row["Url"], row["SourceTags"], row["ModelTags"],
                    row["NumUpvotes"], row["NumComments"], row["PopI"],
                    text_actions.clean_text(row["Title"] + row["WeightedContent"])
                    + text_actions.getUrlString(row["Content"]),  # add the url-words too
                    text_actions.clean_text(row["Content"]) + text_actions.getUrlString(row["Content"]),
                ]
                WRITTEN_ENTRIES_SYNC += 1
                writer.writerow(entry)
            # CHECK2 (pre scraping): if (Url == NULL) => discard
            # CHECK3 (pre scraping): if (Title == NULL) => discard
            elif len(row["Url"]) != 0 and len(row["Title"]) != 0:
                pc.printWarn("\t <ID = {} > [SCRAPING BEGIN] sleeping for 0.0001 second ZZZZZZzzzzzz....... NOW: {}".format(
                    row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                time.sleep(0.0001)
                try:
                    response = web_requests.hitGetWithRetry(row["Url"], '', False, 2, 0.5, 60)
                    if response != -1:  # -1 signals that all retries failed
                        content = text_actions.contentfromhtml(response.text)
                        urlstrings = text_actions.getUrlString(content)
                        content += urlstrings  # add the url-words too
                        weightedcontent = text_actions.weightedcontentfromhtml(response.text) + row["Title"] + urlstrings
                        line_count += 1
                        # CHECK1 (post scraping): if (content == NULL) — Title was already
                        # checked above — fall back to the Title for both columns
                        if len(content) == 0:
                            content = row["Title"]
                            weightedcontent = row["Title"]
                        entry = [
                            row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                            row["CreationDate"], row["Title"], row["Url"], row["SourceTags"], row["ModelTags"],
                            row["NumUpvotes"], row["NumComments"], row["PopI"],
                            text_actions.clean_text(weightedcontent),
                            text_actions.clean_text(content),
                        ]
                        writer.writerow(entry)
                        pc.printMsg("\t\t <ID = {} > ============== Scraping Done....... \t NOW: {}".format(
                            row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                    else:
                        SKIPPED_SYNC += 1
                        pc.printErr("\t\txxxxx SKIPPING... for ID: {} Unable to hit url: {}".format(
                            row["ID"], row["Url"]))
                except Exception as e:
                    FAILED_SYNC += 1
                    pc.printErr("\t======= XXXXXXXX ERROR XXXXXX ======>> ID= {} NOW = {} Skipping...Failed due to: \n \t\t ERROR {}".format(
                        row["ID"], time.strftime("%H:%M:%S", time.localtime()), e))

    pc.printMsg("\n****************** Content Scraping is Complete , FILENAME: {} ********************\n".format(
        csv_dest_file))
    pc.printMsg("\n----------------------------------------------------------------------------------\n")
    pc.printMsg("|\tWRITTEN_ENTRIES_SYNC \t | \t {} \t|".format(WRITTEN_ENTRIES_SYNC))
    pc.printMsg("|\tSKIPPED_SYNC \t | \t {} \t|".format(SKIPPED_SYNC))
    pc.printMsg("|\tFAILED_SYNC \t | \t {} \t|".format(FAILED_SYNC))
    pc.printMsg("\n----------------------------------------------------------------------------------\n")
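
# Example usage (assumption, not part of the original section): a one-off
# synchronous run keyed by the current epoch; `ts` follows the float format
# shown in RunSync's docstring.
#
#     RunSync(time.time())  # e.g. ts = 1598692058.887741
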
def cleanNcheckAsyncOutput(csv_in, csv_out):
    """
    Analyses the input & created files.
    Also cleans Content & WeightedContent -> puts them in a new file; the old one
    can then be deleted.
    Variables:
        * NO_LINES_IN_INPUT_CSV
        * NO_LINES_IN_OUTPUT_CSV
        * NO_LINES_IN_OUTPUT_WITHOUT_TITLE
        * NO_LINES_IN_OUTPUT_WITHOUT_URL
        * NO_LINES_IN_OUTPUT_WITHOUT_CONTENT
    """
    with open(csv_in, "r") as f:
        reader = csv.reader(f)
        NO_LINES_IN_INPUT_CSV = len(list(reader))  # includes the header row

    """ Now check and create new "cleaned" file """
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
        'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
        'PopI', 'WeightedContent', 'Content'
    ]
    csv_final_out = os.path.join("F", csv_out)  # cleaned copy lands under the "F" directory
    csv_functions.creteCsvFile(csv_final_out, headers)
    pc.prCyan(" ========================== NOW CREATING FINAL OUTPUT FILE: {} ===========================".format(
        csv_final_out))

    NO_LINES_IN_OUTPUT_CSV = 0
    NO_LINES_IN_OUTPUT_WITHOUT_TITLE = 0
    NO_LINES_IN_OUTPUT_WITHOUT_URL = 0
    NO_LINES_IN_OUTPUT_WITHOUT_CONTENT = 0
    with open(csv_out, mode='r') as r, open(csv_final_out, 'a+', newline='') as f:
        reader = csv.DictReader(r)  # DictReader consumes the header row itself
        writer = csv.writer(f)
        for row in reader:
            url_string_content = text_actions.getUrlString(row["Content"])
            content = text_actions.clean_text(row["Content"])
            weighted_content = text_actions.clean_text(row["WeightedContent"])
            entry = [
                row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                row["CreationDate"], row["Title"], row["Url"], row["SourceTags"], row["ModelTags"],
                row["NumUpvotes"], row["NumComments"], row["PopI"],
                weighted_content + url_string_content,
                content,
            ]
            writer.writerow(entry)
            NO_LINES_IN_OUTPUT_CSV += 1
            if len(row["Title"]) == 0:
                NO_LINES_IN_OUTPUT_WITHOUT_TITLE += 1
            if len(row["Url"]) == 0:
                NO_LINES_IN_OUTPUT_WITHOUT_URL += 1
            if len(row["Content"]) == 0:
                NO_LINES_IN_OUTPUT_WITHOUT_CONTENT += 1

    # TODO: os.remove(csv_in) && rename

    pc.printWarn("\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~ Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
    pc.printWarn("|\t NO_LINES_IN_INPUT_CSV \t | \t {} \t|".format(NO_LINES_IN_INPUT_CSV))
    pc.printWarn("|\t NO_LINES_IN_OUTPUT_CSV \t | \t {} \t|".format(NO_LINES_IN_OUTPUT_CSV))
    pc.printWarn("|\t NO_LINES_IN_OUTPUT_WITHOUT_TITLE \t | \t {} \t|".format(NO_LINES_IN_OUTPUT_WITHOUT_TITLE))
    pc.printWarn("|\t NO_LINES_IN_OUTPUT_WITHOUT_URL \t | \t {} \t|".format(NO_LINES_IN_OUTPUT_WITHOUT_URL))
    pc.printWarn("|\t NO_LINES_IN_OUTPUT_WITHOUT_CONTENT \t | \t {} \t|".format(NO_LINES_IN_OUTPUT_WITHOUT_CONTENT))
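
# --- Driver sketch (assumption): the module's real entry point is not shown in this
# section. One plausible wiring of the pieces above: scrape asynchronously, then
# clean & sanity-check the output. The filenames follow the wc-db convention used
# in RunSync (relative-path variant); the "_wc_async" suffix is assumed.
if __name__ == "__main__":
    ts = time.time()
    csv_in = 'dbs/wc-db/wc_table_' + str(int(ts)) + '.csv'
    csv_out = 'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_async.csv'
    asyncio.run(asyncFetchAll(csv_in, csv_out))
    cleanNcheckAsyncOutput(csv_in, csv_out)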