def single(article_id, outputcsv):
    """Write a scraper report for the apps on a single app list.

    The article URL is built from ``article_id`` using a fixed article type
    and a placeholder slug.  The page is fetched (retrying once after an
    HTTP error) and one CSV row is written for every app returned by
    ``getappsfromlist``.  ``published_at`` is dummied as ``'????'`` because
    it is usually obtained from the input file (in main).

    Args:
        article_id: Identifier of the article whose app list is scraped.
        outputcsv: Path of the CSV report to create (opened in 'w' mode).

    Returns:
        None.

    Raises:
        urllib.error.HTTPError: If the page still fails after one retry.
        Exception: Any other error raised while opening the page is
            reported and re-raised unchanged.
    """
    # Positions of the fields read from each item of getappsfromlist().
    APP_LIST_IDX_app_id = 0
    APP_LIST_IDX_itunes_link = 1
    ARTICLE_TYPE = 2

    with open(outputcsv, 'w', newline='', encoding='utf-8') as outfileh:
        writer = csv.writer(outfileh,
                            delimiter=',',
                            quotechar='"',
                            escapechar='~',
                            doublequote=False,
                            quoting=csv.QUOTE_NONNUMERIC)

        # Column headings; they match the values written per app below.
        writer.writerow([
            'article_id', 'article_url', 'published_at', 'app_id',
            'itunes_link'
        ])

        # Work out the app-list URL from the article id alone.
        article_url = articleUrls(ARTICLE_TYPE, article_id, 'dont-know-slug')
        try:
            try:
                article_url, soup = openurlforsoup(article_url)
            except urllib.error.HTTPError:
                # One retry; a second HTTPError falls to the handler below.
                article_url, soup = openurlforsoup(article_url)
        except urllib.error.HTTPError as e:
            print("HTTP error opening article_id {}: {}".format(
                article_id, e.reason))
            raise
        except Exception as e:
            # The original `except e:` referenced an unbound name and turned
            # every non-HTTP failure into a NameError.  Not every exception
            # has `.reason`, so fall back to the exception itself.
            print("General error opening article id {}: {}".format(
                article_id, getattr(e, 'reason', e)))
            raise

        published_at = '????'
        for app in getappsfromlist(soup):
            writer.writerow([
                article_id, article_url, published_at,
                app[APP_LIST_IDX_app_id], app[APP_LIST_IDX_itunes_link]
            ])

    print(str(article_url))
    return None
def single(article_id, outputcsv):
    """Write a scraper report for the apps on a single app list.

    The article URL is built from ``article_id`` using a fixed article type
    and a placeholder slug.  The page is fetched (retrying once after an
    HTTP error) and one CSV row is written for every app returned by
    ``getappsfromlist``.  ``published_at`` is dummied as ``'????'`` because
    it is usually obtained from the input file (in main).

    Args:
        article_id: Identifier of the article whose app list is scraped.
        outputcsv: Path of the CSV report to create (opened in 'w' mode).

    Returns:
        None.

    Raises:
        urllib.error.HTTPError: If the page still fails after one retry.
        Exception: Any other error raised while opening the page is
            reported and re-raised unchanged.
    """
    # Positions of the fields read from each item of getappsfromlist().
    APP_LIST_IDX_app_id = 0
    APP_LIST_IDX_itunes_link = 1
    ARTICLE_TYPE = 2

    with open(outputcsv, 'w', newline='', encoding='utf-8') as outfileh:
        writer = csv.writer(outfileh, delimiter=',', quotechar='"',
                            escapechar='~', doublequote=False,
                            quoting=csv.QUOTE_NONNUMERIC)

        # Column headings; they match the values written per app below.
        writer.writerow(['article_id', 'article_url', 'published_at',
                         'app_id', 'itunes_link'])

        # Work out the app-list URL from the article id alone.
        article_url = articleUrls(ARTICLE_TYPE, article_id, 'dont-know-slug')
        try:
            try:
                article_url, soup = openurlforsoup(article_url)
            except urllib.error.HTTPError:
                # One retry; a second HTTPError falls to the handler below.
                article_url, soup = openurlforsoup(article_url)
        except urllib.error.HTTPError as e:
            print("HTTP error opening article_id {}: {}".format(article_id, e.reason))
            raise
        except Exception as e:
            # The original `except e:` referenced an unbound name and turned
            # every non-HTTP failure into a NameError.  Not every exception
            # has `.reason`, so fall back to the exception itself.
            print("General error opening article id {}: {}".format(
                article_id, getattr(e, 'reason', e)))
            raise

        published_at = '????'
        for app in getappsfromlist(soup):
            writer.writerow([article_id, article_url, published_at,
                             app[APP_LIST_IDX_app_id],
                             app[APP_LIST_IDX_itunes_link]])

    print(str(article_url))
    return None
def main(
        inputcsv='D:\\projects\\AppPicker\\reports\\best of lists performance\\ap_article.csv',
        hasheader=True,
        outputcsv='D:\\projects\\AppPicker\\reports\\best of lists performance\\applist_scraper_report.csv'
):
    """Scrape every article listed in ``inputcsv`` and report its apps.

    Each input row is read as ``article_id, article_type, published_at,
    slug``.  For every row the article page is fetched (retrying once after
    an HTTP error), the publication date is taken from the first
    ``<p class="article-date small">`` element, and one output row is
    written per app returned by ``getappsfromlist``.  If the page still
    fails with an HTTP error after the retry, a single error row is written
    for that article and processing continues with the next one.

    Args:
        inputcsv: Path of the CSV listing the articles to scrape.
        hasheader: When true, the first input row is skipped as a header.
        outputcsv: Path of the CSV report to create (opened in 'w' mode).

    Returns:
        None.

    Raises:
        Exception: Any non-HTTP error raised while opening a page is
            reported and re-raised unchanged.
    """
    # Positions of the fields read from each item of getappsfromlist().
    APP_LIST_IDX_app_id = 0
    APP_LIST_IDX_itunes_link = 1
    date_class = 'article-date small'

    with open(outputcsv, 'w', newline='', encoding='utf-8') as outfileh:
        writer = csv.writer(outfileh,
                            delimiter=',',
                            quotechar='"',
                            escapechar='~',
                            doublequote=False,
                            quoting=csv.QUOTE_NONNUMERIC)

        # Column headings; they match the values written per app below.
        writer.writerow([
            'article_id', 'article_url', 'published_at', 'app_id',
            'itunes_link'
        ])

        # newline='' is what the csv module documents for files handed to
        # csv.reader, so embedded/foreign line endings are parsed correctly.
        with open(inputcsv, newline='', encoding='utf-8') as inputfileh:
            reader = csv.DictReader(inputfileh,
                                    fieldnames=('article_id', 'article_type',
                                                'published_at', 'slug'),
                                    delimiter=',',
                                    quotechar='"')
            if hasheader:
                # Default avoids StopIteration on a completely empty file.
                next(reader, None)

            # enumerate keeps the progress count correct even for rows that
            # end in `continue` (the manual counter was skipped for those).
            for i, row in enumerate(reader, start=1):
                if i % 10 == 0:
                    print('Record: {}'.format(i))

                article_id = row['article_id']
                article_url = articleUrls(row['article_type'], article_id,
                                          row['slug'])
                print(str(article_url))

                try:
                    try:
                        article_url, soup = openurlforsoup(str(article_url))
                    except urllib.error.HTTPError:
                        print("Error while trying to open article_url: {0}".
                              format(article_url))
                        # One retry; a second HTTPError is handled below.
                        article_url, soup = openurlforsoup(article_url)
                except urllib.error.HTTPError as e:
                    print("HTTP error opening article_id {}: {}".format(
                        article_id, e.reason))
                    writer.writerow([
                        article_id, article_url,
                        "Error opening article: {0}".format(e.reason), "-",
                        "-"
                    ])
                    continue
                except Exception as e:
                    # The original `except e:` referenced an unbound name and
                    # turned non-HTTP failures into a NameError.  Not every
                    # exception has `.reason`, so fall back to the exception.
                    print("General error opening article id {}: {}".format(
                        article_id, getattr(e, 'reason', e)))
                    raise

                # published_at is scraped from the page, not taken from the
                # input row.
                date_tags = soup.findAll('p', class_=date_class)
                if not date_tags:
                    print(" This page's HTML does not contain class '{}'".
                          format(date_class))
                    # Was two adjacent literals that concatenated to
                    # "Couldnt ..." (missing apostrophe).
                    published_at = "Couldn't find date on page"
                else:
                    # Text looks like '21 Jun 2016, by\xa0Cherry Mae
                    # Torrevillas'; keep only the part before the first comma.
                    published_at = date_tags[0].getText().split(',', 1)[0]

                for app in getappsfromlist(soup):
                    writer.writerow([
                        article_id, article_url, published_at,
                        app[APP_LIST_IDX_app_id],
                        app[APP_LIST_IDX_itunes_link]
                    ])

    # Both files are closed by their `with` blocks.
    return None
def main(
        inputcsv='D:\\projects\\AppPicker\\reports\\best of lists performance\\ap_article.csv',
        outputcsv='D:\\projects\\AppPicker\\reports\\best of lists performance\\article_visits_up.csv',
        start_date_str='2016-01-01',
        end_date_str='2016-01-31',
        trustslugs=True):
    """Generate Google Analytics report for articles listed in the input CSV.

    Each input row is read as ``article_id, article_type, published_at,
    slug``; the first row is always skipped as a header.  For every article
    the analytics broker is queried for the period between the start and
    end dates and one row of metrics is written to ``outputcsv``.

    If trustslugs = False, then do http queries of appPicker for the real
    URLs instead of constructing URLs from the slugs in the input CSV.  An
    article whose real URL cannot be loaded gets an error row with 'NULL'
    metrics and is otherwise skipped.

    Args:
        inputcsv: Path of the CSV listing the articles.
        outputcsv: Path of the CSV report to create (opened in 'w' mode).
        start_date_str: Start date passed to the broker as ``start_date``.
        end_date_str: End date passed to the broker as ``end_date``.
        trustslugs: Whether to build URLs from the slugs in the input.

    Returns:
        None.

    Raises:
        googleapiclient.errors.HttpError: If the analytics query fails
            again after the single delayed retry.
    """
    ga_metrics = (
        'ga:sessions,ga:pageviews,ga:users,ga:newUsers,ga:bounces,'
        'ga:avgTimeOnPage,ga:sessionsPerUser,ga:avgPageLoadTime,'
        'ga:avgSessionDuration,ga:bounceRate')
    broker = pageanalytics.Broker()

    def fetch_results(page_path):
        # Single place for the query so the retry cannot drift from it.
        return broker.get_results(pagePath=page_path,
                                  start_date=start_date_str,
                                  end_date=end_date_str,
                                  metrics=ga_metrics)

    with open(outputcsv, 'w', newline='', encoding='utf-8') as outfileh:
        writer = csv.writer(outfileh,
                            delimiter=',',
                            quotechar='"',
                            escapechar='~',
                            doublequote=False,
                            quoting=csv.QUOTE_NONNUMERIC)

        # These headings depend on the metrics requested from the broker and
        # match the values written at the end of the loop.
        writer.writerow([
            'article_id', 'published_at', 'page_url', 'users', 'new_users',
            'sessions', 'bounces', 'bounce_rate', 'avg_session', 'page_views',
            'avg_time_on_page', 'avg_page_load_secs', 'sessions_per_user'
        ])

        # newline='' is what the csv module documents for files handed to
        # csv.reader.
        with open(inputcsv, newline='', encoding='utf-8') as inputfileh:
            reader = csv.DictReader(inputfileh,
                                    fieldnames=('article_id', 'article_type',
                                                'published_at', 'slug'),
                                    delimiter=',',
                                    quotechar='"')
            # Skip the header row; the default avoids StopIteration on an
            # empty file.
            next(reader, None)

            # Counts rows written with metrics (error rows are not counted,
            # as before).  The old `i = 1` was immediately overwritten by the
            # header-skipping `for i in range(1)` loop, so it started at 0.
            processed = 0
            for row in reader:
                article_id = row['article_id']
                article_type = row['article_type']
                published_at = row.get('published_at', 'not provided')
                slug = row.get('slug', 'blah')

                if trustslugs:
                    article_url = articleUrls(article_type, article_id, slug)
                else:
                    try:
                        article_url = articleUrls(article_type, article_id,
                                                  'blah').realurl()
                    except ArticleLoadError as e:
                        writer.writerow(
                            [article_id, published_at,
                             'Error: {0}'.format(e.customMessage)]
                            + ['NULL'] * 10)
                        print('Could not open URL: {0}'.format(
                            e.customMessage))
                        continue

                # NOTE(review): the original applied .replace('', '') here,
                # which is a no-op.  If a host prefix was meant to be
                # stripped to form the GA pagePath, restore it - confirm.
                page_path = str(article_url)
                print(page_path)

                # Get Google Analytics results for the period between the
                # start and end dates; retry once after a short pause.
                try:
                    garesults = fetch_results(page_path)
                except googleapiclient.errors.HttpError:
                    time.sleep(5)
                    garesults = fetch_results(page_path)

                metrics = google.extract_metrics(garesults)
                writer.writerow([
                    article_id, published_at, article_url,
                    metrics.get('ga:users', 0),
                    metrics.get('ga:newUsers', 0),
                    # Default added: every other metric falls back to 0.
                    metrics.get('ga:sessions', 0),
                    metrics.get('ga:bounces', 0),
                    metrics.get('ga:bounceRate', 0),
                    metrics.get('ga:avgSessionDuration', 0),
                    metrics.get('ga:pageviews', 0),
                    metrics.get('ga:avgTimeOnPage', 0),
                    metrics.get('ga:avgPageLoadTime', 0),
                    metrics.get('ga:sessionsPerUser', 0)
                ])

                processed += 1
                if processed % 10 == 0:
                    print('Record: {}'.format(processed))
                    # Push partial results to disk during long runs.
                    outfileh.flush()

    # Both files are closed by their `with` blocks.
    return None
def main(inputcsv='D:\\projects\\AppPicker\\reports\\best of lists performance\\ap_article.csv',
         hasheader=True,
         outputcsv='D:\\projects\\AppPicker\\reports\\best of lists performance\\applist_scraper_report.csv'):
    """Scrape every article listed in ``inputcsv`` and report its apps.

    Each input row is read as ``article_id, article_type, published_at,
    slug``.  For every row the article page is fetched (retrying once after
    an HTTP error), the publication date is taken from the first
    ``<p class="article-date small">`` element, and one output row is
    written per app returned by ``getappsfromlist``.  If the page still
    fails with an HTTP error after the retry, a single error row is written
    for that article and processing continues with the next one.

    Args:
        inputcsv: Path of the CSV listing the articles to scrape.
        hasheader: When true, the first input row is skipped as a header.
        outputcsv: Path of the CSV report to create (opened in 'w' mode).

    Returns:
        None.

    Raises:
        Exception: Any non-HTTP error raised while opening a page is
            reported and re-raised unchanged.
    """
    # Positions of the fields read from each item of getappsfromlist().
    APP_LIST_IDX_app_id = 0
    APP_LIST_IDX_itunes_link = 1
    date_class = 'article-date small'

    with open(outputcsv, 'w', newline='', encoding='utf-8') as outfileh:
        writer = csv.writer(outfileh, delimiter=',', quotechar='"',
                            escapechar='~', doublequote=False,
                            quoting=csv.QUOTE_NONNUMERIC)

        # Column headings; they match the values written per app below.
        writer.writerow(['article_id', 'article_url', 'published_at',
                         'app_id', 'itunes_link'])

        # newline='' is what the csv module documents for files handed to
        # csv.reader, so embedded/foreign line endings are parsed correctly.
        with open(inputcsv, newline='', encoding='utf-8') as inputfileh:
            reader = csv.DictReader(
                inputfileh,
                fieldnames=('article_id', 'article_type', 'published_at', 'slug'),
                delimiter=',', quotechar='"')
            if hasheader:
                # Default avoids StopIteration on a completely empty file.
                next(reader, None)

            # enumerate keeps the progress count correct even for rows that
            # end in `continue` (the manual counter was skipped for those).
            for i, row in enumerate(reader, start=1):
                if i % 10 == 0:
                    print('Record: {}'.format(i))

                article_id = row['article_id']
                article_url = articleUrls(row['article_type'], article_id, row['slug'])
                print(str(article_url))

                try:
                    try:
                        article_url, soup = openurlforsoup(str(article_url))
                    except urllib.error.HTTPError:
                        print("Error while trying to open article_url: {0}".format(article_url))
                        # One retry; a second HTTPError is handled below.
                        article_url, soup = openurlforsoup(article_url)
                except urllib.error.HTTPError as e:
                    print("HTTP error opening article_id {}: {}".format(article_id, e.reason))
                    writer.writerow([article_id, article_url,
                                     "Error opening article: {0}".format(e.reason),
                                     "-", "-"])
                    continue
                except Exception as e:
                    # The original `except e:` referenced an unbound name and
                    # turned non-HTTP failures into a NameError.  Not every
                    # exception has `.reason`, so fall back to the exception.
                    print("General error opening article id {}: {}".format(
                        article_id, getattr(e, 'reason', e)))
                    raise

                # published_at is scraped from the page, not taken from the
                # input row.
                date_tags = soup.findAll('p', class_=date_class)
                if not date_tags:
                    print(" This page's HTML does not contain class '{}'".format(date_class))
                    # Was 'Couldn''t ...', which Python concatenates to
                    # "Couldnt ..." (missing apostrophe).
                    published_at = "Couldn't find date on page"
                else:
                    # Text looks like '21 Jun 2016, by\xa0Cherry Mae
                    # Torrevillas'; keep only the part before the first comma.
                    published_at = date_tags[0].getText().split(',', 1)[0]

                for app in getappsfromlist(soup):
                    writer.writerow([article_id, article_url, published_at,
                                     app[APP_LIST_IDX_app_id],
                                     app[APP_LIST_IDX_itunes_link]])

    # Both files are closed by their `with` blocks.
    return None
def main(inputcsv='D:\\projects\\AppPicker\\reports\\best of lists performance\\ap_article.csv',
         outputcsv='D:\\projects\\AppPicker\\reports\\best of lists performance\\article_visits_up.csv',
         start_date_str='2016-01-01',
         end_date_str='2016-01-31',
         trustslugs=True):
    """Generate Google Analytics report for articles listed in the input CSV.

    Each input row is read as ``article_id, article_type, published_at,
    slug``; the first row is always skipped as a header.  For every article
    the analytics broker is queried for the period between the start and
    end dates and one row of metrics is written to ``outputcsv``.

    If trustslugs = False, then do http queries of appPicker for the real
    URLs instead of constructing URLs from the slugs in the input CSV.  An
    article whose real URL cannot be loaded gets an error row with 'NULL'
    metrics and is otherwise skipped.

    Args:
        inputcsv: Path of the CSV listing the articles.
        outputcsv: Path of the CSV report to create (opened in 'w' mode).
        start_date_str: Start date passed to the broker as ``start_date``.
        end_date_str: End date passed to the broker as ``end_date``.
        trustslugs: Whether to build URLs from the slugs in the input.

    Returns:
        None.

    Raises:
        googleapiclient.errors.HttpError: If the analytics query fails
            again after the single delayed retry.
    """
    ga_metrics = ('ga:sessions,ga:pageviews,ga:users,ga:newUsers,ga:bounces,'
                  'ga:avgTimeOnPage,ga:sessionsPerUser,ga:avgPageLoadTime,'
                  'ga:avgSessionDuration,ga:bounceRate')
    broker = pageanalytics.Broker()

    def fetch_results(page_path):
        # Single place for the query so the retry cannot drift from it.
        return broker.get_results(pagePath=page_path,
                                  start_date=start_date_str,
                                  end_date=end_date_str,
                                  metrics=ga_metrics)

    with open(outputcsv, 'w', newline='', encoding='utf-8') as outfileh:
        writer = csv.writer(outfileh, delimiter=',', quotechar='"',
                            escapechar='~', doublequote=False,
                            quoting=csv.QUOTE_NONNUMERIC)

        # These headings depend on the metrics requested from the broker and
        # match the values written at the end of the loop.
        writer.writerow(['article_id', 'published_at', 'page_url', 'users',
                         'new_users', 'sessions', 'bounces', 'bounce_rate',
                         'avg_session', 'page_views', 'avg_time_on_page',
                         'avg_page_load_secs', 'sessions_per_user'])

        # newline='' is what the csv module documents for files handed to
        # csv.reader.
        with open(inputcsv, newline='', encoding='utf-8') as inputfileh:
            reader = csv.DictReader(
                inputfileh,
                fieldnames=('article_id', 'article_type', 'published_at', 'slug'),
                delimiter=',', quotechar='"')
            # Skip the header row; the default avoids StopIteration on an
            # empty file.
            next(reader, None)

            # Counts rows written with metrics (error rows are not counted,
            # as before).  The old `i = 1` was immediately overwritten by the
            # header-skipping `for i in range(1)` loop, so it started at 0.
            processed = 0
            for row in reader:
                article_id = row['article_id']
                article_type = row['article_type']
                published_at = row.get('published_at', 'not provided')
                slug = row.get('slug', 'blah')

                if trustslugs:
                    article_url = articleUrls(article_type, article_id, slug)
                else:
                    try:
                        article_url = articleUrls(article_type, article_id, 'blah').realurl()
                    except ArticleLoadError as e:
                        writer.writerow([article_id, published_at,
                                         'Error: {0}'.format(e.customMessage)]
                                        + ['NULL'] * 10)
                        print('Could not open URL: {0}'.format(e.customMessage))
                        continue

                # NOTE(review): the original applied .replace('', '') here,
                # which is a no-op.  If a host prefix was meant to be
                # stripped to form the GA pagePath, restore it - confirm.
                page_path = str(article_url)
                print(page_path)

                # Get Google Analytics results for the period between the
                # start and end dates; retry once after a short pause.
                try:
                    garesults = fetch_results(page_path)
                except googleapiclient.errors.HttpError:
                    time.sleep(5)
                    garesults = fetch_results(page_path)

                metrics = google.extract_metrics(garesults)
                writer.writerow([article_id, published_at, article_url,
                                 metrics.get('ga:users', 0),
                                 metrics.get('ga:newUsers', 0),
                                 # Default added: every other metric falls
                                 # back to 0.
                                 metrics.get('ga:sessions', 0),
                                 metrics.get('ga:bounces', 0),
                                 metrics.get('ga:bounceRate', 0),
                                 metrics.get('ga:avgSessionDuration', 0),
                                 metrics.get('ga:pageviews', 0),
                                 metrics.get('ga:avgTimeOnPage', 0),
                                 metrics.get('ga:avgPageLoadTime', 0),
                                 metrics.get('ga:sessionsPerUser', 0)])

                processed += 1
                if processed % 10 == 0:
                    print('Record: {}'.format(processed))
                    # Push partial results to disk during long runs.
                    outfileh.flush()

    # Both files are closed by their `with` blocks.
    return None