def scrape_and_process_reviews(country_code, language_code, unique_package_l=None,
                               json_cache_path='new', resolved_urls_cache_path='new',
                               now=None, no_cache=False, database='archive'):
    """Scrape app reviews for one locale and process them into the database.

    Parameters:
        country_code, language_code: locale identifiers passed through to
            inhale_reviews.
        unique_package_l: list of package names to scrape; when falsy, the
            full list is fetched from the 'production' database.
        json_cache_path: 'new' creates a fresh dated directory under
            ../cache/reviews/; any other value is used as-is (a trailing
            '/' is appended if missing).
        resolved_urls_cache_path: same convention as json_cache_path, under
            ../cache/reviews/resolved_urls_.
        now: datetime stamping this scrape; defaults to
            datetime.datetime.now().
        no_cache: when True, skip all cache-directory creation/normalization.
        database: target database name forwarded to inhale_reviews.

    Returns:
        The scrape timestamp in epoch milliseconds, or None if a failure
        occurred before the timestamp was computed.

    Errors are caught at this top level, logged via traceback, and
    swallowed so a batch driver can continue.
    """
    # Initialize up front: the original code could raise NameError on the
    # final `return timestamp` if an exception fired before assignment.
    timestamp = None
    try:
        if not unique_package_l:
            print('getting all unique_package values from db')
            unique_package_l = db.get_all_unique_packages('production')
        print('unique_package_l count: ' + str(len(unique_package_l)))
        if not now:
            now = datetime.datetime.now()
        if not no_cache:
            # 'new' means: build a dated directory name and create it;
            # otherwise just normalize the caller-supplied path.
            if json_cache_path == 'new':
                json_cache_path = utilities.make_new_dated_path('../cache/reviews/json_', '/', now)
                print("json cache in:" + json_cache_path)
                os.mkdir(json_cache_path)
            else:
                if not json_cache_path.endswith('/'):
                    json_cache_path += '/'
            if resolved_urls_cache_path == 'new':
                resolved_urls_cache_path = utilities.make_new_dated_path('../cache/reviews/resolved_urls_', '/', now)
                print("resolved_urls cache in:" + resolved_urls_cache_path)
                os.mkdir(resolved_urls_cache_path)
            else:
                if not resolved_urls_cache_path.endswith('/'):
                    resolved_urls_cache_path += '/'
        print('now:' + str(now))
        # Epoch milliseconds for the scrape moment (local time, matching
        # the original time.mktime-based computation).
        timestamp = int(time.mktime(now.timetuple()) * 1000)
        print('timestamp:' + str(timestamp))
        print('json cache dir: ' + json_cache_path)
        print('resolved_urls_cache dir: ' + resolved_urls_cache_path)
        inhale_reviews(country_code, language_code, unique_package_l,
                       json_cache_path, resolved_urls_cache_path, now,
                       no_cache=no_cache, database=database)
        print('done with inhale_reviews')
    except Exception:
        # Top-level boundary: report and fall through so the caller still
        # gets a return value (None if we never reached the timestamp).
        print('exception broken out to top level of scrape_and_process_reviews')
        traceback.print_exc()
    print('done with scrape')
    return timestamp
scrape_date=datetime.datetime.fromtimestamp(scrape_timestamp/1000) if not html_cache_path.endswith('/'): html_cache_path+='/' if not resolved_urls_cache_path.endswith('/'): resolved_urls_cache_path+='/' else: #online scape - generate paths and times for timestamps print '--online scrape--' print if scrape_timestamp_s=='new': scrape_date=extraction_date scrape_timestamp=(int(time.mktime(scrape_date.timetuple())*1000)) else: print 'PROBLEM: this is an online scrape yet the scrape_timestamp was specified. Bad.' raise Exception('PROBLEM: this is an online scrape yet the scrape_timestamp was specified. Bad.') if html_cache_path=='new': html_cache_path=utilities.make_new_dated_path('../cache/html_','_'+paid_or_free+'/',scrape_date) print "html cache in:"+html_cache_path os.mkdir(html_cache_path) else: print 'PROBLEM: this is an online scrape yet the cache path for html was specified. Bad.' raise Exception('PROBLEM: this is an online scrape yet the cache path for html was specified. Bad.') if resolved_urls_cache_path=='new': resolved_urls_cache_path=utilities.make_new_dated_path('../cache/resolved_urls_','_'+paid_or_free+'/',scrape_date) print "resolved_urls cache in:"+resolved_urls_cache_path os.mkdir(resolved_urls_cache_path) else: print 'PROBLEM: this is an online scrape yet the cache path for resolved urls was specified. Bad.' raise Exception('PROBLEM: this is an online scrape yet the cache path for resolved urls was specified. Bad.') print print 'scrape_date converted to a timestamp: '+str(int(time.mktime(scrape_date.timetuple())*1000)) # int(time.mktime((datetime.datetime(2011,6,27,0,0,1)).timetuple())*1000)