def extract_app(detail_u):
    """Build a record for one application, starting from its detail page URL.

    Two best-effort stages run in order; a failure in either is logged and
    the partially filled record is still returned:

    1. Scrape the detail page (``scrape_detail``) and merge the result.
    2. Find the developer's homepage from the developer page URL, resolve it
       with ``get`` and merge whatever ``scrape_dev_homesite`` extracts.

    Args:
        detail_u: URL of the application's detail page on the market site.

    Returns:
        dict with at least ``'unique_package'`` and ``'developer_page_url'``,
        plus ``'developer_homepage_url'`` and any scraped values when the
        corresponding stage succeeded.

    Raises:
        Whatever ``get_unique_package`` raises; that call is not guarded.
    """
    app = {}
    app['unique_package'] = get_unique_package(detail_u)

    try:
        utilities.merge_dicts_add_values(app, scrape_detail(detail_u))
    except Exception as e:
        sys.stderr.write(
            "! ERROR IN GETTING DATA FROM THE APPLICATION'S DETAIL PAGE ON THE MARKET SITE: %s\n"
            % detail_u
        )
        sys.stderr.write("%s\n" % e)
        traceback.print_exc()
        app['developer_page_url'] = detail_u

    # At this point the record may hold little more than 'developer_page_url'
    # (when the scrape above failed). A scrape that succeeded without
    # supplying that key used to raise KeyError here, outside any handler;
    # fall back to the detail page URL instead.
    dev_page_u = app.setdefault('developer_page_url', detail_u)

    try:
        dev_homesite_u = get_dev_homesite_url(dev_page_u)
        # The second element of get()'s result is stored as the homepage URL;
        # presumably it is the URL after redirects -- TODO confirm against get().
        resolved_dev_homesite_u = get(dev_homesite_u)[1]
        app['developer_homepage_url'] = resolved_dev_homesite_u
        homesite_data = scrape_dev_homesite(resolved_dev_homesite_u)
        utilities.merge_dicts_add_values(app, homesite_data)
    except Exception as e:
        # dev_homesite_u may not be bound yet if the very first call failed,
        # so the message reports dev_page_u instead.
        # TODO: this continues and returns a record with only a few values.
        # If the failure was while fetching the developer homepage, that page
        # will not have been cached, so a second run will probably succeed
        # when the cause was a temporary server or connectivity issue.
        print("exception part 2 on "+ dev_page_u + " . This may **or may not!** be the url that is having a problem. It could be that the dev homepage is the problem.")
        print(e)
        print(e.args)
        print("those are the e details")

    return app
def scrape_dev_homesite(u, allowable_recursion_depth=1):
    """Collect contact information from a developer's homepage.

    If ``u`` is itself a contact channel (a twitter.com URL, a ``mailto:``
    link or a facebook.com URL) it is returned as the lone contact and
    nothing is fetched. Otherwise emails, contact-page links and Twitter
    handles are gathered from the page, each independently and best-effort:
    a failure is printed and that key is simply left out.

    Args:
        u: Resolved URL of the developer's homepage.
        allowable_recursion_depth: How many levels of discovered contact
            links may be followed and scraped in turn. ``0`` disables
            following links.

    Returns:
        dict that may contain ``'email_contacts'``, ``'contact links'`` and
        ``'twitter_contacts'``.
    """
    # Checked in this order; the first match wins. The dots are escaped so
    # that only the literal host names match, not e.g. "twitterxcom".
    lone_contact_patterns = (
        (r'.*twitter\.com.*', 'twitter_contacts'),
        (r'mailto:.*', 'email_contacts'),
        (r'.*facebook\.com.*', 'contact links'),
    )
    for pattern, key in lone_contact_patterns:
        if re.match(pattern, u, re.IGNORECASE):
            return {key: [u]}

    app = {}

    try:
        app['email_contacts'] = get_emails_in_page(u)
    except Exception as e:
        print("error in getting emails from the developer's site: "+u)
        print(e)

    try:
        app['contact links'] = get_links_to_contact_page(u)
    except Exception as e:
        print("error in getting contact links from the developer's site: "+u)
        print(e)

    try:
        app['twitter_contacts'] = get_twitter_handles(u)
    except Exception as e:
        print("error in getting twitters from the developer's site: "+u)
        print(e)

    if allowable_recursion_depth > 0 and 'contact links' in app:
        # Iterate over a snapshot: the merge below writes into `app`, and it
        # is not visible here whether it extends the existing lists in place.
        # If it does, looping over the live list would also visit the newly
        # merged links and might never terminate.
        for contact_link in list(app['contact links']):
            app_from_contact_page = scrape_dev_homesite(
                contact_link,
                allowable_recursion_depth=allowable_recursion_depth - 1,
            )
            utilities.merge_dicts_add_values(app, app_from_contact_page)

    return app