def get_page(in_url, header_type):
    """Fetch a URL and parse the response body with BeautifulSoup.

    Args:
        in_url: URL to request.
        header_type: Forwarded to ``define_headers`` to build the request
            headers.

    Returns:
        A ``(bsObj, base_url)`` tuple on success, where ``bsObj`` is the
        parsed document and ``base_url`` is ``get_base_url(in_url)``.
        Returns ``None`` when the request or the parse fails; the failure
        is logged rather than raised.

    Side effects:
        Calls ``init_logging()`` and rebinds the module-level
        ``http_logger``.
    """
    init_logging()
    global http_logger
    http_logger = logging.getLogger(__name__)

    try:
        # init_tor(header_type)
        req = Request(in_url, data=None, headers=define_headers(header_type))
        html = urlopen(req)
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        http_logger.error("URL: %s - HTTP error: %s ", in_url, e)
        return None
    except URLError as e:
        http_logger.error("URL: %s - Server is not reachable: %s", in_url, e)
        return None
    except http.client.HTTPException as e:
        http_logger.error(e)
        return None

    http_logger.info("Retrieved requested URL: %s", in_url.rstrip())

    # The response is a context manager; closing it releases the connection
    # once BeautifulSoup has consumed the body.
    with html:
        base_url = get_base_url(in_url)
        try:
            bsObj = BeautifulSoup(html, 'lxml')
        except AttributeError as e:
            http_logger.error("Page was not found: %s", e)
            return None

    if bsObj is None:
        # No exception is bound on this path (the original formatted an
        # unbound ``e`` here), so report the URL instead.
        http_logger.info("Page has no data: %s", in_url)
        return None
    return bsObj, base_url
def __init__(self):
    """Constructor for Gmap.

    Initializes logging and reads ``GOOGLE_API_KEY`` and
    ``GOOGLE_SEARCH_METHOD`` from ``google_config.yml`` in this module's
    directory.

    Raises:
        OSError: If the config file cannot be opened.
        KeyError: If a required setting is missing from the config.
    """
    init_logging()
    self.logger = logging.getLogger()

    config_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "google_config.yml")
    with open(config_path, "r") as fh:
        # yaml.load() without an explicit Loader is deprecated since
        # PyYAML 5.1 and raises TypeError on PyYAML 6; safe_load parses
        # plain config data the same way.
        # NOTE(review): confirm the config uses no custom YAML tags.
        settings = yaml.safe_load(fh)

    self.api_key = settings['GOOGLE_API_KEY']
    self.search_type = settings['GOOGLE_SEARCH_METHOD']
def __init__(self):
    """Constructor for Gmap.

    Sets up logging, then loads the API key and the search method from
    the ``google_config.yml`` file that sits beside this module.

    Raises:
        OSError: If the config file cannot be opened.
        KeyError: If ``GOOGLE_API_KEY`` or ``GOOGLE_SEARCH_METHOD`` is
            absent from the config.
    """
    init_logging()
    self.logger = logging.getLogger()

    module_dir = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(module_dir, "google_config.yml"), "r") as fh:
        # A bare yaml.load(fh) is deprecated (PyYAML 5.1+) and fails with
        # TypeError on PyYAML 6 because Loader is required; safe_load is
        # equivalent for plain mappings of scalars.
        # NOTE(review): confirm the config uses no custom YAML tags.
        settings = yaml.safe_load(fh)

    self.api_key = settings['GOOGLE_API_KEY']
    self.search_type = settings['GOOGLE_SEARCH_METHOD']
def __init__(self):
    """Initialize logging, settings, the PhantomJS driver and output fields.

    Reads ``config.yml`` from this module's directory and takes the
    ``toys``, ``site_url``, ``page_url`` and ``depth_limit`` entries from
    it. Also creates a PhantomJS web driver and an ``AZ`` instance.

    Raises:
        OSError: If the config file cannot be opened.
        KeyError: If a required setting is missing from the config.
    """
    init_logging()
    self.logger = logging.getLogger(__name__)
    self.logger.info("Job started and logging enabled")

    config_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "config.yml")
    with open(config_path, "r") as fh:
        # yaml.load() without a Loader is deprecated since PyYAML 5.1 and
        # raises TypeError on PyYAML 6; safe_load handles plain config data.
        # NOTE(review): confirm the config uses no custom YAML tags.
        settings = yaml.safe_load(fh)

    # ``dcap`` (desired capabilities) is defined outside this method.
    self.driver = webdriver.PhantomJS(
        desired_capabilities=dcap,
        service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
    self.driver.set_window_size(1024, 768)

    self.shipping_rate = 0.75  # $rate/lb
    # TODO: shift this to AZ class
    self.outfile = "../data/test.csv"
    self.fieldnames = ('net', 'roi', 'name', 'price', 'az_price', 'weight',
                       'az_sales_rank', 'az_match', 'url', 'img', 'az_url',
                       'az_asin')

    self.url_cats = settings['toys']
    self.site_url = settings['site_url']
    self.page_url = settings['page_url']
    self.base_url = strip_final_slash(get_base_url(self.site_url))
    self.az = AZ()
    self.depth_limit = settings['depth_limit']
def __init__(self, state_name='WY'):
    """Constructor for Dealer search object.

    Args:
        state_name: Value stored on the instance and used as the stem of
            the output ``.tsv`` file name (default ``'WY'``).

    Raises:
        SystemExit: With status 1 if the output directory
            ``../../data/output/`` does not exist.
    """
    init_logging()
    self.logger = logging.getLogger()
    self.logger.info(
        "PlaceFinder Search object initialized and logging enabled...")
    self.gfind = goog.Gmap()
    self.state_name = state_name

    # Relative path: resolved against the current working directory.
    data_dir = '../../data/output/'
    if not os.path.exists(data_dir):
        self.logger.error(
            "Data output directory mis-configured in PlaceFinder.py....exiting..."
        )
        # Exit non-zero: a bare sys.exit() reports success (status 0) to
        # the calling process even though this is an error path.
        sys.exit(1)

    self.outfile = '{data_dir}{state}.tsv'.format(
        data_dir=data_dir, state=self.state_name)
    self.fieldnames = ('name', 'web', 'address', 'city', 'state', 'zip',
                       'phone')
def __init__(self):
    """Initialize logging, the database handle and the Amazon API client.

    Reads ``az_config.yml`` from this module's directory and takes the
    ``db_config``, ``access_key_id``, ``secret_key_id``, ``associate_tag``
    and ``default_weight`` entries from it. The per-lookup ``az_*``
    attributes start out as ``None``.

    Raises:
        OSError: If the config file cannot be opened.
        KeyError: If a required setting is missing from the config.
    """
    init_logging()
    self.logger = logging.getLogger(__name__)
    self.logger.info("Amazon object initialized")

    config_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "az_config.yml")
    with open(config_path, "r") as fh:
        # yaml.load() without a Loader is deprecated since PyYAML 5.1 and
        # raises TypeError on PyYAML 6; safe_load handles plain config data.
        # NOTE(review): confirm the config uses no custom YAML tags.
        settings = yaml.safe_load(fh)

    self.db = Mysql(settings['db_config'])
    self.access_key = settings['access_key_id']
    self.secret_key = settings['secret_key_id']
    self.associate_tag = settings['associate_tag']
    self.default_weight = settings['default_weight']

    # Result fields, all unset until assigned elsewhere.
    self.az_price = None
    self.az_asin = None
    self.az_sales_rank = None
    self.az_url = None
    self.az_match = None

    self.amazon = AmazonAPI(self.access_key, self.secret_key,
                            self.associate_tag)
def __init__(self):
    """Constructor for LaundryFinder.

    Sets up logging and a ``goog.Gmap`` instance, records the input data
    directory, and builds a timestamped output file path under
    ``../data/output``.

    Raises:
        SystemExit: With status 1 if the output directory does not exist.
    """
    init_logging()
    self.logger = logging.getLogger()
    self.logger.info(
        "LaundryFinder Search object initialized and logging enabled...")
    self.gfind = goog.Gmap()

    # Relative paths: resolved against the current working directory.
    out_data_dir = '../data/output'
    in_data_dir = '../data/input'
    self.in_data = in_data_dir

    if not os.path.exists(out_data_dir):
        self.logger.error(
            "Data output directory mis-configured in LaundryFinder.py....exiting..."
        )
        # Exit non-zero: a bare sys.exit() reports success (status 0) to
        # the calling process even though this is an error path.
        sys.exit(1)

    # Timestamp in the file name gives each run its own output file.
    self.outfile = '{data_dir}/laundrysearch-{now}.tsv'.format(
        data_dir=out_data_dir,
        now=datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
    self.fieldnames = ('target-zip', 'name', 'score', 'address', 'plid',
                       'id')
def __init__(self, state_name='WY'):
    """Constructor for Dealer search object.

    Args:
        state_name: Stored on the instance and used as the base name of
            the output ``.tsv`` file (default ``'WY'``).

    Raises:
        SystemExit: With status 1 when ``../../data/output/`` is missing.
    """
    init_logging()
    self.logger = logging.getLogger()
    self.logger.info("PlaceFinder Search object initialized and logging enabled...")
    self.gfind = goog.Gmap()
    self.state_name = state_name

    # Relative to the current working directory, not to this module.
    data_dir = '../../data/output/'
    if not os.path.exists(data_dir):
        self.logger.error("Data output directory mis-configured in PlaceFinder.py....exiting...")
        # sys.exit() with no argument exits with status 0, which signals
        # success; use a non-zero status for this failure.
        sys.exit(1)

    self.outfile = '{data_dir}{state}.tsv'.format(data_dir=data_dir,
                                                  state=self.state_name)
    self.fieldnames = (
        'name', 'web', 'address', 'city', 'state', 'zip', 'phone')
def __init__(self):
    """Initialize logging, settings, the PhantomJS driver and output fields.

    Reads ``wiki_config.yml`` from this module's directory and takes the
    ``output``, ``depth_limit``, ``debug``, ``reuse`` and ``top_url``
    entries from it. Also creates a PhantomJS web driver.

    Raises:
        OSError: If the config file cannot be opened.
        KeyError: If a required setting is missing from the config.
    """
    init_logging(default_path='../loggerUtils/logging.yml')
    self.logger = logging.getLogger(__name__)
    self.logger.info("Wiki Geo object initialized and logging enabled")

    config_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "wiki_config.yml")
    with open(config_path, "r") as fh:
        # yaml.load() without a Loader is deprecated since PyYAML 5.1 and
        # raises TypeError on PyYAML 6; safe_load handles plain config data.
        # NOTE(review): confirm the config uses no custom YAML tags.
        settings = yaml.safe_load(fh)

    # ``dcap`` (desired capabilities) is defined outside this method.
    self.driver = webdriver.PhantomJS(
        desired_capabilities=dcap,
        service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
    self.driver.set_window_size(1024, 768)

    self.outfile = settings['output']
    self.depth_limit = settings['depth_limit']
    self.debug = settings['debug']
    self.reuse = settings['reuse']
    self.fieldnames = ('FIPS', 'GNIS', 'area-codes', 'county', 'county-url',
                       'density-2010-sqkm', 'density-2010-sqmi',
                       'elevation-ft', 'elevation-m', 'geohack-url',
                       'land-area', 'lat', 'location-img', 'census-map',
                       'long', 'place-name', 'place-type', 'place-url',
                       'place-www', 'pop-2010', 'pop-estimate', 'state',
                       'state-url', 'total-area', 'water-area', 'zips')
    self.top_url = settings['top_url']
    self.base_url = strip_final_slash(get_base_url(self.top_url))