Example #1
0
def get_page(in_url, header_type):
    """Fetch ``in_url`` and parse the response with BeautifulSoup.

    Args:
        in_url: URL string to request.
        header_type: Passed straight to ``define_headers`` to build the
            request headers.

    Returns:
        A ``(soup, base_url)`` tuple on success, where ``base_url`` comes from
        ``get_base_url(in_url)``. Returns ``None`` (after logging) when the
        request fails or the response cannot be parsed.
    """
    init_logging()
    global http_logger
    http_logger = logging.getLogger(__name__)
    try:
        # init_tor(header_type)
        req = Request(in_url, data=None, headers=define_headers(header_type))
        html = urlopen(req)
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be caught first.
        http_logger.error("URL: %s - HTTP error: %s " % (in_url, e))
        return None
    except URLError as e:
        http_logger.error("URL: %s - Server is not reachable: %s" % (in_url, e))
        return None
    except http.client.HTTPException as e:
        http_logger.error(e)
        return None

    http_logger.info("Retrieved requested URL: %s" % in_url.rstrip())

    base_url = get_base_url(in_url)

    try:
        bsObj = BeautifulSoup(html, 'lxml')
    except AttributeError as e:
        http_logger.error("Page was not found: %s" % e)
        return None
    finally:
        # The parser has consumed the response by now (or failed); release
        # the underlying connection either way.
        html.close()

    if bsObj is None:
        # No exception object exists on this path, so report the URL instead.
        http_logger.info("Page has no data: %s" % in_url)
        return None
    return (bsObj, base_url)
Example #2
0
 def __init__(self,):
     """Constructor for Gmap.

     Initializes logging, takes the root logger, and reads the API key and
     search method from ``google_config.yml`` located in the same directory
     as this module.

     Raises:
         OSError: If the config file cannot be opened.
         KeyError: If a required key is absent from the loaded mapping.
     """
     init_logging()
     self.logger = logging.getLogger()
     config_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "google_config.yml")
     with open(config_path, "r") as fh:
         # safe_load instead of bare yaml.load(fh): PyYAML >= 6 rejects
         # load() without a Loader, and older versions may construct
         # arbitrary Python objects from the file.
         settings = yaml.safe_load(fh)
     self.api_key = settings['GOOGLE_API_KEY']
     self.search_type = settings['GOOGLE_SEARCH_METHOD']
Example #3
0
 def __init__(self, ):
     """Constructor for Gmap.

     Sets up logging, then loads ``GOOGLE_API_KEY`` and
     ``GOOGLE_SEARCH_METHOD`` from the ``google_config.yml`` file that sits
     next to this module.

     Raises:
         OSError: If the config file cannot be opened.
         KeyError: If a required key is absent from the loaded mapping.
     """
     init_logging()
     self.logger = logging.getLogger()
     module_dir = os.path.dirname(os.path.abspath(__file__))
     with open(os.path.join(module_dir, "google_config.yml"), "r") as fh:
         # Bare yaml.load(fh) fails on PyYAML >= 6 (Loader is mandatory) and
         # is unsafe on older releases; safe_load handles plain config data.
         settings = yaml.safe_load(fh)
     self.api_key = settings['GOOGLE_API_KEY']
     self.search_type = settings['GOOGLE_SEARCH_METHOD']
Example #4
0
    def __init__(self):
        """Set up logging, the PhantomJS driver, and scrape settings.

        Settings are read from ``config.yml`` in the same directory as this
        module; the keys ``toys``, ``site_url``, ``page_url`` and
        ``depth_limit`` are required.

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If a required key is absent from the loaded mapping.
        """
        init_logging()
        self.logger = logging.getLogger(__name__)
        self.logger.info("Job started and logging enabled")

        config_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "config.yml")
        with open(config_path, "r") as fh:
            # safe_load instead of bare yaml.load(fh): PyYAML >= 6 rejects
            # load() without a Loader, and older versions may construct
            # arbitrary Python objects from the file.
            settings = yaml.safe_load(fh)

        self.driver = webdriver.PhantomJS(
            desired_capabilities=dcap,
            service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
        self.driver.set_window_size(1024, 768)
        self.shipping_rate = 0.75  # $rate/lb  # TODO: shift this to AZ class
        self.outfile = "../data/test.csv"
        # Column names for the output file.
        self.fieldnames = ('net', 'roi', 'name', 'price', 'az_price', 'weight',
                           'az_sales_rank', 'az_match', 'url', 'img', 'az_url', 'az_asin')
        self.url_cats = settings['toys']
        self.site_url = settings['site_url']
        self.page_url = settings['page_url']
        self.base_url = strip_final_slash(get_base_url(self.site_url))
        self.az = AZ()
        self.depth_limit = settings['depth_limit']
Example #5
0
    def __init__(self, state_name='WY'):
        """Constructor for Dealer search object.

        Args:
            state_name: State abbreviation used to name the output file
                (``<state_name>.tsv`` inside the output directory).

        Logs an error and calls ``sys.exit()`` when the output directory
        does not exist.
        """
        init_logging()
        self.logger = logging.getLogger()
        self.logger.info(
            "PlaceFinder Search object initialized and logging enabled...")

        self.gfind = goog.Gmap()
        self.state_name = state_name

        output_dir = '../../data/output/'
        # Guard clause: bail out before building any output path.
        if not os.path.exists(output_dir):
            self.logger.error(
                "Data output directory mis-configured in PlaceFinder.py....exiting..."
            )
            sys.exit()

        outfile_template = '{data_dir}{state}.tsv'
        self.outfile = outfile_template.format(
            data_dir=output_dir, state=self.state_name)
        # Column names for the TSV output.
        self.fieldnames = ('name', 'web', 'address', 'city', 'state', 'zip',
                           'phone')
Example #6
0
    def __init__(self):
        """Set up logging, the database handle, and the Amazon API client.

        Settings are read from ``az_config.yml`` in the same directory as
        this module; the keys ``db_config``, ``access_key_id``,
        ``secret_key_id``, ``associate_tag`` and ``default_weight`` are
        required.

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If a required key is absent from the loaded mapping.
        """
        init_logging()
        self.logger = logging.getLogger(__name__)
        self.logger.info("Amazon object initialized")

        config_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "az_config.yml")
        with open(config_path, "r") as fh:
            # safe_load instead of bare yaml.load(fh): PyYAML >= 6 rejects
            # load() without a Loader, and older versions may construct
            # arbitrary Python objects from the file.
            settings = yaml.safe_load(fh)

        self.db = Mysql(settings['db_config'])

        self.access_key = settings['access_key_id']
        self.secret_key = settings['secret_key_id']
        self.associate_tag = settings['associate_tag']
        self.default_weight = settings['default_weight']
        # Per-lookup result fields; all start out unset.
        self.az_price = None
        self.az_asin = None
        self.az_sales_rank = None
        self.az_url = None
        self.az_match = None

        self.amazon = AmazonAPI(self.access_key, self.secret_key, self.associate_tag)
Example #7
0
 def __init__(self, ):
     """Constructor for LaundryFinder.

     Sets up logging and the Gmap helper, records the input data directory,
     and builds a timestamped output file name. Logs an error and calls
     ``sys.exit()`` when the output directory does not exist.
     """
     init_logging()
     self.logger = logging.getLogger()
     self.logger.info(
         "LaundryFinder Search object initialized and logging enabled...")
     self.gfind = goog.Gmap()

     output_dir = '../data/output'
     self.in_data = '../data/input'

     # Guard clause: nothing below applies without an output directory.
     if not os.path.exists(output_dir):
         self.logger.error(
             "Data output directory mis-configured in LaundryFinder.py....exiting..."
         )
         sys.exit()

     # Timestamp keeps successive runs from overwriting each other's output.
     stamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
     self.outfile = '{data_dir}/laundrysearch-{now}.tsv'.format(
         data_dir=output_dir, now=stamp)
     # Column names for the TSV output.
     self.fieldnames = ('target-zip', 'name', 'score', 'address', 'plid',
                        'id')
Example #8
0
    def __init__(self, state_name='WY'):
        """Constructor for Dealer search object.

        Args:
            state_name: State abbreviation; the output file is named
                ``<state_name>.tsv`` inside the output directory.

        Logs an error and calls ``sys.exit()`` if the output directory is
        missing.
        """
        init_logging()
        self.logger = logging.getLogger()
        self.logger.info("PlaceFinder Search object initialized and logging enabled...")

        self.gfind = goog.Gmap()
        self.state_name = state_name

        out_dir = '../../data/output/'
        out_dir_present = os.path.exists(out_dir)
        if not out_dir_present:
            # Cannot write results anywhere; stop here.
            self.logger.error("Data output directory mis-configured in PlaceFinder.py....exiting...")
            sys.exit()

        self.outfile = '{data_dir}{state}.tsv'.format(state=self.state_name, data_dir=out_dir)
        # Column names for the TSV output.
        self.fieldnames = ('name', 'web', 'address', 'city', 'state', 'zip', 'phone')
Example #9
0
    def __init__(self):
        """Set up logging, the PhantomJS driver, and crawl settings.

        Settings are read from ``wiki_config.yml`` in the same directory as
        this module; the keys ``output``, ``depth_limit``, ``debug``,
        ``reuse`` and ``top_url`` are required.

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If a required key is absent from the loaded mapping.
        """
        init_logging(default_path='../loggerUtils/logging.yml')
        self.logger = logging.getLogger(__name__)
        self.logger.info("Wiki Geo object initialized and logging enabled")

        config_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "wiki_config.yml")
        with open(config_path, "r") as fh:
            # safe_load instead of bare yaml.load(fh): PyYAML >= 6 rejects
            # load() without a Loader, and older versions may construct
            # arbitrary Python objects from the file.
            settings = yaml.safe_load(fh)

        self.driver = webdriver.PhantomJS(
            desired_capabilities=dcap,
            service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
        self.driver.set_window_size(1024, 768)
        self.outfile = settings['output']
        self.depth_limit = settings['depth_limit']
        self.debug = settings['debug']
        self.reuse = settings['reuse']
        # Column names for the output file.
        self.fieldnames = ('FIPS', 'GNIS', 'area-codes', 'county', 'county-url', 'density-2010-sqkm',
                           'density-2010-sqmi', 'elevation-ft', 'elevation-m', 'geohack-url',
                           'land-area', 'lat', 'location-img', 'census-map',
                           'long', 'place-name', 'place-type', 'place-url',
                           'place-www', 'pop-2010', 'pop-estimate', 'state',
                           'state-url', 'total-area', 'water-area', 'zips')
        self.top_url = settings['top_url']
        self.base_url = strip_final_slash(get_base_url(self.top_url))