def __init__(self, *a, **kw):
    super(AmazonAPISpider, self).__init__(*a, **kw)
    self.project_conf = get_project_conf()
    # Per-run lookup tables, filled in as the spider discovers work.
    self.amazon_apis = {}
    self.categories_to_scrape = {}
    self.brands_by_category_id = {}
    self._review_scrape = {}
    self.publishers = {}
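# `get_project_conf()` is used throughout these excerpts but not defined in
# them. A minimal sketch of what it might look like, assuming the project
# stores settings in an INI file read with the standard-library ConfigParser;
# the `.get(section, option)` calls elsewhere in this section are consistent
# with that interface. The "project.conf" filename is a placeholder, not a
# confirmed location.
try:
    from configparser import ConfigParser                       # Python 3
except ImportError:
    from ConfigParser import SafeConfigParser as ConfigParser   # Python 2
from os.path import dirname, join, realpath

def get_project_conf():
    conf = ConfigParser()
    conf.read(join(dirname(realpath(__file__)), "project.conf"))
    return conf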
def main(): """ Scheduler scripts, reads from the database all schedules and determines if any spider should be executed at this time. """ #path = dirname(realpath(__file__)) project_conf = get_project_conf() scheduler = Scheduler(project_conf) scheduler.run_scheduler()
def __init__(self, country):
    project_conf = get_project_conf()
    neo_host = project_conf.get("NEO4J", "host")
    user = project_conf.get("NEO4J", "username")
    password = project_conf.get("NEO4J", "password")
    # Silence the chatty py2neo/httpstream loggers.
    logging.getLogger("py2neo.batch").setLevel(logging.WARNING)
    logging.getLogger("py2neo.cypher").setLevel(logging.WARNING)
    logging.getLogger("httpstream").setLevel(logging.WARNING)
    authenticate(neo_host, user, password)
    self.graph = Graph("http://%s/db/data/" % neo_host)
    try:
        self.graph.schema.create_uniqueness_constraint("Category", "id")
    except Exception:
        # The constraint already exists from a previous run.
        pass
    self.categories = self.get_categories(country)
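# `get_categories()` is referenced above but not shown. A minimal sketch,
# assuming py2neo 2.x (matching the `authenticate`/`Graph` calls above) and
# assuming Category nodes carry `country`, `id`, and `name` properties; the
# property names and the Cypher query are illustrative, not confirmed by the
# project.
def get_categories(self, country):
    results = self.graph.cypher.execute(
        "MATCH (c:Category {country: {country}}) "
        "RETURN c.id AS id, c.name AS name",
        country=country)
    return {record.id: record.name for record in results}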
def __init__(self, *a, **kw):
    super(AlaSpider, self).__init__(self.name, **kw)
    self.project_conf = get_project_conf()
    self.setup_logger()
    self.setup_logger_for_each_spider()
    self.spider_conf = get_source_conf(self.name)
    self.mysql_manager = MysqlManager(self.project_conf, self._logger)
    self.set_proxy()
    self.skip_categories = []
    # Bookkeeping for queued requests and open (Selenium) browsers.
    self.active_sel_requests = 0
    self.active_browsers = 0
    self.request_queue = Queue()
    self.queue_sizes = []
    # Optional overrides passed in from the command line.
    self.input_start_url = kw.get('start_url', None)
    self.parse_function_name = kw.get('parse_function', None)
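# How the queue and counters above interact is not shown in this section.
# A minimal sketch of one plausible throttling pattern, assuming a
# hypothetical `max_browsers` limit and `_open_browser()` helper; neither
# name comes from the project.
def _maybe_dispatch(self):
    # Drain queued requests only while under the browser cap.
    while not self.request_queue.empty() and self.active_browsers < self.max_browsers:
        request = self.request_queue.get()
        self.active_browsers += 1
        self.active_sel_requests += 1
        self._open_browser(request)                 # hypothetical helper
    # Record queue depth over time for later inspection.
    self.queue_sizes.append(self.request_queue.qsize())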
def __init__(self, spider):
    # Silence the chatty HTTP client loggers.
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    self.project_conf = get_project_conf()
    self.spider = spider
    self.endpoint = self.spider.endpoint
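# This class evidently wraps HTTP access to the spider's endpoint (it tunes
# the `requests`/`urllib3` loggers and stores `endpoint`). A minimal sketch
# of the kind of call it might make, using the standard `requests` library;
# the method name and parameters are illustrative.
import requests

def fetch(self, params=None, timeout=30):
    response = requests.get(self.endpoint, params=params, timeout=timeout)
    response.raise_for_status()   # surface HTTP errors to the caller
    return response.json()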
def main():
    project_conf = get_project_conf()
    finish_path = project_conf.get("OUTPUT", 'finished_directory')
    amazon_path = project_conf.get("OUTPUT", 'amazon_directory')
    filename_re = r"([\w_]+)-(\d{8}_\d{6})-(\d+)\.csv"
    host_re = r"\w+(\d{3})"
    files = [f for f in listdir(finish_path) if isfile(join(finish_path, f))]
    processed_times = set()
    filenames = {}
    hostname = socket.gethostname()
    host_match = re.match(host_re, hostname)
    try:
        host_suffix = host_match.group(1)
    except AttributeError:
        pass  # raise Exception("Illegal hostname %s" % hostname)
    for file in files:
        match = re.match(filename_re, file)
        if not match:
            print("bad file name! %s" % file)
            continue
        timestamp = match.group(2)
        source_id = match.group(3)
        processed_key = "%s-%s" % (source_id, timestamp)
        if int(source_id) not in amazon_source_ids:
            continue
        if processed_key in processed_times:
            continue
        # Bucket every file from this run into the current hour.
        new_timestamp = datetime.now().strftime('%Y%m%d_%H')
        #new_timestamp = new_timestamp + ("0%s" % host_suffix)
        new_timestamp = new_timestamp + "0000"
        for filetype in ['reviews', 'products', 'product_id']:
            # Pin the output filename per source the first time it is seen,
            # so every timestamp of that source merges into the same file.
            source_filenames = filenames.get(source_id, {})
            new_filename = source_filenames.get(filetype, None)
            if not new_filename:
                new_filename = "%s-%s-%s.csv" % (filetype, new_timestamp, source_id)
                source_filenames[filetype] = new_filename
                filenames[source_id] = source_filenames
            new_filepath = join(amazon_path, new_filename)
            old_filename = "%s-%s-%s.csv" % (filetype, timestamp, source_id)
            old_filepath = join(finish_path, old_filename)
            # If the target file exists and is non-empty, it already has a header.
            has_header = isfile(new_filepath) and getsize(new_filepath) > 0
            if isfile(old_filepath):
                with open(new_filepath, "a+") as new_file:
                    with open(old_filepath, 'r') as old_file:
                        if has_header:
                            try:
                                next(old_file)  # skip the duplicate header row
                            except StopIteration:
                                pass  # old file was empty
                        for line in old_file:
                            new_file.write(line)
                unlink(old_filepath)
        processed_times.add(processed_key)
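# A quick illustration of the filename convention the regex above expects,
# useful when checking why a file lands in the "bad file name!" branch. The
# sample filename is made up for demonstration.
import re

m = re.match(r"([\w_]+)-(\d{8}_\d{6})-(\d+)\.csv", "reviews-20160401_120000-101.csv")
assert m.group(1) == "reviews"          # filetype
assert m.group(2) == "20160401_120000"  # scrape timestamp (YYYYMMDD_HHMMSS)
assert m.group(3) == "101"              # numeric source id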
def main():
    path = dirname(realpath(__file__))
    project_conf = get_project_conf()
    consumer = SpiderConsumer(project_conf, path)
    consumer.run()