def __init__(self, *a, **kw):
    super(AmazonAPISpider, self).__init__(*a, **kw)
    self.project_conf = get_project_conf()
    # Per-instance caches filled in as the spider runs.
    self.amazon_apis = {}
    self.categories_to_scrape = {}
    self.brands_by_category_id = {}
    self._review_scrape = {}
    self.publishers = {}
Example #2
def main():
    """ Scheduler scripts, reads from the database all
        schedules and determines if any spider should
        be executed at this time.
    """
    #path = dirname(realpath(__file__))
    project_conf = get_project_conf()
    scheduler = Scheduler(project_conf)
    scheduler.run_scheduler()
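
The Scheduler class itself is not shown in these examples; the following is a minimal sketch of the loop the docstring describes, where load_schedules and launch_spider are hypothetical stand-ins for the project's actual database read and process launch:

from datetime import datetime

class Scheduler(object):
    def __init__(self, project_conf):
        self.project_conf = project_conf

    def load_schedules(self):
        # Stand-in for the database read; each row says which spider
        # should run at which hour.
        return [{"spider": "amazon_api", "hour": 3}]

    def launch_spider(self, name):
        print("launching spider %s" % name)

    def run_scheduler(self):
        now = datetime.now()
        for schedule in self.load_schedules():
            if schedule["hour"] == now.hour:
                self.launch_spider(schedule["spider"])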
Example #3
def __init__(self, country):
    project_conf = get_project_conf()
    neo_host = project_conf.get("NEO4J", "host")
    user = project_conf.get("NEO4J", "username")
    password = project_conf.get("NEO4J", "password")
    # Silence py2neo's verbose HTTP logging.
    logging.getLogger("py2neo.batch").setLevel(logging.WARNING)
    logging.getLogger("py2neo.cypher").setLevel(logging.WARNING)
    logging.getLogger("httpstream").setLevel(logging.WARNING)
    authenticate(neo_host, user, password)
    self.graph = Graph("http://%s/db/data/" % neo_host)
    try:
        self.graph.schema.create_uniqueness_constraint("Category", "id")
    except Exception:
        # The constraint may already exist; that is fine.
        pass
    self.categories = self.get_categories(country)
Example #4
def __init__(self, *a, **kw):
    super(AlaSpider, self).__init__(self.name, **kw)
    self.project_conf = get_project_conf()
    self.setup_logger()
    self.setup_logger_for_each_spider()
    self.spider_conf = get_source_conf(self.name)
    self.mysql_manager = MysqlManager(self.project_conf, self._logger)
    self.set_proxy()
    self.skip_categories = []
    # Bookkeeping for Selenium requests and browser instances.
    self.active_sel_requests = 0
    self.active_browsers = 0
    self.request_queue = Queue()
    self.queue_sizes = []
    # Optional overrides supplied as spider keyword arguments.
    self.input_start_url = kw.get('start_url', None)
    self.parse_function_name = kw.get('parse_function', None)
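
The start_url and parse_function overrides arrive through the spider's keyword arguments; in Scrapy these can be supplied when the crawl is started. A minimal sketch, assuming AlaSpider is importable from the project (the URL and function name below are placeholders, not values from this project):

from scrapy.crawler import CrawlerProcess

# Equivalent CLI form: scrapy crawl <spider name> -a start_url=... -a parse_function=...
process = CrawlerProcess()
# Keyword arguments passed to crawl() reach the spider's __init__ as **kw.
process.crawl(AlaSpider, start_url="http://example.com/start",
              parse_function="parse_category")
process.start()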
Example #5
def __init__(self, spider):
    # Silence the noisy HTTP client loggers.
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    self.project_conf = get_project_conf()
    self.spider = spider
    self.endpoint = self.spider.endpoint
Example #6
import re
import socket
from datetime import datetime
from os import listdir, unlink
from os.path import getsize, isfile, join

# amazon_source_ids is defined elsewhere in the module and is not shown here.

def main():
    project_conf = get_project_conf()
    finish_path = project_conf.get("OUTPUT", 'finished_directory')
    amazon_path = project_conf.get("OUTPUT", 'amazon_directory')

    # Filenames look like "<filetype>-YYYYMMDD_HHMMSS-<source_id>.csv".
    filename_re = r"([\w_]+)-(\d{8}_\d{6})-(\d+)\.csv"
    host_re = r"\w+(\d{3})"
    files = [f for f in listdir(finish_path) if isfile(join(finish_path, f))]
    processed_times = set()
    filenames = {}

    hostname = socket.gethostname()
    host_match = re.match(host_re, hostname)
    try:
        host_suffix = host_match.group(1)
    except AttributeError:
        # host_match is None when the hostname does not match host_re.
        pass
    #    raise Exception("Illegal hostname %s" % hostname)

    for file in files:
        match = re.match(filename_re, file)
        if not match:
            print("bad file name! %s" % file)
            continue

        timestamp = match.group(2)
        source_id = match.group(3)

        processed_key = "%s-%s" % (source_id, timestamp)

        if int(source_id) not in amazon_source_ids:
            continue

        if processed_key in processed_times:
            continue

        new_timestamp = datetime.now().strftime('%Y%m%d_%H')
        #new_timestamp = new_timestamp + ("0%s" % host_suffix)
        # Zero out minutes and seconds so every file from the same hour
        # aggregates under one name.
        new_timestamp = new_timestamp + "0000"
        # Reuse one target filename per (source, filetype) for the whole
        # run, so every old file merges into the same target.
        for filetype in ['reviews', 'products', 'product_id']:
            source_filenames = filenames.get(source_id, {})
            new_filename = source_filenames.get(filetype, None)
            if not new_filename:
                new_filename = "%s-%s-%s.csv" % (filetype, new_timestamp,
                                                 source_id)

            new_filepath = join(amazon_path, new_filename)
            source_filenames[filetype] = new_filename
            filenames[source_id] = source_filenames

            old_filename = "%s-%s-%s.csv" % (filetype, timestamp, source_id)
            old_filepath = join(finish_path, old_filename)
            has_header = isfile(new_filepath) and getsize(new_filepath) > 0
            #if file and it is not empty exists then has header
            if isfile(old_filepath):
                with open(new_filepath, "a+") as new_file:
                    with open(old_filepath, 'r') as old_file:
                        if has_header:
                            try:
                                next(old_file)
                            except StopIteration, e:
                                pass
                        for line in old_file:
                            new_file.write(line)
                unlink(old_filepath)
        processed_times.add(processed_key)
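
The inner loop above is a common pattern: append one CSV to another while keeping only the first header row. The same logic isolated as a small stand-alone helper (append_csv is a hypothetical name, not part of this project):

from os.path import getsize, isfile

def append_csv(old_filepath, new_filepath):
    # The target already has a header iff it exists and is non-empty.
    has_header = isfile(new_filepath) and getsize(new_filepath) > 0
    with open(new_filepath, "a") as new_file:
        with open(old_filepath, "r") as old_file:
            if has_header:
                next(old_file, None)  # drop the duplicate header row
            for line in old_file:
                new_file.write(line)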
Example #7
def main():
    path = dirname(realpath(__file__))
    project_conf = get_project_conf()
    consumer = SpiderConsumer(project_conf, path)
    consumer.run()
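
Every example starts with get_project_conf(), which evidently returns a ConfigParser-style object exposing .get(section, option). A minimal sketch of such a helper, assuming an INI-style project.conf next to the module (the real implementation and file location are not shown above):

from os.path import dirname, join, realpath
try:
    from configparser import ConfigParser  # Python 3
except ImportError:
    from ConfigParser import ConfigParser  # Python 2

def get_project_conf():
    conf = ConfigParser()
    # "project.conf" is an assumed filename; the examples only show
    # that sections such as NEO4J and OUTPUT exist.
    conf.read(join(dirname(realpath(__file__)), "project.conf"))
    return conf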