Beispiel #1
0
    def create(cls, key, label, user, description=None, locale=None):
        """Create a new instance and register ``user`` as its supervisor.

        The instance key is stored lowercased. ``description`` is assigned
        as given, while ``locale`` is only assigned when it is not ``None``.
        An initial page is created when the
        ``adhocracy.create_initial_instance_page`` setting is enabled. When
        the ``adhocracy.instances.autojoin`` setting is ``ALL`` or contains
        ``key`` in its comma separated list, every user gets a membership in
        the instance's default group.

        The session is flushed before the new instance is returned.
        """
        from group import Group
        from membership import Membership
        from page import Page

        new_instance = Instance(unicode(key).lower(), label, user)
        new_instance.description = description
        new_instance.default_group = Group.by_code(Group.INSTANCE_DEFAULT)
        if locale is not None:
            new_instance.locale = locale
        meta.Session.add(new_instance)

        # The creating user becomes an approved supervisor of the instance.
        meta.Session.add(
            Membership(user, new_instance,
                       Group.by_code(Group.CODE_SUPERVISOR), approved=True))

        if config.get_bool('adhocracy.create_initial_instance_page'):
            Page.create(new_instance, label, u"", user)

        # Autojoin: add all users to the default group if configured.
        autojoin_setting = config.get('adhocracy.instances.autojoin')
        if autojoin_setting:
            if autojoin_setting == 'ALL':
                autojoin = True
            else:
                listed_keys = [name.strip()
                               for name in autojoin_setting.split(',')]
                autojoin = key in listed_keys
            if autojoin:
                for member in adhocracy.model.User.all():
                    meta.Session.add(
                        Membership(member, new_instance,
                                   new_instance.default_group))

        meta.Session.flush()
        return new_instance
Beispiel #2
0
    def create(cls, key, label, user, description=None, locale=None):
        """Build an instance, add it to the session and return it.

        ``key`` is lowercased before it is handed to ``Instance``. The
        given ``user`` receives an approved membership in the supervisor
        group. ``locale`` is only set when it is not ``None``.

        Two settings influence the result:

        * ``adhocracy.create_initial_instance_page`` -- when enabled, a
          page named after ``label`` with empty text is created.
        * ``adhocracy.instances.autojoin`` -- when it equals ``ALL`` or its
          comma separated list contains ``key``, a default-group
          membership is added for every user.
        """
        from group import Group
        from membership import Membership
        from page import Page

        created = Instance(unicode(key).lower(), label, user)
        created.description = description
        created.default_group = Group.by_code(Group.INSTANCE_DEFAULT)
        if locale is not None:
            created.locale = locale
        meta.Session.add(created)

        supervisors = Group.by_code(Group.CODE_SUPERVISOR)
        meta.Session.add(Membership(user, created, supervisors,
                                    approved=True))

        if config.get_bool('adhocracy.create_initial_instance_page'):
            Page.create(created, label, u"", user)

        # Decide whether all existing users join this instance.
        autojoin_value = config.get('adhocracy.instances.autojoin')
        join_everyone = False
        if autojoin_value:
            join_everyone = (
                autojoin_value == 'ALL' or
                key in [entry.strip() for entry in autojoin_value.split(',')])

        if join_everyone:
            for existing_user in adhocracy.model.User.all():
                meta.Session.add(Membership(existing_user, created,
                                            created.default_group))

        meta.Session.flush()
        return created
Beispiel #3
0
def crawler(name, query):
    '''Main Crawler for Job'''
    start = datetime.now()
    print name
    db = Database(name)
    db.create_colls()
    #get from source
    for n in db.sources.find():
        if n["url"] not in db.queue.distinct("url"):
            db.queue.insert(n)

    while db.queue.count > 0:

        print "Beginning crawl"
        # print "Number of seeds urls in sources databases:", db.sources.count()
        # print "Number of pending url to inspect:", len(db.queue.distinct("url"))
        for url in db.queue.distinct("url"):

            if url not in db.results.find({"url": url}):
                print url
                p = Page(url, query)
                if p.create():
                    a = Article()
                else:
                    print p.error_type

                #print "Links", p.outlinks
                #db.results.update(p.info, {'$push': {"date": datetime.today()}}, upsert=True)
                #db.results.insert(p.info)
                # if p.outlinks is not None:
                # 	try:
                # 		for n_url in p.outlinks:
                # 			if n_url is not None or  n_url not in db.queue.find({"url":n_url}) or n_url not in db.results.find({"url":n_url}) or n_url not in db.log.find({"url":n_url}):
                # 				# Checking correct url before is problematic
                # 				# next_p = Page(n_url, query)
                # 				# if next_p.clean_url(p.url) is not None:
                # 				print n_url
                # 				db.queue.insert({"url":n_url})
                # 	except mongo_err:
                # 		db.log.udpate({"url":url, "error_type": "pymongo error inserting outlinks", "query": self.query, "status":False},{'$push': {"date": datetime.today()}}, upsert=True)
                # elif p.error_type != 0:
                # 	''' if the page is not relevant do not store in db'''
                # 	db.log.update(p.bad_status(),{'$push':{"date": datetime.today()}}, upsert=True)
                # else:
                # 	continue

            db.queue.remove({"url": url})
            if db.queue.count() == 0:
                print db.stats()
                break

        if db.queue.count() == 0:
            print db.stats()
            break

    end = datetime.now()
    elapsed = end - start
    print "crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" % (
        db.results.count(), db.sources.count(), name, elapsed)
    return True
Beispiel #4
0
    def create(cls, key, label, user, description=None, locale=None):
        """Create an instance with an initial page and return it.

        The key is lowercased before it is passed to ``Instance``.
        ``user`` receives an approved membership in the supervisor group,
        and a page named after ``label`` with empty text is created.
        ``locale`` is only assigned when it is not ``None``. The session is
        flushed before returning.
        """
        from group import Group
        from membership import Membership
        from page import Page

        new_instance = Instance(unicode(key).lower(), label, user)
        new_instance.description = description
        new_instance.default_group = Group.by_code(Group.INSTANCE_DEFAULT)
        if locale is not None:
            new_instance.locale = locale
        meta.Session.add(new_instance)

        # The creating user becomes an approved supervisor.
        meta.Session.add(
            Membership(user, new_instance,
                       Group.by_code(Group.CODE_SUPERVISOR), approved=True))

        Page.create(new_instance, label, u"", user)
        meta.Session.flush()
        return new_instance
Beispiel #5
0
    def add_page(self, binary):
        """Create a page for this day from ``binary`` and commit it.

        The new page gets ``binary`` as its image, is appended to
        ``self.pages``, the pages are reordered and the database session
        is committed.
        """
        from page import Page

        new_page = Page.create(day=self)
        new_page.set_image(binary=binary)

        day_pages = self.pages
        day_pages.append(new_page)
        # self.pages is looked up again here, exactly like the plain
        # attribute accesses it replaces.
        self.pages.reorder()

        db.session.commit()
Beispiel #6
0
    def create(cls, key, label, user, description=None, locale=None):
        """Build an instance, give it an initial page and return it.

        ``key`` is stored lowercased and ``description`` is assigned as
        given. ``locale`` is left untouched when ``None`` is passed. The
        given ``user`` is added as an approved member of the supervisor
        group, a page with ``label`` as title and empty text is created,
        and the session is flushed.
        """
        from group import Group
        from membership import Membership
        from page import Page

        created = Instance(unicode(key).lower(), label, user)
        created.description = description
        created.default_group = Group.by_code(Group.INSTANCE_DEFAULT)
        if locale is not None:
            created.locale = locale
        meta.Session.add(created)

        supervisors = Group.by_code(Group.CODE_SUPERVISOR)
        supervisor_membership = Membership(user, created, supervisors,
                                           approved=True)
        meta.Session.add(supervisor_membership)

        Page.create(created, label, u"", user)
        meta.Session.flush()
        return created
Beispiel #7
0
	def crawl(self):
		self.discovery()
		start = datetime.now()
		while self.db.queue.count > 0:
			for url in self.db.queue.distinct("url"):
				print url, self.query
				p = Page(url, self.query)
				page = p.create()
				print page
				self.db.queue.remove({"url": url})
				if self.db.queue.count() == 0:
					break
			
			if self.db.queue.count() == 0:
				print self.db.stats()		
				break
		

		end = datetime.now()
		elapsed = end - start
		print "crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" %(self.db.results.count(),self.db.sources.count(),self.project, elapsed)
Beispiel #8
0
    def crawl(self):
        self.discovery()
        start = datetime.now()
        while self.db.queue.count > 0:
            for url in self.db.queue.distinct("url"):
                print url, self.query
                p = Page(url, self.query)
                page = p.create()
                print page
                self.db.queue.remove({"url": url})
                if self.db.queue.count() == 0:
                    break

            if self.db.queue.count() == 0:
                print self.db.stats()
                break

        end = datetime.now()
        elapsed = end - start
        print "crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" % (
            self.db.results.count(), self.db.sources.count(), self.project,
            elapsed)
Beispiel #9
0
def crawler(name, query):
	'''Main Crawler for Job'''
	start = datetime.now()
	print name
	db = Database(name)
	db.create_colls()
	#get from source
	for n in db.sources.find():
		if n["url"] not in db.queue.distinct("url"):
			db.queue.insert(n)
		
	while db.queue.count > 0:

		print "Beginning crawl"
		# print "Number of seeds urls in sources databases:", db.sources.count()
		# print "Number of pending url to inspect:", len(db.queue.distinct("url"))
		for url in db.queue.distinct("url"):
			
			if url not in db.results.find({"url":url}):
				print url
				p = Page(url, query)
				if p.create():
					a = Article()
				else:
					print p.error_type
			
				
				#print "Links", p.outlinks
				#db.results.update(p.info, {'$push': {"date": datetime.today()}}, upsert=True)
				#db.results.insert(p.info)
				# if p.outlinks is not None:
				# 	try:
				# 		for n_url in p.outlinks:
				# 			if n_url is not None or  n_url not in db.queue.find({"url":n_url}) or n_url not in db.results.find({"url":n_url}) or n_url not in db.log.find({"url":n_url}):
				# 				# Checking correct url before is problematic
				# 				# next_p = Page(n_url, query)
				# 				# if next_p.clean_url(p.url) is not None:
				# 				print n_url
				# 				db.queue.insert({"url":n_url})
				# 	except mongo_err:
				# 		db.log.udpate({"url":url, "error_type": "pymongo error inserting outlinks", "query": self.query, "status":False},{'$push': {"date": datetime.today()}}, upsert=True)
				# elif p.error_type != 0:
				# 	''' if the page is not relevant do not store in db'''
				# 	db.log.update(p.bad_status(),{'$push':{"date": datetime.today()}}, upsert=True)
				# else:
				# 	continue

			db.queue.remove({"url": url})
			if db.queue.count() == 0:
				print db.stats()
				break
			
		if db.queue.count() == 0:
			print db.stats()		
			break
		

	end = datetime.now()
	elapsed = end - start
	print "crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" %(db.results.count(),db.sources.count(),name, elapsed)
	return True