예제 #1
0
	def __init__(self):
		"""Build the configuration, keyword and site holders from the XML files.

		Reads 'xml/conf.xml', 'xml/keywords.xml', 'xml/f_keywords.xml' and
		'xml/sites.xml' through XMLReader, parses each tree with XMLParser and
		hands the parsed values to Configure, Associations and Sites via their
		setParams methods. exit() is called as soon as a required tree comes
		back as None.
		"""
		self.__conf = Configure()
		self.__associations = Associations()
		self.__sites = Sites()
		# Bind the list on the instance; the former bare local `resultList`
		# was discarded as soon as __init__ returned.
		self.resultList = []

		xReader = XMLReader()
		xParser = XMLParser()

		# --- general configuration -------------------------------------
		confTree = xReader.getTree('xml/conf.xml')
		if confTree is None:
			exit()
		searchParams = xParser.getSearchParams(confTree)
		searchSites = xParser.getSearchSites(confTree)
		pagesToSearch = xParser.getPagesToSearch(confTree)
		self.masterInspectionPath = xParser.getMIXML(confTree)

		self.__conf.setParams(searchSites, searchParams, pagesToSearch)

		# --- keywords / avoids (both trees are required) ----------------
		keywordTree = xReader.getTree('xml/keywords.xml')
		fKeywordTree = xReader.getTree('xml/f_keywords.xml')
		if keywordTree is None or fKeywordTree is None:
			exit()
		keywords = xParser.getKeywords(keywordTree)
		fKeywords = xParser.getKeywords(fKeywordTree)
		avoids = xParser.getAvoids(keywordTree)
		fAvoids = xParser.getAvoids(fKeywordTree)

		self.__associations.setParams(keywords, avoids, fKeywords, fAvoids)

		# --- good / bad sites --------------------------------------------
		sitesTree = xReader.getTree('xml/sites.xml')
		if sitesTree is None:
			exit()
		goodSites, badSites = xParser.getSites(sitesTree)

		self.__sites.setParams(goodSites, badSites)
예제 #2
0
    def associate(self, k_list, skip_probability=0, **kwargs):
        """Run one full association pass over ``self._table``.

        The per-run state (pointers, associations, dropped rows, operation
        stack) is reset, every row is offered to ``_extend_association``,
        and then the operation stack is drained: entries tagged 0 are
        dispatched to ``_redistribute_group`` and entries tagged with a
        truthy value to ``_delete_row``.

        k_list            -- one entry per fragment (asserted); passed to
                             ``Associations``.
        skip_probability  -- stored on ``self._skip_probability``.
        print_stats_every -- keyword option (default 1000): a progress line
                             is logged whenever the row id is a non-zero
                             multiple of it.

        Returns the tuple ``(associations, dropped_row_ids)``.
        """
        fragment_count = len(self._fragments)
        assert(len(k_list) == fragment_count)
        report_interval = kwargs.get('print_stats_every', 1000)

        # Fresh state for this run.
        self._k_list = k_list
        self._first_nonfull = [0] * fragment_count
        self._last_usable = [0] * fragment_count
        self._associations = Associations(self._k_list)
        self._dropped = set()
        self._skip_probability = skip_probability

        withtime = make_withtime()

        # First scan: place each row or mark it as dropped.
        for row_id, row in enumerate(self._table):
            if row_id and row_id % report_interval == 0:
                progress = 'associating row: %i firsts: %s lasts: %s' % (
                    row_id, self._first_nonfull, self._last_usable)
                logging.info(withtime(progress))

            association = self._extend_association(row, row_id)
            if association is not None:
                self._associations[row_id] = association
                self._update_pointers(association)
                continue
            self._dropped.add(row_id)
            logging.info('row_id {} dropped at first scan'.format(row_id))

        # Seed the stack with a redistribute operation (tag 0) for every
        # non-full group that holds at least one row.
        self._opstack = deque()
        for fragment_id in xrange(fragment_count):
            for group_id in self._get_nonfull_groups(fragment_id, False):
                if self._associations.get_group_size(fragment_id, group_id):
                    self._opstack.append((0, (fragment_id, group_id)))

        # Drain the stack; handlers may push further operations.
        counter = count()
        while self._opstack:
            if next(counter) % 10 == 0:
                logging.info(withtime('opstack length: %i' % len(self._opstack)))
            operation = self._opstack.pop()
            if operation[0]:
                self._delete_row(*operation[1])
            else:
                self._redistribute_group(*operation[1])

        logging.info(withtime('done after %i stack operations' % (next(counter))))
        return self._associations, self._dropped
예제 #3
0
class Loose:
    """Builds row-to-group associations over the fragments of a table.

    Each row receives one group id per fragment (its "association", a list
    indexed by fragment id).  A candidate group is accepted only if it passes
    the ``_check_*`` heterogeneity tests.  After the first scan, groups that
    are still non-full are redistributed into full neighbouring groups; rows
    that cannot be moved are deleted and recorded in ``self._dropped``.

    State created by ``associate`` (``_associations``, ``_first_nonfull``,
    ``_last_usable``, ``_dropped``, ``_opstack``) is required by most other
    methods, so ``associate`` must run before they are used.
    """

    def __init__(self, table, constraints, fragments):
        """Store the table and convert fragments/constraints to indices.

        ``table`` must support ``len()``, integer indexing, iteration and
        provide ``to_indices``, which is mapped over ``fragments`` and
        ``constraints``.
        """
        self._table, self.tuples = table, len(table)
        self._fragments = map(table.to_indices, fragments)
        self._constraints = Constraints(map(table.to_indices, constraints), self._fragments)

    def _get_group_data(self, fragment_id, group_id):
        """Lazily yield ``(row_id, row)`` for every row in the given group."""
        return ((row_id, self._table[row_id]) for row_id in self._associations.get_group(fragment_id, group_id))

    def _get_nonfull_groups(self, fragment_id, allow_skips=True, **kwargs):
        """Lazily yield the non-full group ids of ``fragment_id``.

        Candidates run from ``_first_nonfull[fragment_id]`` up to and
        including ``_last_usable[fragment_id]``.  ``allow_skips`` and
        ``kwargs`` are accepted but currently ignored because the skip
        filter below is commented out.
        """
        groups_to_check = xrange(self._first_nonfull[fragment_id], self._last_usable[fragment_id] + 1)
        return (group_id for group_id in groups_to_check
                if not self._associations.is_group_full(fragment_id, group_id))
                # uncomment this to allow skips
                # if not allow_skips or not self._skip_probability or random() > self._skip_probability)

    def _are_groups_alike(self, current_group, other_group, fragment_id, constraint_id, current_row):
        """Return True if any row pair across the two groups is alike.

        Rows are compared with ``Constraints.are_rows_alike_for`` for the
        given fragment and constraint.  When ``current_row`` is truthy it is
        treated as an additional member of ``current_group`` (the row about
        to be inserted).
        """
        current_rows = self._get_group_data(fragment_id, current_group)
        if current_row:
            # The row id is irrelevant here, so a dummy 0 is used.
            current_rows = chain([(0, current_row)], current_rows)

        for _, row1 in current_rows:
            for _, row2 in self._get_group_data(fragment_id, other_group):
                if self._constraints.are_rows_alike_for(row1, row2, fragment_id, constraint_id):
                    return True
        return False

    def _check_group_heterogenity(self, row, fragment_id, group_id):
        """Return False if ``row`` is alike to any row already in the group."""
        for _, other_row in self._get_group_data(fragment_id, group_id):
            if self._constraints.are_rows_alike(row, other_row, fragment_id):
                # logging.trace('GROUP VIOLATED fragment {} group {}'.format(fragment_id, group_id))
                return False
        return True

    def _check_association_heterogenity(self, association, fragment_id, group_id):
        """Return False if the candidate group is already linked to a group of ``association``.

        For every other fragment in ``association`` the pair
        ``(fragment_id, group_id) / (other_fragment, other_group)`` is looked
        up with ``Associations.exists``.
        """
        for other_fragment, other_group in enumerate(association):
            if other_fragment != fragment_id:
                if self._associations.exists(fragment_id, group_id, other_fragment, other_group):
                    # logging.trace('ASSOCIATION VIOLATED fragment {} group {}'.format(fragment_id, group_id))
                    return False
        return True

    def _check_deep_heterogenity(self, row, association, fragment_id, group_id):
        """Return True only if both deep checks accept the candidate group."""
        return (self._check_deep_heterogenity_of_association(row, association, fragment_id, group_id) and
                self._check_deep_heterogenity_of_associated_groups(row, fragment_id, group_id))

    def _check_deep_heterogenity_of_association(self, row, association, fragment_id, group_id):
        """Deep check centred on the association being built.

        For every constraint touching ``fragment_id`` and every fragment it
        involves, each existing association sharing that group is compared
        with the candidate: if all the *other* involved fragments have alike
        groups, the constraint would be violated and False is returned.
        Fragments not yet present in ``association`` are given group ``-1``.
        """
        for constraint_id in self._constraints.constraints_for(fragment_id):
            for fragment1 in self._constraints.involved_fragments_for(constraint_id):
                group1 = group_id if (fragment1 == fragment_id) else (-1 if fragment1 >= len(association) else association[fragment1])
                for other_association in self._associations.get_associated(fragment1, group1):
                    if other_association != association:
                        for fragment2 in self._constraints.involved_fragments_for(constraint_id):
                            if fragment2 != fragment1:
                                group2 = group_id if (fragment2 == fragment_id) else (-1 if fragment2 >= len(association) else association[fragment2])
                                if not self._are_groups_alike(group2, other_association[fragment2], fragment2, constraint_id, row):
                                    break
                        else:
                            # No fragment broke the likeness chain: violation.
                            # logging.trace("DEEP ASSOCIATION VIOLATED fragment {} group {}".format(fragment_id, group_id))
                            return False
        return True

    def _check_deep_heterogenity_of_associated_groups(self, row, fragment_id, group_id):
        """Deep check centred on associations already in the candidate group.

        Verifies that inserting ``row`` into ``(fragment_id, group_id)`` does
        not make two pre-existing associations violate a constraint.
        """
        constraints_for = set(self._constraints.constraints_for(fragment_id))
        # take all the pre-existing associations in (fragment_id, group_id)
        for association1 in self._associations.get_associated(fragment_id, group_id):
            # for every fragment1
            for fragment1 in xrange(len(self._fragments)):
                # which is not fragment_id
                if fragment1 != fragment_id:
                    # I already know which constraints can break:
                    # only the ones that insist on both fragment_id and fragment1
                    constraints_to_check = constraints_for & set(self._constraints.constraints_for(fragment1))
                    # take another association, associated with (fragment1, association1[fragment1])
                    for association2 in self._associations.get_associated(fragment1, association1[fragment1]):
                        # which is not association1
                        if association1 != association2:
                            # for every constraint that can break
                            for constraint_id in constraints_to_check:
                                # there must exist a fragment2 (insisting on the constraint)
                                for fragment2 in self._constraints.involved_fragments_for(constraint_id):
                                    # which is not fragment1
                                    if fragment2 != fragment1:
                                        # for which the groups are not alike
                                        if not self._are_groups_alike(association1[fragment2], association2[fragment2],
                                            # if fragment2 is the same as fragment_id, we inform the method are_groups_alike
                                            # about the fact that we want to insert the tuple in that fragment
                                            fragment2, constraint_id, row if (fragment2 == fragment_id) else None):
                                            break
                                else:
                                    # logging.trace("INNER {} {}".format(association1, association2))
                                    return False
        return True

    # fragment_id is needed because it's not always bound to the association length. It is in extend_association
    # but not in redistribute_group. There we want to check just a single fragment from an already complete association.
    def _check_heterogenity(self, row, association, fragment_id, group_id):
        """Run the group, association and deep checks, short-circuiting on failure."""
        return (self._check_group_heterogenity(row, fragment_id, group_id) and
                self._check_association_heterogenity(association, fragment_id, group_id) and
                self._check_deep_heterogenity(row, association, fragment_id, group_id))

    def _extend_association(self, row, row_id, association=None):
        """Recursively complete ``association`` with one group per fragment.

        The next fragment to fill is ``len(association)``.  Non-full groups
        are tried in order; the first one that passes the checks and leads to
        a complete association wins.  Returns the complete association
        (a list) or ``None`` when no candidate works.

        ``association`` defaults to a fresh empty list on every call, so the
        list handed back to the caller is never shared between calls.
        """
        if association is None:
            association = []

        fragment_id = len(association)
        if fragment_id == len(self._fragments):
            return association

        for group_id in self._get_nonfull_groups(fragment_id, row_id=row_id):
            if self._check_heterogenity(row, association, fragment_id, group_id):
                result = self._extend_association(row, row_id, association + [group_id])
                if result is not None:
                    return result
        return None

    def _full_neighbours_groups(self, fragment_id, group_id, step=1):
        """Lazily yield the full groups among the neighbours of ``group_id``.

        Neighbours come from ``neighbours(group_id, 0, last_usable + 1)``.
        ``step`` is accepted for signature symmetry but is not used.
        """
        return (new_group_id for new_group_id in neighbours(group_id, 0, self._last_usable[fragment_id] + 1)
                if self._associations.is_group_full(fragment_id, new_group_id))

    def _redistribute_group(self, fragment_id, group_id, step=1):
        """Try to move every row of a group into a full neighbouring group.

        Rows that cannot be moved get a delete operation ``(1, (row_id,
        step))`` pushed on ``self._opstack``.
        """
        logging.debug('redistributing group {} in fragment {}'.format(group_id, fragment_id))
        # NOTE(review): associations are reassigned while the group is being
        # iterated lazily; whether that is safe depends on Associations.
        for row_id, row in self._get_group_data(fragment_id, group_id):
            association = self._associations[row_id]
            if association:
                for new_group_id in self._full_neighbours_groups(fragment_id, group_id, step):
                    if self._check_heterogenity(row, association, fragment_id, new_group_id):
                        logging.debug('fragment {} row {}: group {} -> group {}'.format(fragment_id, row_id, group_id, new_group_id))
                        # Copy before editing so the stored association is replaced, not mutated.
                        new_association = list(association)
                        new_association[fragment_id] = new_group_id
                        self._associations[row_id] = new_association
                        break
                else:
                    logging.debug('fragment {} row {} : group {} -> not reallocable'.format(fragment_id, row_id, group_id))
                    self._opstack.append((1, (row_id, step)))    # delete

    def _delete_row(self, row_id, step=1):
        """Drop ``row_id`` and schedule redistribution of the groups it leaves non-full."""
        self._dropped.add(row_id)
        association = self._associations[row_id]
        logging.debug('deleting row {} = {}'.format(row_id, association))
        del self._associations[row_id]
        for fragment_id, group_id in enumerate(association):
            if not self._associations.is_group_full(fragment_id, group_id):
                self._opstack.append((0, (fragment_id, group_id, step + 1)))    # redistribute
        logging.info('row {} deleted (step {})'.format(row_id, step))

    def _update_first_nonfull(self, association):
        """Advance each fragment's first-non-full pointer past groups that became full."""
        for fragment_id, group_id in enumerate(association):
            # Only the group sitting at the pointer can have moved it.
            if group_id == self._first_nonfull[fragment_id]:
                while (self._associations.is_group_full(fragment_id, self._first_nonfull[fragment_id])):
                    self._first_nonfull[fragment_id] += 1

    def _update_last_usable(self, association):
        """Push each fragment's last-usable pointer one past the highest group used."""
        for fragment_id, group_id in enumerate(association):
            if group_id >= self._last_usable[fragment_id]:
                self._last_usable[fragment_id] = group_id + 1

    def _update_pointers(self, association):
        """Refresh both per-fragment pointers after ``association`` was stored."""
        self._update_first_nonfull(association)
        self._update_last_usable(association)

    def associate(self, k_list, skip_probability=0, **kwargs):
        """Run one full association pass over the table.

        k_list            -- one entry per fragment (asserted); passed to
                             ``Associations``.
        skip_probability  -- stored on ``self._skip_probability``; not
                             consulted while the skip filter in
                             ``_get_nonfull_groups`` is commented out.
        print_stats_every -- keyword option (default 1000): progress is
                             logged at every non-zero multiple of it.

        Returns ``(associations, dropped_row_ids)``.
        """
        assert(len(k_list) == len(self._fragments))
        print_stats_every = kwargs.get('print_stats_every', 1000)

        self._k_list = k_list
        self._first_nonfull = [0 for _ in xrange(len(self._fragments))]
        self._last_usable = [0 for _ in xrange(len(self._fragments))]
        self._associations = Associations(self._k_list)
        self._dropped = set()
        self._skip_probability = skip_probability

        # First scan: place each row or record it as dropped.
        withtime = make_withtime()
        for row_id, row in enumerate(self._table):
            if row_id and not row_id % print_stats_every:
                logging.info(withtime('associating row: %i firsts: %s lasts: %s'
                    % (row_id, self._first_nonfull, self._last_usable)))

            association = self._extend_association(row, row_id)
            if association is None:
                self._dropped.add(row_id)
                logging.info('row_id {} dropped at first scan'.format(row_id))
            else:
                self._associations[row_id] = association
                self._update_pointers(association)

        # Stack entries are (tag, args): tag 0 -> _redistribute_group,
        # truthy tag -> _delete_row.  Seed it with every non-empty, non-full group.
        self._opstack = deque()
        for fragment_id in xrange(len(self._fragments)):
            for group_id in self._get_nonfull_groups(fragment_id, False):
                if self._associations.get_group_size(fragment_id, group_id):
                    self._opstack.append((0, (fragment_id, group_id)))    # redistribute

        # Drain the stack; the handlers may push further operations.
        counter = count()
        while self._opstack:
            if not next(counter) % 10:
                logging.info(withtime('opstack length: %i' % len(self._opstack)))
            operation = self._opstack.pop()
            fn = self._delete_row if operation[0] else self._redistribute_group
            fn(*operation[1])

        logging.info(withtime('done after %i stack operations' % (next(counter))))
        return self._associations, self._dropped

    def associate_with_retries(self, k_list, retries, skip_probability=0.05, **kwargs):
        """Call ``associate`` up to ``retries`` times until no row is dropped.

        Returns ``(associations, iteration_index)`` for the first run with an
        empty dropped set.  If every run drops rows, 'No solution found' is
        logged and ``None`` is returned implicitly, so callers must check
        before unpacking.
        """
        for i in xrange(retries):
            associations, dropped = self.associate(k_list, skip_probability, **kwargs)
            if not dropped:
                logging.info('Found after {} iterations\nSolution: {}'.format(i, associations))
                return associations, i
        logging.info('No solution found')

    def print_statistics(self):
        """Print a group-size histogram per fragment and the dropped-row summary.

        Empty groups are left out of the histogram.  Must be called after
        ``associate``.
        """
        for fragment_id in xrange(len(self._fragments)):
            print('\nFragment {}'.format(fragment_id))
            group_sizes = (self._associations.get_group_size(fragment_id, group_id)
                           for group_id in xrange(self._last_usable[fragment_id]))
            # Compare by value: `is not 0` relied on CPython's small-int caching.
            histogram = Counter(size for size in group_sizes if size != 0)
            for length, groups in histogram.items():
                print('{} elements: {} groups'.format(length, groups))
        # Guard against an empty table, where the ratio is undefined.
        dropped_ratio = float(len(self._dropped)) / self.tuples if self.tuples else 0.0
        print('\n{} ({:.3%}) lines dropped: {}'.format(len(self._dropped), dropped_ratio, self._dropped))
예제 #4
0
class BLParent():
	"""Driver that loads the XML settings, runs the search and builds the report.

	__init__ reads the XML files under 'xml/' and fills the Configure,
	Associations and Sites holders.  startSubProcesses runs the search
	through ConnectionManager and stores its results in resultList.
	createMasterInspectionXML merges the XML files found in 'results/' into
	the master inspection file.  startServerProg opens the tracker page in
	google-chrome.
	"""
	# Class-level defaults; __init__ rebinds these per instance.
	__conf = None
	__associations = None
	__sites = None

	resultList = None
	masterInspectionPath = None

	def __init__(self):
		"""Build the configuration, keyword and site holders from the XML files.

		exit() is called as soon as one of the required XML trees comes back
		as None from XMLReader.getTree.
		"""
		self.__conf = Configure()
		self.__associations = Associations()
		self.__sites = Sites()
		# Bind the list on the instance; the former bare local `resultList`
		# was discarded as soon as __init__ returned.
		self.resultList = []

		xReader = XMLReader()
		xParser = XMLParser()

		# --- general configuration -------------------------------------
		confTree = xReader.getTree('xml/conf.xml')
		if confTree is None:
			exit()
		searchParams = xParser.getSearchParams(confTree)
		searchSites = xParser.getSearchSites(confTree)
		pagesToSearch = xParser.getPagesToSearch(confTree)
		self.masterInspectionPath = xParser.getMIXML(confTree)

		self.__conf.setParams(searchSites, searchParams, pagesToSearch)

		# --- keywords / avoids (both trees are required) ----------------
		keywordTree = xReader.getTree('xml/keywords.xml')
		fKeywordTree = xReader.getTree('xml/f_keywords.xml')
		if keywordTree is None or fKeywordTree is None:
			exit()
		keywords = xParser.getKeywords(keywordTree)
		fKeywords = xParser.getKeywords(fKeywordTree)
		avoids = xParser.getAvoids(keywordTree)
		fAvoids = xParser.getAvoids(fKeywordTree)

		self.__associations.setParams(keywords, avoids, fKeywords, fAvoids)

		# --- good / bad sites --------------------------------------------
		sitesTree = xReader.getTree('xml/sites.xml')
		if sitesTree is None:
			exit()
		goodSites, badSites = xParser.getSites(sitesTree)

		self.__sites.setParams(goodSites, badSites)

	def startSubProcesses(self):
		"""Run the search through ConnectionManager and keep its results.

		The good and bad site lists are combined with
		ListTool.addOnlyUniqueFromList before being handed to the connection
		manager.  The call blocks on CM.join() and then stores
		CM.getResults() in self.resultList.
		"""
		CM = ConnectionManager()
		lt = ListTool()
		sitesList = lt.addOnlyUniqueFromList(self.__sites.goodSites, self.__sites.badSites)

		CM.initializeConnection(
			self.__associations.keywordsList,
			self.__associations.avoidsList,
			sitesList,
			self.__conf.siteToSearchList,
			self.__conf.pagesToSearch,
			self.__conf.searchParamsList)
		CM.startThread()
		CM.join()
		CM.parseResults()

		self.resultList = CM.getResults()

	def createMasterInspectionXML(self, delChildXMLs = False):
		"""Merge the XML files in 'results/' into the master inspection XML.

		delChildXMLs -- when true, every source XML that was read is deleted
		                after the master file has been written.

		Prints 'No files read.' and calls exit() when no inspections are
		found.
		"""
		lt = ListTool()
		osTool = OSTool()
		sorter = Sorter()
		insp = Inspector()

		xmls = osTool.getFilesInDir('results/')
		# Presumably removes the master file itself from the inputs — confirm
		# against ListTool.popByWord.
		xmls = lt.popByWord(xmls, self.masterInspectionPath)

		XMLInspections = insp.getInspections(xmls)

		if len(XMLInspections) == 0:
			print('No files read.')
			exit()

		XMLInspections = sorter.sortInspectionList(XMLInspections)

		xWriter = XMLWriter()
		xWriter.writeMIXML(XMLInspections, self.masterInspectionPath)

		if delChildXMLs:
			for xml in xmls:
				osTool.deleteFile(xml)

	def startServerProg(self):
		"""Launch google-chrome on 'localhost:80/tracker/' via OSTool."""
		osTool = OSTool()
		osTool.startProgram('google-chrome', 'localhost:80/tracker/')