Example #1
0
    def getTop(num=100, highlight_funcs=[], remove_funcs=[]):
        """
            Get the top [amount] filtered URLS for a case. This has been optimised as one 
            query with a subquery. 
        """        

        filter = getFilter(highlight_funcs, remove_funcs)
        
        url2 = aliased(URL)
        subq = session.query(url2.domain, func.count(url2.domain).label('domain_count'))\
                .join(filter).filter(url2.domain != None)\
                .group_by(url2.domain).order_by(desc(func.count(url2.domain)))

        if num != "all":
            subq = subq.limit(num)           
        subq = subq.subquery()

        q = session.query(URL.netloc, func.count(1), subq.c.domain, subq.c.domain_count)\
            .join((subq, URL.domain == subq.c.domain))\
            .join(filter)\
            .group_by(URL.netloc)\
            .order_by(desc(subq.c.domain_count), asc(subq.c.domain), asc(URL.netloc))

        domains = []
        for netloc, netloc_count, domain, domain_count in q:
            if len(domains) == 0 or domains[-1][0] != domain:
                domains.append((domain, domain_count, []))
                
            domains[-1][2].append((netloc, netloc_count))
          
        return domains 
Example #2
0
 def getAll(cls):
     """ 
         Return all the objects.
     """
     try:
         q = session.query(cls).order_by(asc(cls.id))
     except AttributeError:
         q = session.query(cls).order_by(asc(cls.entry_id))
     return q
Example #3
0
    def filesAccessed():
        """
            Returns a file directory tree and the total amount of files accessed. The tree
            is a dictionary of drives, each drive letter is the key and the value is another dictionary 
            with the keys being file types. For each file type, the value is a tree structure with the
            end point being a file name, the number of times the file was accessed and a list of 
            access dates.
                    
            .. note::
                This does not allow for Linux etc drives, only Windows one letter drives.
                
                **ToDo**: re-optimise this query, more things were done making it slow again for 
                larger datasets. 
            
                **ToDo**: Allow other operating systems file systems. 
        """
        q = session.query(URL, func.count(1)).filter(URL.scheme == "file").order_by(func.count(1))
        q = q.group_by(URL.path)
        
        total = q.count()
         
        drives = {}
        
        for file, count in q:
            path = urllib.unquote(file.path[1:])
            
            # hack to get dates working: needs to be made into above query ^
            dates = session.query(Entry.access_date, Entry.access_time).join(URL)\
                .filter(URL.path==file.path)\
                .order_by(desc(Entry.access_date), desc(Entry.access_time))
            
            if path[1] == ":":
                drive = path[0]
            else:
                continue #don't deal with Linux yet
            
            ending = path.rsplit('.',1)
            type = FILE_TYPES.get(ending[-1].strip().lower(),'other file')


            if drive not in drives:           
                drives[drive] = ({}, 0)
            
            if type not in drives[drive][0]:
                drives[drive][0][type] = ({}, 0)
                
            drives[drive] = (drives[drive][0], drives[drive][1] + count)
            drives[drive][0][type] = (drives[drive][0][type][0], drives[drive][0][type][1] + count)
     
            Entry._buildTree(drives[drive][0][type][0], (path, count, dates))
                            
        return drives, total    
Example #4
0
    def peakTime():  
        """
            Returns the peak time of web browser usage. Similar to the heatmap, although doesn't 
            do for each day of the week - sums up each hour and returns the highest.
        """     
        step = 1
        
        init = datetime(1, 1, 1)
        start_time = init.time()
        until_time = None
        
        highest = 0
        timeperiod = None
        while(start_time < t(24-step,0,0)):
            start_time = init.time()
            until_time = (init + timedelta(hours=step) - timedelta(milliseconds=1)).time()
        
            q = session.query(Entry).filter(Entry.access_time >= start_time)\
                    .filter(Entry.access_time <= until_time).count()

            if q > highest:
                highest = q
                timeperiod = start_time.strftime('%H:%M - ') + until_time.strftime('%H:%M')
                
            init = init + timedelta(hours=step)
        return timeperiod  
Example #5
0
 def getAllEntries(case):
     """
         Returns all the entries for this case where the access time and date are not `None`.
     """
     q = session.query(Entry).filter(Entry.access_time!=None).filter(Entry.access_date!=None).\
         join('group').filter_by(case_id=case.id)
     return q                        
Example #6
0
 def setDomain_manual(group):
     """
         Manually set the domain names. Used in the optimised adding of URLs which bypasses 
         creating URL objects using `__init__()`
     """
     q = session.query(URL).join('entry').filter_by(group_id=group.id).all()
     for row in q:
         row.setDomain()
Example #7
0
 def getEndDate(self):
     """
         Get the ending date of all the group's entries.
     """
     return self._formatDate(session.query(Entry).filter_by(group_id=self.id).\
                             filter(Entry.access_date!=None).filter(Entry.access_time!=None).\
                             order_by(desc(Entry.access_date), desc(Entry.access_time)).\
                             first().access_date)
Example #8
0
    def onlineSearches(engine, highlight_funcs=[], remove_funcs=[], num=20):
        """
            Given a search engine, amount and filters, returns a word cloud dictionary of search terms.
            The keys are the terms and the values are a (ratio, search phrases term was in, 
            length of search phrases) tuple. Also returns total amount of terms, total number of unique 
            terms and the smallest ratio (used to make the sizes of the words relative to smallest).
        """
        
        filter = getFilter(highlight_funcs, remove_funcs)

        searchterms2 = aliased(SearchTerms)
        subq = session.query(searchterms2.id).filter(searchterms2.engine==engine)\
            .join(searchterms2.entries).join((filter, Entry.id == filter.c.id))\
            .group_by(searchterms2.id)\
            .order_by(desc(searchterms2.occurrence))
        
        if num != "all":
            subq = subq.limit(num)
                
        subq = subq.subquery()
        
        q = session.query(SearchTerms)\
            .join((subq, subq.c.id == SearchTerms.id))\
            .join(SearchTerms.entries)\
            .join((filter, Entry.id == filter.c.id))\
            .join(Entry.parsedurl).filter(SearchTerms.engine==engine)\
            .options(contains_eager('entries'), contains_eager('entries.parsedurl'))\
            .order_by(desc(Entry.access_date), desc(Entry.access_time))            
        
        
        terms = q.all()
        total_terms = float(sum(len(t.entries) for t in terms))        
        unique_terms = len(terms)
        cloud = {}
        smallest = 1.0
        for term in terms:
            search_strings = [(entry.parsedurl.search, entry.timeline_date) for entry in term.entries]
            ratio = len(search_strings)/total_terms
            # Hack to stop javascript name-space clash. TODO: Fix this in nicer way.
            cloud['_' + term.term] = (ratio, search_strings, len(search_strings))
            if ratio < smallest:
                smallest = ratio
        return cloud, total_terms, unique_terms, smallest
Example #9
0
 def getPercentages():
     """
         Used in the overview statistics. Returns a list of `(browser object, percent)` tuples 
         for each browser where percent is the percentage this browser is used in all the entries.
     """
     q = session.query(Browser.name, func.count(1)).join('entries').group_by(Browser.name).all()
     total = 0.0
     for browser, count in q:
         total = total + count
     return [(browser, count/total) for browser, count in q]
Example #10
0
 def averagePages():
     """
         Returns the average number of websites visited in one day, i.e. only http
         and http entries. 
     """
     q = session.query(Entry.access_date, func.count(1)).join('parsedurl')\
         .filter(or_(URL.scheme=="http", URL.scheme=="https")).group_by(Entry.access_date)
     total = q.count()
     sums = [row1 for row0, row1 in q]
     if total != 0:
         return int(sum(sums)/float(total))
     else:
         return 0  
Example #11
0
    def getTimeGraph(startdate, enddate, starttime, endtime, remove_funcs, highlight_funcs,
               remove_duplicates=False, duplicate_time=0):
        """
            Given a start and end date and time and filters, returns three lists of entry points:
            `highlighted`, `not_highlighted` and `removed` by calling `filter_queries()`
            Each entry will be in one of those lists for Flot to draw.  
        """
        
        # make the queries
        # ----------------
        q = session.query(Entry.access_date, Entry.access_time, Entry.url, Browser.name,
                          Browser.version, Browser.source, Group.program, Entry.title)
        
        # join onto all the other tables
        q = q.join('browser').join('parsedurl').join('group').outerjoin(Entry.search_terms)
        
        # filter by the start and end dates and times
        q = q.filter(Entry.access_date >= startdate).filter(Entry.access_date <= enddate)
        
        q_removed, q_highlighted, q_not_highlighted = Case.filter_queries(q, remove_funcs, highlight_funcs)

        removed = (q_removed.order_by(asc(Entry.access_date), asc(Entry.access_time)), [])
        highlighted = (q_highlighted.order_by(asc(Entry.access_date), asc(Entry.access_time)), []) 
        not_highlighted = (q_not_highlighted.order_by(asc(Entry.access_date), asc(Entry.access_time)), [])

        # put the results in the format Flot wants it
        # -------------------------------------------
        for (q, entries) in [removed, highlighted, not_highlighted]:
            seen = set()
            for cols in q:   
                plot = get_plotable(*cols)

                if remove_duplicates == True: # remove duplicates                
                    if plot not in seen:
                        if len(entries) == 0:
                            entries.append(plot)   # add first entry regardless
                            seen.add(plot)
                        else:
                            if (entries[-1][0] == plot[0] and \
                            (plot[1] - entries[-1][1]) >= float(duplicate_time)/float(60)) \
                            or (entries[-1][0] < plot[0]): 
                                # only add entries <duplicate_time> minutes apart
                                entries.append(plot)
                                seen.add(plot)
                else: # else add everything
                    entries.append(plot)
                
        return highlighted[1], not_highlighted[1], removed[1]
Example #12
0
 def testfilter_queries(self):
     date = datetime(2010, 5, 1)
     q = session.query(Entry.url).join('browser').join('parsedurl').\
         join('group').outerjoin(Entry.search_terms).\
         filter(Entry.access_date==date)
     rem = FilterQuery()   
     rem.add_element('URL Parts','domain','Is','bbc.co.uk', None) 
     remove_funcs = [rem]
     high = FilterQuery()
     high.add_element('URL Parts','domain','Is','google.com',None) 
     highlight_funcs = [high]
     qrem, qhigh, qnot = Case.filter_queries(q, remove_funcs, highlight_funcs) 
     
     self.assertEqual(qrem.count(), 4)
     self.assertEqual(qhigh.count(), 10)
     self.assertEqual(qnot.count(), 15)  
Example #13
0
def getFilter(highlight_funcs=[], remove_funcs=[]):
    """
        Get the right filter. This is used to filter domain names and search terms correctly.
        `Case.filter_queries` is called, and if `highlight_funcs` != [] then return
        the highlighted query, otherwise return the non-highlighted query. 
    """
    filter_q = session.query(Entry.id)\
                .join(Entry.parsedurl, Entry.browser, Entry.group)\
                .outerjoin(Entry.search_terms)                    

    q_removed, q_highlighted, q_not_highlighted = \
        Case.filter_queries(filter_q, remove_funcs, highlight_funcs)        
        
    if highlight_funcs:
        filter_q = q_highlighted 
    else:
        filter_q = q_not_highlighted
        
    return filter_q.group_by(Entry.id).subquery()
Example #14
0
    def generateHeatMap():
        """
            Returns heatmap things for the overview: the table headers, the heatmap
            table and the highest and lowest values (used to calculate heatmap colour).
        """
        headers = ['Mon', 'Tue','Wed','Thu','Fri','Sat','Sun']
        rows = []
        step = 1 # number of hours each row in table represents
        
        init = datetime(1, 1, 1)
        start_time = init.time()
        until_time = None
        
        highest = 0
        lowest = 0
        
        while(start_time < t((24-step),0,0)):
            start_time = init.time()
            until_time = (init + timedelta(hours=step) - timedelta(milliseconds=1)).time()
            
            title = start_time.strftime('%H:%M - ') + until_time.strftime('%H:%M')
            
            values = []
            for weekday in [1,2,3,4,5,6,0]: # '%w' gives weekdays starting on sunday
                q = session.query(Entry)\
                    .filter(func.strftime('%w', Entry.access_date) == str(weekday))\
                    .filter(Entry.access_time >= start_time)\
                    .filter(Entry.access_time <= until_time).count()

                if q > highest:
                    highest = q
                if q < lowest:
                    lowest = q
                
                values.append(q)
                
            rows.append((title, values))
            init = init + timedelta(hours=step)

        return headers, rows, highest, lowest   
Example #15
0
 def getFilterBy(cls, **vars):
     """
         Return a list of objects which satisfy the conditions in **vars
         E.g. `Case.getFilterBy(name="Foo")`
     """
     return session.query(cls).filter_by(**vars)
Example #16
0
 def getAmount(cls):
     """
         Return the number of objects.
     """
     return session.query(cls).count()
Example #17
0
 def get_case():
     """
         Get the current case.
     """
     return session.query(Case).first()
Example #18
0
 def get(cls, id):
     """
         Return the object with the given id.
     """
     return session.query(cls).get(id)
Example #19
0
 def getNumEntries(self):
     """
         Get amount of entries for this group.
     """
     return session.query(Entry).filter_by(group_id=self.id).count()