def getTop(num=100, highlight_funcs=[], remove_funcs=[]): """ Get the top [amount] filtered URLS for a case. This has been optimised as one query with a subquery. """ filter = getFilter(highlight_funcs, remove_funcs) url2 = aliased(URL) subq = session.query(url2.domain, func.count(url2.domain).label('domain_count'))\ .join(filter).filter(url2.domain != None)\ .group_by(url2.domain).order_by(desc(func.count(url2.domain))) if num != "all": subq = subq.limit(num) subq = subq.subquery() q = session.query(URL.netloc, func.count(1), subq.c.domain, subq.c.domain_count)\ .join((subq, URL.domain == subq.c.domain))\ .join(filter)\ .group_by(URL.netloc)\ .order_by(desc(subq.c.domain_count), asc(subq.c.domain), asc(URL.netloc)) domains = [] for netloc, netloc_count, domain, domain_count in q: if len(domains) == 0 or domains[-1][0] != domain: domains.append((domain, domain_count, [])) domains[-1][2].append((netloc, netloc_count)) return domains
def getAll(cls): """ Return all the objects. """ try: q = session.query(cls).order_by(asc(cls.id)) except AttributeError: q = session.query(cls).order_by(asc(cls.entry_id)) return q
def filesAccessed(): """ Returns a file directory tree and the total amount of files accessed. The tree is a dictionary of drives, each drive letter is the key and the value is another dictionary with the keys being file types. For each file type, the value is a tree structure with the end point being a file name, the number of times the file was accessed and a list of access dates. .. note:: This does not allow for Linux etc drives, only Windows one letter drives. **ToDo**: re-optimise this query, more things were done making it slow again for larger datasets. **ToDo**: Allow other operating systems file systems. """ q = session.query(URL, func.count(1)).filter(URL.scheme == "file").order_by(func.count(1)) q = q.group_by(URL.path) total = q.count() drives = {} for file, count in q: path = urllib.unquote(file.path[1:]) # hack to get dates working: needs to be made into above query ^ dates = session.query(Entry.access_date, Entry.access_time).join(URL)\ .filter(URL.path==file.path)\ .order_by(desc(Entry.access_date), desc(Entry.access_time)) if path[1] == ":": drive = path[0] else: continue #don't deal with Linux yet ending = path.rsplit('.',1) type = FILE_TYPES.get(ending[-1].strip().lower(),'other file') if drive not in drives: drives[drive] = ({}, 0) if type not in drives[drive][0]: drives[drive][0][type] = ({}, 0) drives[drive] = (drives[drive][0], drives[drive][1] + count) drives[drive][0][type] = (drives[drive][0][type][0], drives[drive][0][type][1] + count) Entry._buildTree(drives[drive][0][type][0], (path, count, dates)) return drives, total
def peakTime(): """ Returns the peak time of web browser usage. Similar to the heatmap, although doesn't do for each day of the week - sums up each hour and returns the highest. """ step = 1 init = datetime(1, 1, 1) start_time = init.time() until_time = None highest = 0 timeperiod = None while(start_time < t(24-step,0,0)): start_time = init.time() until_time = (init + timedelta(hours=step) - timedelta(milliseconds=1)).time() q = session.query(Entry).filter(Entry.access_time >= start_time)\ .filter(Entry.access_time <= until_time).count() if q > highest: highest = q timeperiod = start_time.strftime('%H:%M - ') + until_time.strftime('%H:%M') init = init + timedelta(hours=step) return timeperiod
def getAllEntries(case): """ Returns all the entries for this case where the access time and date are not `None`. """ q = session.query(Entry).filter(Entry.access_time!=None).filter(Entry.access_date!=None).\ join('group').filter_by(case_id=case.id) return q
def setDomain_manual(group): """ Manually set the domain names. Used in the optimised adding of URLs which bypasses creating URL objects using `__init__()` """ q = session.query(URL).join('entry').filter_by(group_id=group.id).all() for row in q: row.setDomain()
def getEndDate(self): """ Get the ending date of all the group's entries. """ return self._formatDate(session.query(Entry).filter_by(group_id=self.id).\ filter(Entry.access_date!=None).filter(Entry.access_time!=None).\ order_by(desc(Entry.access_date), desc(Entry.access_time)).\ first().access_date)
def onlineSearches(engine, highlight_funcs=[], remove_funcs=[], num=20): """ Given a search engine, amount and filters, returns a word cloud dictionary of search terms. The keys are the terms and the values are a (ratio, search phrases term was in, length of search phrases) tuple. Also returns total amount of terms, total number of unique terms and the smallest ratio (used to make the sizes of the words relative to smallest). """ filter = getFilter(highlight_funcs, remove_funcs) searchterms2 = aliased(SearchTerms) subq = session.query(searchterms2.id).filter(searchterms2.engine==engine)\ .join(searchterms2.entries).join((filter, Entry.id == filter.c.id))\ .group_by(searchterms2.id)\ .order_by(desc(searchterms2.occurrence)) if num != "all": subq = subq.limit(num) subq = subq.subquery() q = session.query(SearchTerms)\ .join((subq, subq.c.id == SearchTerms.id))\ .join(SearchTerms.entries)\ .join((filter, Entry.id == filter.c.id))\ .join(Entry.parsedurl).filter(SearchTerms.engine==engine)\ .options(contains_eager('entries'), contains_eager('entries.parsedurl'))\ .order_by(desc(Entry.access_date), desc(Entry.access_time)) terms = q.all() total_terms = float(sum(len(t.entries) for t in terms)) unique_terms = len(terms) cloud = {} smallest = 1.0 for term in terms: search_strings = [(entry.parsedurl.search, entry.timeline_date) for entry in term.entries] ratio = len(search_strings)/total_terms # Hack to stop javascript name-space clash. TODO: Fix this in nicer way. cloud['_' + term.term] = (ratio, search_strings, len(search_strings)) if ratio < smallest: smallest = ratio return cloud, total_terms, unique_terms, smallest
def getPercentages(): """ Used in the overview statistics. Returns a list of `(browser object, percent)` tuples for each browser where percent is the percentage this browser is used in all the entries. """ q = session.query(Browser.name, func.count(1)).join('entries').group_by(Browser.name).all() total = 0.0 for browser, count in q: total = total + count return [(browser, count/total) for browser, count in q]
def averagePages(): """ Returns the average number of websites visited in one day, i.e. only http and http entries. """ q = session.query(Entry.access_date, func.count(1)).join('parsedurl')\ .filter(or_(URL.scheme=="http", URL.scheme=="https")).group_by(Entry.access_date) total = q.count() sums = [row1 for row0, row1 in q] if total != 0: return int(sum(sums)/float(total)) else: return 0
def getTimeGraph(startdate, enddate, starttime, endtime, remove_funcs, highlight_funcs, remove_duplicates=False, duplicate_time=0): """ Given a start and end date and time and filters, returns three lists of entry points: `highlighted`, `not_highlighted` and `removed` by calling `filter_queries()` Each entry will be in one of those lists for Flot to draw. """ # make the queries # ---------------- q = session.query(Entry.access_date, Entry.access_time, Entry.url, Browser.name, Browser.version, Browser.source, Group.program, Entry.title) # join onto all the other tables q = q.join('browser').join('parsedurl').join('group').outerjoin(Entry.search_terms) # filter by the start and end dates and times q = q.filter(Entry.access_date >= startdate).filter(Entry.access_date <= enddate) q_removed, q_highlighted, q_not_highlighted = Case.filter_queries(q, remove_funcs, highlight_funcs) removed = (q_removed.order_by(asc(Entry.access_date), asc(Entry.access_time)), []) highlighted = (q_highlighted.order_by(asc(Entry.access_date), asc(Entry.access_time)), []) not_highlighted = (q_not_highlighted.order_by(asc(Entry.access_date), asc(Entry.access_time)), []) # put the results in the format Flot wants it # ------------------------------------------- for (q, entries) in [removed, highlighted, not_highlighted]: seen = set() for cols in q: plot = get_plotable(*cols) if remove_duplicates == True: # remove duplicates if plot not in seen: if len(entries) == 0: entries.append(plot) # add first entry regardless seen.add(plot) else: if (entries[-1][0] == plot[0] and \ (plot[1] - entries[-1][1]) >= float(duplicate_time)/float(60)) \ or (entries[-1][0] < plot[0]): # only add entries <duplicate_time> minutes apart entries.append(plot) seen.add(plot) else: # else add everything entries.append(plot) return highlighted[1], not_highlighted[1], removed[1]
def testfilter_queries(self): date = datetime(2010, 5, 1) q = session.query(Entry.url).join('browser').join('parsedurl').\ join('group').outerjoin(Entry.search_terms).\ filter(Entry.access_date==date) rem = FilterQuery() rem.add_element('URL Parts','domain','Is','bbc.co.uk', None) remove_funcs = [rem] high = FilterQuery() high.add_element('URL Parts','domain','Is','google.com',None) highlight_funcs = [high] qrem, qhigh, qnot = Case.filter_queries(q, remove_funcs, highlight_funcs) self.assertEqual(qrem.count(), 4) self.assertEqual(qhigh.count(), 10) self.assertEqual(qnot.count(), 15)
def getFilter(highlight_funcs=[], remove_funcs=[]): """ Get the right filter. This is used to filter domain names and search terms correctly. `Case.filter_queries` is called, and if `highlight_funcs` != [] then return the highlighted query, otherwise return the non-highlighted query. """ filter_q = session.query(Entry.id)\ .join(Entry.parsedurl, Entry.browser, Entry.group)\ .outerjoin(Entry.search_terms) q_removed, q_highlighted, q_not_highlighted = \ Case.filter_queries(filter_q, remove_funcs, highlight_funcs) if highlight_funcs: filter_q = q_highlighted else: filter_q = q_not_highlighted return filter_q.group_by(Entry.id).subquery()
def generateHeatMap(): """ Returns heatmap things for the overview: the table headers, the heatmap table and the highest and lowest values (used to calculate heatmap colour). """ headers = ['Mon', 'Tue','Wed','Thu','Fri','Sat','Sun'] rows = [] step = 1 # number of hours each row in table represents init = datetime(1, 1, 1) start_time = init.time() until_time = None highest = 0 lowest = 0 while(start_time < t((24-step),0,0)): start_time = init.time() until_time = (init + timedelta(hours=step) - timedelta(milliseconds=1)).time() title = start_time.strftime('%H:%M - ') + until_time.strftime('%H:%M') values = [] for weekday in [1,2,3,4,5,6,0]: # '%w' gives weekdays starting on sunday q = session.query(Entry)\ .filter(func.strftime('%w', Entry.access_date) == str(weekday))\ .filter(Entry.access_time >= start_time)\ .filter(Entry.access_time <= until_time).count() if q > highest: highest = q if q < lowest: lowest = q values.append(q) rows.append((title, values)) init = init + timedelta(hours=step) return headers, rows, highest, lowest
def getFilterBy(cls, **vars): """ Return a list of objects which satisfy the conditions in **vars E.g. `Case.getFilterBy(name="Foo")` """ return session.query(cls).filter_by(**vars)
def getAmount(cls): """ Return the number of objects. """ return session.query(cls).count()
def get_case(): """ Get the current case. """ return session.query(Case).first()
def get(cls, id): """ Return the object with the given id. """ return session.query(cls).get(id)
def getNumEntries(self): """ Get amount of entries for this group. """ return session.query(Entry).filter_by(group_id=self.id).count()