class Parrott:
    '''
    A Tweet classifier built from a Naive Bayes brain
    and a Memory of stored examples.
    '''

    def __init__(self):
        '''
        Births a new Parrott, with a Naive Bayes brain
        and a Solr memory.
        '''
        self.brain = naive_bayes.NaiveBayes()
        self.twitter = membrane.twitter.api()
        self.memory = Memory()

    def train(self):
        '''
        Trains the Parrott on examples
        stored in Memory (Solr)
        '''
        # Fetch 1000 positive and negative examples.
        pos_tweets = self.memory.recall_positive(0, 1000)
        neg_tweets = self.memory.recall_negative(0, 1000)

        # Pull out the proper text for training data:
        # the Tweet text and the user name, joined by a space.
        pos = [' '.join([tweet['tweet'], tweet['user']]) for tweet in pos_tweets]
        neg = [' '.join([tweet['tweet'], tweet['user']]) for tweet in neg_tweets]

        # Train the brain.
        self.brain.train(pos, neg)

    def test(self, pos_set, neg_set, threshold=0.8):
        '''
        Tests the Parrott on a set of
        positive and negative examples.

        Args:
            pos_set (list): positive test examples
            neg_set (list): negative test examples
            threshold (float): testing threshold

        Returns:
            dict of test results.
        '''
        # Note the argument order: the brain takes the threshold first.
        return self.brain.test(threshold, pos_set, neg_set)

    def classify(self, tweet):
        '''
        Classifies a Tweet.

        Args:
            tweet (string): the Tweet content

        Returns:
            probability of positive classification.
        '''
        return self.brain.classify(tweet)

    def audit(self, tweet, pos):
        '''
        Deletes an existing Tweet from Memory and
        replaces it with the updated Tweet.

        The passed-in dict is modified in place: 'positive' and 'audited'
        are set, and 'id' and '_version_' are removed if present.

        Args:
            tweet (dict): the Tweet to audit
            pos (bool): is the Tweet a positive example?
        '''
        # Solr cannot "update", just delete & add.
        self.memory.forget(tweet)

        # Update the tweet.
        tweet.update({'positive': pos, 'audited': True})

        # Clear id and _version_ to prevent conflicts.
        # pop() with a default instead of del: by this point the Tweet has
        # already been forgotten, so a KeyError on a missing key would mean
        # it is never re-memorized.
        tweet.pop('id', None)
        tweet.pop('_version_', None)

        # Re-memorize.
        self.memory.memorize(tweet)
class Importer(Thread):
    '''
    Background thread that finds media files, groups them by shot date
    and copies them into <dest>/<year>/<year>_<month>_<day> directories
    (or, when opts.forget is set, forgets previously imported files).

    Progress is reported through the frame's logger() and twiddle()
    callbacks, scheduled with wx.CallAfter.
    '''

    # Passed to idirs() when scanning the source directories
    # (presumably directory names / patterns to skip -- see idir.idirs).
    skip_dirs = ['Originals', 'Thumb', re.compile(r'^\..*')]

    def __init__(self, frame, sources, opts):
        '''
        Splits sources into directories and files and starts the thread
        if at least one destination directory is configured.

        Args:
            frame: object providing logger() and twiddle() callbacks
            sources (list): paths of directories and/or files to import;
                entries that are neither are ignored
            opts: options object; this class reads dest_dirs, verbosity,
                dry_run, forget and skip_already_imported
        '''
        Thread.__init__(self)
        self.started_at = time.time()
        self.interrupt = Event()
        self.frame = frame
        self.opts = opts
        self.source_dirs = []
        self.source_files = []
        for s in sources:
            if os.path.isdir(s):
                self.source_dirs.append(s)
            elif os.path.isfile(s):
                self.source_files.append(os.path.abspath(s))
        self.dest_dirs = parse_dest_dirs(opts.dest_dirs)
        self.already_imported = Memory()
        self.__msg("%sImporting media from %s" % (("" if self.dest_dirs else "Not "), ", ".join(self.source_dirs + self.source_files)))
        # Without a destination there is nothing to do, so never start.
        if self.dest_dirs:
            self.start()

    def __msg(self, s, min_verbosity = 1):
        '''
        Logs s through the frame if opts.verbosity >= min_verbosity.

        Raises UserWarning if an interrupt has been requested. If the
        frame is already dead, sets the interrupt flag instead of failing.
        '''
        try:
            self.__service_interrupt()
            if self.opts.verbosity >= min_verbosity:
                wx.CallAfter(self.frame.logger, s)
        except wx.PyDeadObjectError:
            self.interrupt.set()

    def __dmsg(self, s, min_verbosity = 1):
        '''
        Like __msg, but prefixes the message with "NOT " on a dry run.
        '''
        if self.opts.dry_run:
            self.__msg("NOT %s" % (s,), min_verbosity)
        else:
            self.__msg(s, min_verbosity)

    def __twiddle(self, mode):
        '''
        Forwards a progress mode (0 start, 1 advance, 2 complete) to the
        frame when opts.verbosity > 1. Raises UserWarning on interrupt.
        '''
        self.__service_interrupt()
        if self.opts.verbosity > 1:
            wx.CallAfter(self.frame.twiddle, mode)

    def __start(self):
        '''Signals the start of a progress phase.'''
        self.__twiddle(0)

    def __advance(self):
        '''Signals one step of progress.'''
        self.__twiddle(1)

    def __complete(self):
        '''Signals the end of a progress phase.'''
        self.__twiddle(2)

    def __service_interrupt(self):
        '''
        Raises UserWarning if the interrupt event is set, which unwinds
        the worker out of whatever it is doing.
        '''
        if self.interrupt.is_set():
            raise UserWarning("Raise Thread Quitting")

    def __find_media(self):
        '''
        Collects MediaMetadata objects for every media file found in the
        source directories and source files.

        Returns:
            an iterable of MediaMetadata; a list, or a lazy generator
            when already-imported files are being filtered out.
        '''
        from idir import idirs
        import itertools

        def desc_dirs(action, tup):
            # Progress callback handed to idirs().
            if action == "dir":
                (dirpath, fcount, dcount) = tup
                fdets = "%d file%s" % (fcount, ("" if fcount == 1 else "s"))
                ddets = "%d sub-director%s" % (dcount, ("y" if dcount == 1 else "ies"),)
                if fcount > 0:
                    if dcount > 0:
                        para = " (%s & %s)" % (fdets, ddets)
                    else:
                        para = " (%s)" % (fdets,)
                elif dcount > 0:
                    para = " (%s)" % (ddets,)
                else:
                    para = ""
                self.__msg("Scanning directory %s%s" % (dirpath, para), 2)
            elif action == "skip":
                # NOTE(review): the original "(skipped) = tup" is a plain
                # assignment, not a 1-tuple unpack; confirm what idirs()
                # passes for "skip".
                skipped = tup
                self.__msg("Skipping dir %s" % (skipped,), 2)

        self.__start()
        paths = idirs(self.source_dirs, desc_dirs, Importer.skip_dirs)
        if self.source_files:
            paths = itertools.chain(self.source_files, paths)
        # Built eagerly, so the scan (and its progress messages) finishes
        # before __complete() is signalled below.
        mds = [MediaMetadata(path) for path in paths
               if MediaMetadata.has_media_suff(path)]
        if self.opts.skip_already_imported and self.opts.forget is False:
            # Left lazy: digests are only computed when the caller
            # actually consumes the result.
            mds = (md for md in mds
                   if not self.already_imported.known(md.digest()))
        self.__complete()
        return mds

    def __examine_media(self, mds):
        '''
        Groups media by shot date.

        Args:
            mds: iterable of MediaMetadata-like objects

        Returns:
            (by_date, date_count): by_date maps each date to a list of
            (dirname, path, md) tuples; date_count is the number of
            files that had a date. Files without a date are left out.
        '''
        # Now examine their EXIF data, if we can
        media_details = list(mds)
        media_count = len(media_details)
        by_date = {}
        date_count = 0
        self.__msg("Found %s file%s (Not already inspected), now getting shot date info" % (media_count, ("" if media_count == 1 else "s")))
        self.__start()
        for md in media_details:
            self.__advance()
            date = md.get_date()
            if date:
                date_count += 1
                by_date.setdefault(date, []).append((md.dirname, md.path, md))
        self.__complete()
        return (by_date, date_count)

    def __runner(self):
        '''
        Does the actual work of the thread: finds media, then either
        forgets or imports it, date by date in ascending order.
        '''
        (media, date_count) = self.__examine_media(self.__find_media())
        self.__msg("Found shot date info of %s file%s" % (date_count, ("" if date_count == 1 else "s")))

        # Running file counter; read by desc_copies at call time.
        n = 0

        def desc_copies(action, tup):
            # Progress callback handed to MediaMetadata.copy_to().
            # Any action other than "import"/"already" is not reported.
            (md, dest) = tup
            kind = md.kind()
            src = md.path
            mention_dest = (" to %s" % (dest,)) if self.opts.verbosity > 1 else ""
            if action == "import":
                self.__dmsg("Importing %s [file %d of %d] %s%s" % (kind, n, date_count, src, mention_dest))
            elif action == "already":
                self.__msg("%s [file %d of %d] %s already imported%s" % (kind, n, date_count, src, mention_dest))

        for date in sorted(media):
            (year, month, day) = date
            files = media[date]
            YY = '%02d' % year
            date_dir = '%s_%02d_%02d' % (YY, month, day)
            if self.opts.forget:
                for (dirpath, fname, src_md) in files:
                    hexdigest = src_md.digest()
                    if self.already_imported.known(hexdigest):
                        self.__dmsg("Forgetting %s" % (os.path.join(dirpath, fname),))
                        if not self.opts.dry_run:
                            self.already_imported.forget(hexdigest)
                self.already_imported.commit()
            else:
                for (dirpath, fname, src_md) in files:
                    n += 1
                    for dest_dir in self.dest_dirs:
                        d = os.path.join(dest_dir, YY, date_dir)
                        if src_md.copy_to(d, self.opts.dry_run, desc_copies):
                            # NOTE(review): unlike the forget branch, no
                            # commit() follows remember(); confirm that
                            # Memory persists this on its own.
                            self.already_imported.remember(src_md.digest())
        tdelta = time.time() - self.started_at
        self.__msg("All done in %.01f second%s!" % (tdelta, "" if tdelta == 1 else "s"))

    def run(self):
        '''
        Thread entry point. Runs the import and reports any exception
        (including the UserWarning raised on interrupt) on stdout
        instead of letting it escape the thread.
        '''
        from traceback import print_exc
        try:
            self.__runner()
        except Exception as e:
            print_exc()
            print("Exception %s" % (str(e),))