def add_content_features(featuredict, content):
    """Add features from note content.

    Derive the following features from note content and add them to the
    featuredict with binary values:

    CONTENT-TOKEN-<token>: Set for each unique, lower-cased token in the
        text of the note content (not including markup).
    CONTENT-MEDIA-<mimetype>: Set for each mimetype used for media in the
        note. <en-media> elements with a missing or empty "type"
        attribute are skipped.
    CONTENT-HASLINK: Set if the note contains one or more <a> elements
        carrying an href attribute.
    CONTENT-LINK-<domain>: Set with the network location of each link in
        the note, when the href has one.
    CONTENT-TODO: Set if the note contains a todo (<en-todo> element).

    Args:
        featuredict: A dict. It is updated in place.
        content: File-like object containing the note content.

    Returns:
        None.
    """
    parser = etree.HTMLParser()
    root = etree.parse(content, parser).getroot()
    if root is None:
        # NOTE(review): presumably lxml can hand back a tree without a root
        # element for content that holds no markup -- confirm. Without this
        # guard the xpath() call below would fail with AttributeError.
        return

    # string() yields the concatenated text of the document, so the tokens
    # never include tag or attribute markup.
    string_content = unicode(root.xpath('string()'))
    for token in Tokeniser.split(string_content):
        featuredict["CONTENT-TOKEN-%s" % token.lower()] = 1

    for media in root.iterfind(".//en-media"):
        mimetype = media.get("type")
        # get() returns None when the attribute is absent; formatting that
        # would create a meaningless "CONTENT-MEDIA-None" feature.
        if mimetype:
            featuredict["CONTENT-MEDIA-%s" % mimetype] = 1

    for link in root.iterfind(".//a"):
        url = link.get("href")
        if url is None:
            # An anchor without an href is not counted as a link.
            continue
        featuredict["CONTENT-HASLINK"] = 1
        netloc = urlparse(url).netloc
        # Relative or scheme-less hrefs have no network location.
        if netloc:
            featuredict["CONTENT-LINK-%s" % netloc] = 1

    if root.find(".//en-todo") is not None:
        featuredict["CONTENT-TODO"] = 1
def add_metadata_features(featuredict, note):
    """Add features from note metadata.

    Inspect the Note and record each of the features below in featuredict
    with the value 1:

    META-TITLETOKEN-<token>: One per lower-cased token of the note title.
    META-URL-<domain>: The network location of the note's source URL, when
        the URL has one.
    META-HASURL: Present when the note has a source URL.
    META-HASLOCATION: Present when the note's latitude is not None.
    META-SOURCE-<source>: The note's source, when it is set.
    META-PLACE-<place>: The note's place name, when it is set.
    META-CONTENTCLASS-<class>: The note's content class, when it is set.

    Args:
        featuredict: A dict. It is updated in place.
        note: Note object. Its title is decoded as UTF-8.
    """
    title = unicode(note.title, encoding="utf-8")
    for word in Tokeniser.split(title):
        featuredict["META-TITLETOKEN-%s" % word.lower()] = 1

    attrs = note.attributes

    source_url = attrs.sourceURL
    if source_url:
        domain = urlparse(source_url).netloc
        if domain:
            featuredict["META-URL-%s" % domain] = 1
        # The URL flag is recorded even when no domain could be extracted.
        featuredict["META-HASURL"] = 1

    if attrs.latitude is not None:
        featuredict["META-HASLOCATION"] = 1

    # Optional string attributes: each contributes one feature when truthy.
    optional_fields = (
        ("META-SOURCE-%s", "source"),
        ("META-PLACE-%s", "placeName"),
        ("META-CONTENTCLASS-%s", "contentClass"),
    )
    for template, field in optional_fields:
        value = getattr(attrs, field)
        if value:
            featuredict[template % value] = 1
def test_muliline(self):
    # A newline between words must separate tokens just like a space does.
    expected = ["hi", "there", "second", "line"]
    result = Tokeniser.split("hi there\nsecond line")
    self.assertEqual(result, expected)
def test_unicode(self):
    # A non-ASCII letter must stay inside its word instead of splitting it.
    expected = ["hi", u'theré']
    result = Tokeniser.split(u'hi theré')
    self.assertEqual(result, expected)
def test_punctuation(self):
    # Punctuation comes out as its own token; a run of dots stays together.
    expected = ["hi", "there", ",", "you", "..."]
    result = Tokeniser.split("hi there, you ...")
    self.assertEqual(result, expected)
def test_currency(self):
    # A currency amount such as "$100" must be kept as a single token.
    expected = ["hi", "there", "$100", "man"]
    result = Tokeniser.split("hi there $100 man")
    self.assertEqual(result, expected)