def __guess_metadata(self, path):
    guess = []
    try:
        keys = self.xtract.extract(path)
        for keyword_type, keyword in keys:
            try:
                predicate = self.__extractor2nepomuk(str(keyword_type))
                if predicate:
                    if predicate == NIE['generator']:
                        # Keep only the leading alphabetic part of the
                        # generator name (e.g. 'Writer' from 'Writer 3.1')
                        sep = 0
                        for ch in keyword:
                            if not ch.isalpha():
                                break
                            sep += 1
                        keyword = keyword[:sep]
                    guess.append((predicate, Literal(str2utf8(keyword))))
                else:
                    self.log.debug("guess_metadata: %s, %s" % (keyword_type, keyword))
            except Exception, error:
                self.log.error(error)
    except Exception, error:
        self.log.error(error)
    return guess
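# __extractor2nepomuk (used above) maps libextractor keyword types to
# Nepomuk predicates but is defined elsewhere. A minimal sketch, assuming
# a plain dict lookup (the entries below are illustrative, not the
# original table):
#
#   def __extractor2nepomuk(self, keyword_type):
#       mapping = {
#           'title':         NIE['title'],
#           'generator':     NIE['generator'],
#           'mimetype':      NIE['mimeType'],
#           'language':      NIE['language'],
#           'creation date': NIE['contentCreated'],
#       }
#       return mapping.get(keyword_type)  # None for unmapped types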
def extract_from_string(self, content):
    data = []
    try:
        title = content[0:100].strip()
    except Exception:
        title = "No title. Change it manually."
    data.append((RDF.type, NFO['Clipboard']))
    data.append((NIE['title'], Literal(str2utf8(title))))
    data.append((NIE['mimeType'], Literal('text/plain')))
    data.append((NIE['mimeMedia'], Literal('text')))
    data.append((NIE['mimeSubtype'], Literal('plain')))
    return data
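# Illustrative use of extract_from_string (names are assumptions:
# 'extractor' stands for an instance of this class):
#
#   data = extractor.extract_from_string('Some pasted text ...')
#   # data now holds the triples describing a clipboard item:
#   #   (RDF.type,           NFO['Clipboard'])
#   #   (NIE['title'],       Literal('Some pasted text ...'))
#   #   (NIE['mimeType'],    Literal('text/plain'))
#   #   (NIE['mimeMedia'],   Literal('text'))
#   #   (NIE['mimeSubtype'], Literal('plain'))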
def __get_html_metadata(self, filename):
    data = []
    self.log.info('Extracting metadata from: %s' % filename)
    log = open('/tmp/vlog.txt', 'w')
    try:
        keys = self.xtract.extract(filename)
        tagdict = {}
        for keyword_type, keyword in keys:
            if keyword_type == 'keywords':
                keyword = keyword.replace(' ', ',')
                for label in keyword.split(','):
                    # Collapse inner whitespace and normalize the tag
                    tag = str(''.join(label.split()))
                    tag = tag.strip()
                    tag = str2utf8(tag)
                    if len(tag) < 3:
                        continue
                    tagid = self.__lookup_tag_id(tag)
                    if not tagid:
                        try:
                            tagid = tagdict[tag]
                        except KeyError:
                            vres = self.app.gui.factory.new_resource()
                            tagid = vres.get_id()
                            tagdict[tag] = tagid
                    else:
                        self.log.info("Reusing tag: %s" % tag)
                    data.append((NAO['hasTag'], URIRef(tagid)))
                lvres = []
                for tag in tagdict:
                    tagid = tagdict[tag]
                    self.log.debug("New tag: %s" % tag)
                    vres = self.app.gui.factory.new_resource(tagid, True)
                    metadata = []
                    metadata.append((RDF.type, PIMO['Tag']))
                    metadata.append((NIE['title'], Literal(tag)))
                    vres.set_label(tag)
                    vres.set_type(PIMO['Tag'])
                    vres.set_metadata(metadata)
                    lvres.append(vres)
                self.app.gui.factory.transport(lvres)
            elif keyword_type == 'title':
                title = keyword.strip()
                title = title.replace('\n', '')
                title = title.replace('\t', '')
                title = str2utf8(title)
                data.append((NIE['title'], Literal(title)))
            elif keyword_type in ['creator', 'author', 'last saved by', 'artist']:
                # Creator of a data object: the entity primarily
                # responsible for the creation of its content
                creator = str2utf8(keyword.strip())
                data.append((NCO['creator'], Literal(creator)))
            elif keyword_type == 'format':
                # e.g. 'PDF 1.4'; intentionally ignored
                pass
            elif keyword_type in ['subject', 'description', 'comment']:
                subject = str2utf8(keyword.strip())
                data.append((NIE['subject'], Literal(subject)))
            elif keyword_type == 'language':
                # Language the InformationElement is expressed in.
                # Applies to the data object in its entirety. If the
                # object is divisible into parts expressed in multiple
                # languages, more specific properties should be used.
                # Users are encouraged to use the two-letter codes
                # specified in RFC 3066.
                lang = str2utf8(keyword.strip())
                data.append((NIE['language'], Literal(lang)))
            elif keyword_type in ['generator', 'producer', 'software', 'publisher']:
                # Software used to generate the contents,
                # e.g. a word processor name
                generator = str2utf8(keyword.strip())
                data.append((NIE['generator'], Literal(generator)))
            elif keyword_type == 'character count':
                # The number of characters in the document
                charc = str2utf8(keyword.strip())
                data.append((NFO['characterCount'], Literal(charc)))
            elif keyword_type == 'line count':
                # The number of lines in a text document
                linec = str2utf8(keyword.strip())
                data.append((NFO['lineCount'], Literal(linec)))
            elif keyword_type == 'word count':
                # The number of words in a text document
                wordc = str2utf8(keyword.strip())
                data.append((NFO['wordCount'], Literal(wordc)))
            elif keyword_type == 'creation date':
                # When the content was created
                data.append((NIE['contentCreated'], Literal(str2utf8(keyword))))
            elif keyword_type == 'mimetype':
                data.append((NIE['mimeType'], Literal(str2utf8(keyword))))
            elif keyword_type == 'track number':
                data.append((NID3['trackNumber'], Literal(str2utf8(keyword))))
            elif keyword_type == 'album':
                data.append((NID3['albumTitle'], Literal(str2utf8(keyword))))
            elif keyword_type in ['genre', 'content type']:
                data.append((NID3['genre'], Literal(str2utf8(keyword))))
            elif keyword_type == 'year':
                data.append((NID3['recordingYear'], Literal(str2utf8(keyword))))
            elif keyword_type == 'disc number':
                data.append((NID3['discNumber'], Literal(str2utf8(keyword))))
            elif keyword_type == 'camera model':
                data.append((NEXIF['cameraModel'], Literal(str2utf8(keyword))))
            elif keyword_type == 'camera make':
                data.append((NEXIF['cameraMaker'], Literal(str2utf8(keyword))))
            elif keyword_type == 'aperture':
                data.append((NEXIF['apertureValue'], Literal(str2utf8(keyword))))
            elif keyword_type == 'exposure':
                data.append((NEXIF['exposureValue'], Literal(str2utf8(keyword))))
            elif keyword_type == 'exposure bias':
                data.append((NEXIF['exposureBiasValue'], Literal(str2utf8(keyword))))
            elif keyword_type == 'exposure mode':
                data.append((NEXIF['exposureMode'], Literal(str2utf8(keyword))))
            elif keyword_type == 'iso speed':
                data.append((NEXIF['isoSpeed'], Literal(str2utf8(keyword))))
            elif keyword_type == 'focal length':
                data.append((NEXIF['focalLength'], Literal(str2utf8(keyword))))
            elif keyword_type == 'flash':
                data.append((NEXIF['flash'], Literal(str2utf8(keyword))))
            elif keyword_type == 'metering mode':
                data.append((NEXIF['meteringMode'], Literal(str2utf8(keyword))))
            elif keyword_type == 'orientation':
                data.append((NEXIF['orientation'], Literal(str2utf8(keyword))))
            elif keyword_type == 'size':
                # Image dimensions, e.g. '800x600'
                try:
                    if 'x' in keyword:
                        width = keyword[:keyword.find('x')]
                        height = keyword[keyword.find('x') + 1:]
                        data.append((NEXIF['width'], Literal(str2utf8(width))))
                        data.append((NEXIF['height'], Literal(str2utf8(height))))
                except Exception:
                    pass
            else:
                # Record unhandled keyword types for later inspection
                log.write('KType: %s\t\t%s\t\t%s\n' % (keyword_type, keyword, filename))
        log.close()
        # Commit new resources before returning the metadata
        self.app.store.do_commit()
        return data
    except Exception, error:
        self.log.error("get_html_metadata: %s" % error)
        return data
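# str2utf8 is used throughout this module but defined elsewhere. A
# minimal sketch of what it is assumed to do (coerce arbitrary extractor
# output to a UTF-8 byte string in Python 2, replacing undecodable
# bytes); an illustration, not the original helper:
#
#   def str2utf8(text):
#       if isinstance(text, unicode):
#           return text.encode('utf-8')
#       return text.decode('utf-8', 'replace').encode('utf-8')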
def extract_from_network(self, thing, vuri=None):
    data = []
    if thing.startswith('www.'):
        thing = 'http://' + thing

    # 1. Parse the URL
    # RFC 1738: <scheme>://<user>:<password>@<host>:<port>/<url-path>;<params>?<query>#<fragment>
    parsed_url = urlparse(thing)
    url = parsed_url.geturl()
    self.log.debug("Got URL: %s" % url)

    # 2. Check whether it is already in the knowledge base
    link_id = self.app.gui.ask.get_link_id(url)
    if link_id:
        return []

    # Collect metadata about the URL components themselves
    status = self.__get_status_code(url)
    data.append((NFO['fileStatus'], Literal(str(status))))
    data.append((NFO['fileScheme'], Literal(parsed_url.scheme)))
    if parsed_url.port:
        data.append((NFO['filePort'], Literal(parsed_url.port)))
    if parsed_url.fragment:
        data.append((NFO['fileFragment'], Literal(parsed_url.fragment)))
    if parsed_url.netloc:
        data.append((NFO['fileNetloc'], Literal(parsed_url.netloc)))
    if parsed_url.params:
        data.append((NFO['fileParams'], Literal(parsed_url.params)))
    if parsed_url.path:
        data.append((NFO['filePath'], Literal(parsed_url.path)))
    if parsed_url.query:
        data.append((NFO['fileQuery'], Literal(parsed_url.query)))
    #TODO: add more metadata from tags if it is a webpage or feed

    # 3. Download the URL content to a temporary file
    self.log.info('Downloading %s' % url)
    try:
        output_file = LPATH['ROOT'] + tempfile.mktemp()
        fp = open(output_file, 'wb')
        c = pycurl.Curl()
        c.setopt(c.URL, url)
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        c.setopt(pycurl.CONNECTTIMEOUT, 30)
        c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.WRITEDATA, fp)
        c.perform()
        stsize = int(c.getinfo(c.SIZE_DOWNLOAD))
        self.log.info("Document size: %d bytes" % stsize)
        c.close()
        fp.close()

        # 4. Get the mimetype
        fmt = None
        try:
            fmt = self.magic.file(output_file)
            data.append((NIE['mimeType'], Literal(fmt)))
            self.log.info(fmt)
        except Exception, error:
            self.log.error(error)

        if fmt == 'text/html':
            data.append((RDF.type, NFO['Website']))
            data.append((NFO['fileUrl'], Literal(str2utf8(url))))
            data.append((NFO['fileHostname'], Literal(str2utf8(parsed_url.hostname))))
            self.log.debug('Website: %s (%s)' % (url, parsed_url.hostname))
            try:
                data += self.__get_html_metadata(output_file)
            except Exception, error:
                self.log.error(error)
            self.log.info('Data: %s' % len(data))
            hasTags = False
            hasTitle = False
            for p, o in data:
                if p == NAO['hasTag']:
                    hasTags = True
                elif p == NIE['title']:
                    hasTitle = True
            if not hasTags:
                data.append((NAO['hasTag'], SYSRES['no-tags']))
            if not hasTitle:
                data.append((NIE['title'], Literal(str2utf8(url))))
        elif fmt == 'application/xml':
            res = feedparser.parse(output_file)
            if res.bozo == 0:
                data.append((RDF.type, NFO['Feed']))
                data.append((NFO['fileUrl'], Literal(str2utf8(url))))
                data.append((NFO['fileHostname'], Literal(str2utf8(parsed_url.hostname))))
                data.append((NIE['title'], Literal(str2utf8(res.feed.title))))
            else:
                data.append((RDF.type, NFO['Website']))
                data.append((NFO['fileUrl'], Literal(str2utf8(url))))
                data.append((NFO['fileHostname'], Literal(str2utf8(parsed_url.hostname))))
                data += self.__get_html_metadata(output_file)
                hasTags = False
                hasTitle = False
                for p, o in data:
                    if p == NAO['hasTag']:
                        hasTags = True
                    elif p == NIE['title']:
                        hasTitle = True
                if not hasTags:
                    data.append((NAO['hasTag'], SYSRES['no-tags']))
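# __get_status_code (called at the top of extract_from_network) is not
# shown in this section. A minimal sketch, assuming it performs a
# body-less pycurl request and returns the HTTP status code; an
# illustration, not the original implementation:
#
#   def __get_status_code(self, url):
#       c = pycurl.Curl()
#       c.setopt(c.URL, url)
#       c.setopt(pycurl.NOBODY, 1)           # headers only, no body
#       c.setopt(pycurl.FOLLOWLOCATION, 1)
#       c.setopt(pycurl.CONNECTTIMEOUT, 30)
#       c.perform()
#       status = c.getinfo(pycurl.RESPONSE_CODE)
#       c.close()
#       return status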