def assert_locator(self, sa=None, href=None, opts=None):
    lctr = Locator.fetch((Locator.global_id == href,), _sa=sa, exists=False)
    if not lctr:
        # Reject references that would not fit the global_id column
        if len(href) > 255:
            log.err("Reference too long: %s", href)
            return
        lctr = Locator(
                global_id=href,
                date_added=datetime.now())
        sa.add(lctr)
        if opts.rsr_auto_commit:
            sa.commit()
    if opts.bm_ref_md5:
        self.add_lctr_ref_md5(opts, sa, href)
    yield dict(lctr=lctr)
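# Illustrative only: assert_locator is a generator, so a caller has to
# iterate it for the fetch/create side effects to run. A minimal sketch,
# assuming a hypothetical `handler` instance and an `sa` session and
# `opts` obtained elsewhere:
#
#   for rec in handler.assert_locator(sa=sa, href='http://example.com/', opts=opts):
#       lctr = rec['lctr']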
def cmd_dlcs_import(opts, settings):
    """
    TODO: build into generic import/export (i.e. complete set) so
    heuristics can update all stats on each import, or find some way
    to fragment the dataset.
    """
    importFile = opts.args.FILE
    data = dlcs_parse_xml(open(importFile).read())
    sa = Locator.get_session('default', opts.flags.dbref)
    #sa = model.get_session(opts.flags.dbref, metadata=SqlBase.metadata)
    tags_stat = {}
    domains_stat = {}
    # First pass: validate, track stats and create Locator records where missing
    for post in data['posts']:
        href = post['href']
        dt = datetime.strptime(post['time'], ISO_8601_DATETIME)
        # Validate URL: require a network location (domain) part
        url = urlparse(href)
        domain = url[1]
        if not domain:
            log.std("Ignored domainless (non-net?) URIRef: %s", href)
            continue
        assert re.match(r'[a-z0-9]+(\.[a-z0-9]+)*', domain), domain
        # Get/init Locator
        lctr = Locator.fetch((Locator.ref == href,), exists=False)
        if lctr:
            if lctr.date_added != dt:
                lctr.date_added = dt
                sa.add(lctr)
        else:
            lctr = Locator(
                    global_id=href,
                    ref=href,
                    date_added=dt)
            lctr.init_defaults()
            log.std("new: %s", lctr)
            sa.add(lctr)
        # Get/init Bookmark
        bm = Bookmark.fetch((Bookmark.ref_id == lctr.lctr_id,), exists=False)
        if bm:
            if bm.date_added != dt:
                bm.date_added = dt
                sa.add(bm)
            if bm.ref_id != lctr.lctr_id:
                bm.ref = lctr
                sa.add(bm)
        else:
            # Bookmark names are unique; skip posts whose description is taken
            bm = Bookmark.fetch((Bookmark.name == post['description'],), exists=False)
            if bm:
                log.std("Name already exists: %r", post['description'])
                continue
            bm = Bookmark(
                    ref=lctr,
                    name=post['description'],
                    extended=post['extended'],
                    tags=post['tag'].replace(' ', ', '),
                    date_added=dt)
            bm.init_defaults()
            log.std("new: %s", bm)
            sa.add(bm)
        # Track domain frequency
        if domain in domains_stat:
            domains_stat[domain] += 1
        else:
            domains_stat[domain] = 1
        # Track tag frequency
        for tag in post['tag'].split(' '):
            if tag in tags_stat:
                tags_stat[tag] += 1
            else:
                tags_stat[tag] = 1
    log.std("Checked %i locator references", len(data['posts']))
    sa.commit()

    # Prepare domain stats
    avgDomainFreq = sum(domains_stat.values()) / (len(domains_stat) * 1.0)
    hiDomainFreq = max(domains_stat.values())
    log.std("Found domain usage (max/avg): %i/%i", hiDomainFreq, avgDomainFreq)
    domains = 0
    domainOffset = int(opts.flags.domain_offset)
    if domainOffset == 0:
        domainOffset = hiDomainFreq
    elif domainOffset == -1:
        domainOffset = round(hiDomainFreq * 0.2)
    log.std("Setting domain-offset: %i", domainOffset)
    # Get/init Domains at or above the frequency offset
    for domain in domains_stat:
        freq = domains_stat[domain]
        if freq >= domainOffset:
            domains += 1
            domain_record = Domain.fetch((Domain.name == domain,), exists=False)
            if not domain_record:
                domain_record = Domain(name=domain)
                domain_record.init_defaults()
                sa.add(domain_record)
    sa.commit()
    log.std("Checked %i domains", len(domains_stat))
    log.std("Tracking %i domains", domains)

    # Prepare tag stats
    avgFreq = sum(tags_stat.values()) / (len(tags_stat) * 1.0)
    hiFreq = max(tags_stat.values())
    log.std("Found tag usage (max/avg): %i/%i", hiFreq, avgFreq)
    tagOffset = int(opts.flags.tag_offset)
    if tagOffset == 0:
        tagOffset = hiFreq
    elif tagOffset == -1:
        tagOffset = round(hiFreq * 0.1)
    log.std("Setting tag-offset: %i", tagOffset)
    # Get/init Tags at or above the frequency offset
    tags = 0
    for tag in tags_stat:
        freq = tags_stat[tag]
        if not re.match(r'[A-Za-z0-9-]+', tag):
            log.std("Non-std tag %s", tag)
        if freq >= tagOffset:
            tags += 1
            t = Node.fetch((Node.name == tag,), exists=False)
            if not t:
                t = Tag(name=tag)
                t.init_defaults()
                log.std("new: %s", t)
                sa.add(t)
            # store frequencies
            # TODO tags_freq
    log.std("Checked %i tags", len(tags_stat))
log.std("Tracking %i tags", tags) sa.commit()