def generate_concordances_machinetags(concordances):
    """Flatten the magic_8s() expansions of every concordance into one list.

    Each item of *concordances* is wrapped in ``machinetag.machinetag`` and
    whatever its ``magic_8s()`` call yields is appended, in input order.
    Duplicates are not removed.
    """
    return [
        chunk
        for concordance in concordances
        for chunk in machinetag.machinetag(concordance).magic_8s()
    ]
def generate_concordances_machinetags_hierarchy(concordances):
    """Return one "namespace/predicate/value" string per concordance.

    Each item of *concordances* is wrapped in ``machinetag.machinetag`` and
    its namespace, predicate and value are joined with slashes. The result
    preserves input order and is not de-duplicated.
    """
    # Lazy generator: each tag is parsed right before its path is formatted.
    parsed_tags = (machinetag.machinetag(item) for item in concordances)

    return [
        "%s/%s/%s" % (tag.namespace(), tag.predicate(), tag.value())
        for tag in parsed_tags
    ]
def import_links(options):
    """Load a JSON bookmarks export and index its entries in Solr.

    Reads the following attributes from *options*:

    * ``solr``     -- passed to ``pysolr.Solr`` to create the client.
    * ``purge``    -- when truthy, every existing document is deleted first.
    * ``pinboard`` -- path of the JSON file to import.

    Each entry of the JSON document is modified in place before indexing:

    * ``tags`` (a space-separated string) is replaced by a list of tags.
    * ``machinetags`` / ``machinetags_hierarchy`` are added when at least
      one tag is recognised by ``machinetag.machinetag(...).is_machinetag()``.
    * ``shared`` and ``toread`` are converted from ``'yes'``/other to bool.
    * An empty ``description`` falls back to ``href``.
    * ``hostname`` is added from the parsed ``href`` with a single leading
      ``www.`` removed, when the URL has a hostname.

    Documents are sent to Solr in batches, and ``solr.optimize()`` is called
    once everything has been added. Returns ``None``.
    """
    batch_size = 1000
    www_prefix = "www."

    solr = pysolr.Solr(options.solr)

    if options.purge:
        logging.info("purging all existing bookmarks...")
        solr.delete(q='*:*')

    # Use a context manager so the export file is closed even when the JSON
    # fails to parse (the handle used to be left open).
    with open(options.pinboard, 'r') as fh:
        data = json.load(fh)

    docs = []

    for doc in data:

        tags = []
        machinetags = []
        machinetags_hierarchy = []

        for t in doc['tags'].split(' '):

            tags.append(t)

            mt = machinetag.machinetag(t)

            if not mt.is_machinetag():
                continue

            # Keep first-seen order while skipping chunks already recorded.
            for chunk in mt.magic_8s():
                if chunk not in machinetags:
                    machinetags.append(chunk)

            hier = [mt.namespace(), mt.predicate(), mt.value()]
            machinetags_hierarchy.append("/".join(map(unicode, hier)))

        if tags:
            doc['tags'] = tags

        if machinetags:
            doc['machinetags'] = machinetags
            doc['machinetags_hierarchy'] = machinetags_hierarchy

        # The export stores these flags as the strings 'yes' / 'no'.
        for key in ('shared', 'toread'):
            doc[key] = (doc[key] == 'yes')

        if doc['description'] == '':
            doc['description'] = doc['href']

        hostname = urlparse.urlparse(doc['href']).hostname

        if hostname:
            # Strip only the leading "www."; str.replace() would also remove
            # any later occurrence (e.g. "www.awww.com" became "acom").
            if hostname.startswith(www_prefix):
                hostname = hostname[len(www_prefix):]

            doc['hostname'] = hostname

        docs.append(doc)

        # Flush in fixed-size batches to bound memory and request size.
        if len(docs) == batch_size:
            solr.add(docs)
            docs = []

    # Send whatever is left over from the last partial batch.
    if docs:
        solr.add(docs)

    logging.debug("import complete, optimizing...")
    solr.optimize()