def parse_html(oid):
    """Import the links found in an uploaded HTML attachment.

    Finds the attachment document(s) whose ``_id`` equals ``oid``, reads the
    referenced file from ``fs`` and inserts one document into ``dks`` for
    every ``<a>`` element that carries an ``href``.  After each successful
    insert the owning user's ``dots`` counter is incremented and the
    ``LAST_UPDATED`` field is refreshed.

    Anchors without an ``href`` are skipped.  A missing ``tags`` attribute or
    an anchor with no content no longer raises and aborts the whole import.

    :param oid: identifier of the attachment; it is passed to ``ObjectId``.
    """
    cursor = db.attachments.find({'_id': ObjectId(oid)})
    for attachment in cursor:
        html = fs.get(attachment['file']).read()
        for link in BeautifulSoup(html, parseOnlyThese=SoupStrainer('a')):
            if not link:
                continue

            # Tag.get returns None for a missing attribute, whereas
            # link['href'] raises KeyError (e.g. for <a name="...">).
            url = link.get('href')
            if url is None:
                continue

            dk = {}
            dk['url'] = url
            print("parsing " + url)

            # NOTE(review): tags are only stored when the attribute contains
            # a comma, so a single tag is dropped -- kept as in the original;
            # confirm whether that is intended.
            raw_tags = link.get('tags')
            if raw_tags and ',' in raw_tags:
                dk['tags'] = raw_tags.strip().split(',')

            dk['username'] = attachment['user']

            # An empty anchor has no contents; indexing [0] would raise.
            title = link.contents[0] if link.contents else None
            if title:
                print(title)
                # A link text that is itself a URL is not a useful title.
                if 'http' != title[:4]:
                    dk['title'] = title
            if 'title' not in dk:
                dk['title'] = get_title_from_url(dk['url'])

            new_id = dks.insert(dk)
            if new_id:
                users.update(
                    {'username': attachment['user']},
                    {"$inc": {"dots": 1},
                     "$set": {LAST_UPDATED: get_date()}},
                    upsert=False)
def parse_html(oid):
    """Turn the anchors of a stored HTML attachment into ``dks`` documents.

    The attachment document(s) matching ``oid`` are fetched from
    ``db.attachments``; the file each one points to is read from ``fs`` and
    parsed for ``<a>`` elements only.  Every anchor with an ``href`` becomes
    one inserted document holding ``url``, ``username``, ``title`` and,
    when present, ``tags``.  Each successful insert bumps the user's ``dots``
    counter and sets ``LAST_UPDATED``.

    Anchors that lack ``href`` are ignored, and anchors that lack ``tags`` or
    have no content are handled instead of raising ``KeyError`` /
    ``IndexError`` and stopping the import half-way.

    :param oid: attachment identifier, converted with ``ObjectId``.
    """
    cursor = db.attachments.find({'_id': ObjectId(oid)})
    for attachment in cursor:
        html = fs.get(attachment['file']).read()
        for link in BeautifulSoup(html, parseOnlyThese=SoupStrainer('a')):
            if not link:
                continue

            # Use .get so that an anchor without the attribute yields None
            # rather than raising KeyError.
            url = link.get('href')
            if url is None:
                continue

            print("parsing " + url)
            dk = {'url': url}

            # NOTE(review): as in the original, a tags value without a comma
            # (i.e. a single tag) is not stored -- confirm this is wanted.
            raw_tags = link.get('tags')
            if raw_tags and ',' in raw_tags:
                dk['tags'] = raw_tags.strip().split(',')

            dk['username'] = attachment['user']

            # Guard against <a href="..."></a>, whose contents list is empty.
            title = link.contents[0] if link.contents else None
            if title:
                print(title)
                # Skip link texts that merely repeat a URL.
                if 'http' != title[:4]:
                    dk['title'] = title
            if 'title' not in dk:
                dk['title'] = get_title_from_url(dk['url'])

            new_id = dks.insert(dk)
            if new_id:
                users.update(
                    {'username': attachment['user']},
                    {"$inc": {"dots": 1}, "$set": {LAST_UPDATED: get_date()}},
                    upsert=False)
def parse_html(oid):
    """Create one ``dks`` document per link in an uploaded HTML attachment.

    Reads the file referenced by the attachment document(s) with ``_id``
    equal to ``oid``, parses only its ``<a>`` elements and inserts a document
    for each anchor that has an ``href``.  When an insert succeeds, the
    user's ``dots`` counter is incremented and ``LAST_UPDATED`` is set.

    Anchors with no ``href`` are skipped; a missing ``tags`` attribute or an
    anchor without content is tolerated rather than raising and aborting the
    remaining links.

    :param oid: attachment identifier, wrapped in ``ObjectId`` for the query.
    """
    cursor = db.attachments.find({"_id": ObjectId(oid)})
    for attachment in cursor:
        html = fs.get(attachment["file"]).read()
        for link in BeautifulSoup(html, parseOnlyThese=SoupStrainer("a")):
            if not link:
                continue

            # link["href"] raises KeyError when the attribute is absent;
            # .get returns None instead so the anchor can be skipped.
            url = link.get("href")
            if url is None:
                continue

            dk = {}
            dk["url"] = url
            print("parsing " + url)

            # NOTE(review): only comma-separated values are stored, so a
            # lone tag is dropped -- behaviour kept from the original;
            # confirm whether that is intended.
            raw_tags = link.get("tags")
            if raw_tags and "," in raw_tags:
                dk["tags"] = raw_tags.strip().split(",")

            dk["username"] = attachment["user"]

            # contents is empty for an anchor with no children.
            title = link.contents[0] if link.contents else None
            if title:
                print(title)
                # Do not use a link text that is just a URL as the title.
                if "http" != title[:4]:
                    dk["title"] = title
            if "title" not in dk:
                dk["title"] = get_title_from_url(dk["url"])

            new_id = dks.insert(dk)
            if new_id:
                users.update(
                    {"username": attachment["user"]},
                    {"$inc": {"dots": 1}, "$set": {LAST_UPDATED: get_date()}},
                    upsert=False,
                )
def populate_dotmark(item):
    """Fill in a missing title and automatic tags for one dotmark.

    Items lacking either ``'url'`` or ``'_id'`` are ignored.  Otherwise:

    * if the item has no (or an empty) ``'title'``, one is obtained with
      ``get_title_from_url`` and also written back into ``item``;
    * ``auto_tag(item)`` is called and a truthy result is stored as
      ``'atags'``;
    * if anything was collected, ``do_update(item['_id'], updates)`` is
      called once with all changes.

    :param item: dict-like dotmark document; mutated in place when a title
        is filled in.
    """
    # Both keys are required.  The previous test, `'url' and '_id' in item`,
    # only checked '_id' because the literal 'url' is always truthy, and the
    # log call below used to run before any check at all.
    if 'url' not in item or '_id' not in item:
        return

    logger.info("processing %s", item['url'])
    updates = {}

    if not item.get('title'):
        updates['title'] = get_title_from_url(item['url'])
        # Written back before auto_tag runs -- presumably so the tagger can
        # see the new title; confirm against auto_tag.
        item['title'] = updates['title']

    atags = auto_tag(item)
    if atags:
        updates['atags'] = atags

    if updates:
        do_update(item['_id'], updates)