Ejemplo n.º 1
0
def parse_html(oid):
    """Import every ``<a>`` link of a stored HTML attachment as a dotmark.

    Looks up the attachment document(s) whose ``_id`` is ``oid``, reads the
    attached file through ``fs`` and inserts one document into ``dks`` for
    each anchor that carries an ``href``.  After every successful insert the
    owning user's ``dots`` counter is incremented and the field named by
    ``LAST_UPDATED`` is set to ``get_date()``.

    Args:
        oid: Attachment id in a form accepted by ``ObjectId``.
    """
    cursor = db.attachments.find({'_id': ObjectId(oid)})
    for attachment in cursor:
        username = attachment['user']
        html = fs.get(attachment['file']).read()
        for link in BeautifulSoup(html, parseOnlyThese=SoupStrainer('a')):
            if not link:
                continue

            # An anchor without an href cannot become a dotmark; skip it
            # instead of letting a KeyError abort the whole import.
            url = link.get('href')
            if url is None:
                continue
            print("parsing " + url)
            dk = {'url': url, 'username': username}

            # The tags attribute is optional and may hold one tag or a
            # comma-separated list; blank entries are discarded.
            raw_tags = link.get('tags') or ''
            tags = [tag.strip() for tag in raw_tags.split(',')]
            tags = [tag for tag in tags if tag]
            if tags:
                dk['tags'] = tags

            # An anchor may have no children at all.
            # NOTE(review): assumes the first child is text; a nested tag as
            # first child is not handled specially -- confirm.
            title = link.contents[0] if link.contents else None
            if title:
                print(title)
                # Link text that is itself a URL is not a useful title.
                if 'http' != title[:4]:
                    dk['title'] = title
            if 'title' not in dk:
                dk['title'] = get_title_from_url(url)

            new_id = dks.insert(dk)
            if new_id:
                users.update({'username': username}, {
                    "$inc": {
                        "dots": 1
                    },
                    "$set": {
                        LAST_UPDATED: get_date()
                    }
                },
                             upsert=False)
Ejemplo n.º 2
0
def parse_html(oid):
    """Turn the anchors of a stored HTML attachment into dotmark documents.

    For each attachment document matching ``oid`` the file is read through
    ``fs`` and every ``<a>`` element with an ``href`` is inserted into
    ``dks``.  Each successful insert increments the owner's ``dots`` counter
    and sets the field named by ``LAST_UPDATED`` to ``get_date()``.

    Args:
        oid: Attachment id in a form accepted by ``ObjectId``.
    """
    cursor = db.attachments.find({'_id': ObjectId(oid)})
    for attachment in cursor:
        username = attachment['user']
        html = fs.get(attachment['file']).read()
        for link in BeautifulSoup(html, parseOnlyThese=SoupStrainer('a')):
            if not link:
                continue

            # Missing href: nothing to store, and indexing would raise
            # KeyError and stop the import half-way.
            url = link.get('href')
            if url is None:
                continue
            print("parsing " + url)
            dk = {'url': url, 'username': username}

            # Tags are optional; accept a single tag as well as a
            # comma-separated list and drop empty entries.
            raw_tags = link.get('tags') or ''
            tags = [tag.strip() for tag in raw_tags.split(',')]
            tags = [tag for tag in tags if tag]
            if tags:
                dk['tags'] = tags

            # Guard against anchors with no children before taking [0].
            # NOTE(review): assumes the first child is text; a nested tag as
            # first child is not handled specially -- confirm.
            title = link.contents[0] if link.contents else None
            if title:
                print(title)
                # A link whose text is just a URL gets a fetched title.
                if 'http' != title[:4]:
                    dk['title'] = title
            if 'title' not in dk:
                dk['title'] = get_title_from_url(url)

            new_id = dks.insert(dk)
            if new_id:
                users.update({'username': username},
                             {"$inc": {"dots": 1},
                              "$set": {LAST_UPDATED: get_date()}},
                             upsert=False)
Ejemplo n.º 3
0
def parse_html(oid):
    """Create one dotmark per usable ``<a>`` element of an HTML attachment.

    The attachment document(s) with ``_id`` equal to ``oid`` are fetched,
    their file content is read through ``fs`` and each anchor that has an
    ``href`` is inserted into ``dks``.  A successful insert bumps the
    owner's ``dots`` counter and sets the field named by ``LAST_UPDATED``
    to ``get_date()``.

    Args:
        oid: Attachment id in a form accepted by ``ObjectId``.
    """
    cursor = db.attachments.find({"_id": ObjectId(oid)})
    for attachment in cursor:
        username = attachment["user"]
        html = fs.get(attachment["file"]).read()
        for link in BeautifulSoup(html, parseOnlyThese=SoupStrainer("a")):
            if not link:
                continue

            # Skip anchors lacking an href rather than failing with KeyError
            # and abandoning the remaining links.
            url = link.get("href")
            if url is None:
                continue
            print("parsing " + url)
            dk = {"url": url, "username": username}

            # The tags attribute may be absent, a single tag, or a
            # comma-separated list; keep only non-blank, trimmed entries.
            raw_tags = link.get("tags") or ""
            tags = [tag.strip() for tag in raw_tags.split(",")]
            tags = [tag for tag in tags if tag]
            if tags:
                dk["tags"] = tags

            # Empty anchors have no first child to use as a title.
            # NOTE(review): assumes the first child is text; a nested tag as
            # first child is not handled specially -- confirm.
            title = link.contents[0] if link.contents else None
            if title:
                print(title)
                # Text that is itself a URL is replaced by a fetched title.
                if "http" != title[:4]:
                    dk["title"] = title
            if "title" not in dk:
                dk["title"] = get_title_from_url(url)

            new_id = dks.insert(dk)
            if new_id:
                users.update(
                    {"username": username},
                    {"$inc": {"dots": 1}, "$set": {LAST_UPDATED: get_date()}},
                    upsert=False,
                )
Ejemplo n.º 4
0
def populate_dotmark(item):
    """Fill in a dotmark's missing title and automatic tags, then persist.

    Items lacking either ``'url'`` or ``'_id'`` are ignored.  Otherwise a
    missing or empty title is obtained with ``get_title_from_url`` (and also
    written back into ``item``), ``auto_tag(item)`` supplies ``atags``, and
    any resulting changes are passed to ``do_update`` under the item's
    ``_id``.

    Args:
        item: Mapping describing the dotmark; mutated in place when a title
            is filled in.
    """
    # The original test `'url' and '_id' in item` only checked '_id' because
    # the literal 'url' is always truthy; both keys are required here, and
    # the check runs before item['url'] is read.
    if 'url' not in item or '_id' not in item:
        return

    logger.info("processing %s", item['url'])
    updates = {}
    if not item.get('title'):
        updates['title'] = get_title_from_url(item['url'])
        # auto_tag below receives the item, so it must see the new title.
        item['title'] = updates['title']

    atags = auto_tag(item)
    if atags:
        updates['atags'] = atags

    if updates:
        do_update(item['_id'], updates)
Ejemplo n.º 5
0
def populate_dotmark(item):
    """Complete a dotmark with a title and automatic tags and store them.

    Nothing happens unless ``item`` has both ``'url'`` and ``'_id'``.  When
    the title is missing or empty it is looked up with
    ``get_title_from_url`` and copied into ``item``; ``auto_tag(item)``
    provides the ``atags`` value.  Collected changes, if any, are handed to
    ``do_update`` together with the item's ``_id``.

    Args:
        item: Mapping describing the dotmark; mutated in place when a title
            is filled in.
    """
    # `'url' and '_id' in item` evaluated to `'_id' in item` (a non-empty
    # string literal is always truthy), so a missing url slipped through and
    # raised KeyError.  Require both keys before touching item['url'].
    if 'url' not in item or '_id' not in item:
        return

    logger.info("processing %s", item['url'])
    updates = {}
    if not item.get('title'):
        updates['title'] = get_title_from_url(item['url'])
        # Written back so that auto_tag below sees the title as well.
        item['title'] = updates['title']

    atags = auto_tag(item)
    if atags:
        updates['atags'] = atags

    if updates:
        do_update(item['_id'], updates)