Ejemplo n.º 1
0
def getMeta(content, endPos, metas):
    """Collect the ``<meta ...>`` tags found in *content* from *endPos* onward.

    Each tag's text, from ``<meta`` through its closing ``>``, is handed to
    ``methods.union(metas, [tag])``. The return value of that call is
    ignored, so *metas* is presumably updated in place by the helper
    (verify against ``methods.union``).

    Args:
        content: Markup text to scan.
        endPos: Index in *content* at which scanning starts.
        metas: Accumulator passed as the first argument to ``methods.union``.

    Returns:
        The same *metas* object that was passed in.
    """
    pos = endPos
    # Iterate instead of recursing once per tag, so a page with many meta
    # tags cannot exhaust the interpreter's recursion limit.
    while True:
        start = content.find('<meta', pos)
        if start == -1:
            break
        end = content.find('>', start + 1)
        if end == -1:
            # Unterminated tag: there is no closing '>' anywhere after it,
            # so nothing further can be extracted. Continuing from
            # ``end + 1`` (== 0) would rescan the same text forever.
            break
        methods.union(metas, [content[start:end + 1]])
        pos = end + 1
    return metas
Ejemplo n.º 2
0
def getImages(page, domain):
    """Extract image records from *page*.

    Repeatedly asks ``getNextImage`` for the next image in the remaining
    page text. For every URL accepted by ``methods.validURL`` a record is
    built and merged into the result through ``methods.union``.

    Args:
        page: Page text to scan; it is consumed by slicing past each match.
        domain: Passed to ``methods.formattedLinks`` together with the URL.

    Returns:
        The ``images`` list. Each entry handed to ``methods.union`` is a
        two-item list ``[info, tags]`` where ``info`` is a dict with the keys
        ``'url'``, ``'name'`` and ``'type'`` and ``tags`` is the list of
        alt-text words plus the image name.
    """
    images = []
    while True:
        try:
            url, altText, endpos = getNextImage(page)
            if not url:
                break

            if methods.validURL(url):
                tags = altText.split()
                for w in tags:
                    # NOTE(review): the result of validWord is discarded, so
                    # ``tags`` keeps the raw alt-text words. The call is kept
                    # as-is because validWord's return contract is not
                    # visible here -- confirm whether tags should be
                    # replaced by its output.
                    methods.validWord(w.lower())

                fileName = url.rsplit('/', 1)[-1]
                imgName, dot, imgType = fileName.rpartition('.')
                if not dot:
                    # No extension: keep the whole file name as the name and
                    # report an empty type instead of repeating the name.
                    imgName, imgType = fileName, ''

                methods.union(tags, [imgName])

                url = methods.formattedLinks([url], domain)[0]

                info = {'url': url, 'name': imgName, 'type': imgType}
                methods.union(images, [[info, tags]])

            page = page[endpos + 1:]
        except Exception:
            # Best-effort extraction: any parsing failure ends the scan and
            # returns what was collected so far. ``Exception`` (rather than a
            # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
            break
    return images
Ejemplo n.º 3
0
def addToIndex(index, keyword, urlobj):
    """Record *urlobj* under *keyword* in *index*.

    *index* maps keyword -> {url -> urlobj}. When the URL is not yet listed
    for the keyword, *urlobj* itself is stored. When it is already listed,
    the new positions are merged into the stored entry with
    ``methods.union`` and the stored ``'count'`` is reset to the length of
    the stored ``'position'`` value.

    Args:
        index: Mapping that is modified in place.
        keyword: Key under which the URL entry is filed.
        urlobj: Mapping with at least the keys ``'url'`` and ``'position'``.

    Returns:
        None.
    """
    if keyword not in index:
        index[keyword] = {}
    postings = index[keyword]

    link = urlobj['url']
    if link not in postings:
        # First time this URL is seen for the keyword: store the object.
        postings[link] = urlobj
        return

    known = postings[link]
    methods.union(known['position'], urlobj['position'])
    known['count'] = len(known['position'])
Ejemplo n.º 4
0
def getLinks(page):
    """Collect the link URLs found in *page*.

    Repeatedly asks ``getNextTarget`` for the next URL in the remaining page
    text. URLs accepted by ``methods.validURL`` are merged into the result
    through ``methods.union``.

    Args:
        page: Page text to scan; it is consumed by slicing past each match.

    Returns:
        The ``links`` list that was passed to ``methods.union``.
    """
    links = []
    while True:
        try:
            url, endpos = getNextTarget(page)
            if not url:
                break

            if methods.validURL(url):
                methods.union(links, [url])

            page = page[endpos + 1:]
        except Exception:
            # Best-effort extraction: any parsing failure ends the scan and
            # returns what was collected so far. ``Exception`` (rather than a
            # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
            break
    return links
Ejemplo n.º 5
0
def addListToIndex(index, keyword, url):
    """File *url* under *keyword* in *index*.

    A keyword that is not yet present gets a fresh one-element list. For a
    keyword that is already present, the URL is merged into its existing
    entry with ``methods.union``.

    Args:
        index: Mapping of keyword -> list of URLs; modified in place.
        keyword: Key under which the URL is filed.
        url: Value to record.

    Returns:
        None.
    """
    if keyword not in index:
        index[keyword] = [url]
        return

    methods.union(index[keyword], [url])