def getMeta(content, endPos, metas):
    """Collect the ``<meta ...>`` tags found in *content* from *endPos* onward.

    Each tag is taken as the text from ``<meta`` up to and including the next
    ``>`` and is handed to ``methods.union(metas, [tag])``.

    Args:
        content: Text to scan.
        endPos: Index in *content* at which scanning starts.
        metas: Accumulator passed to ``methods.union``; presumably a list
            that ``methods.union`` extends in place -- confirm against the
            ``methods`` module.

    Returns:
        The same *metas* object that was passed in.
    """
    searchFrom = endPos
    # Iterative scan: a recursive version needs one stack frame per tag.
    while True:
        start = content.find('<meta', searchFrom)
        if start == -1:
            break
        end = content.find('>', start + 1)
        if end == -1:
            # Unterminated tag. Continuing from ``end + 1`` (== 0) would find
            # this same tag again and never finish, so stop scanning here.
            break
        methods.union(metas, [content[start:end + 1]])
        searchFrom = end + 1
    return metas
def getImages(page, domain):
    """Collect image records from *page*.

    Repeatedly asks ``getNextImage`` for the next ``(url, altText, endpos)``
    triple and drops the consumed prefix of *page* after each hit. For every
    URL accepted by ``methods.validURL`` a ``[info, tags]`` pair is handed to
    ``methods.union(images, ...)``, where ``info`` is a dict with keys
    ``'url'``, ``'name'`` and ``'type'`` and ``tags`` is the whitespace-split
    alt text plus the image name.

    Scanning is best-effort: any ordinary exception ends the scan and the
    images gathered so far are returned.

    Args:
        page: Page text to scan.
        domain: Passed through to ``methods.formattedLinks`` together with
            the image URL.

    Returns:
        The list of collected ``[info, tags]`` pairs.
    """
    images = []
    try:
        while True:
            url, altText, endpos = getNextImage(page)
            if not url:
                break
            if methods.validURL(url):
                tags = altText.split()
                # NOTE(review): the result of validWord is discarded, so
                # ``tags`` keeps the raw alt-text words. This looks like it
                # was meant to normalise them; confirm what validWord returns
                # before storing its result.
                for w in tags:
                    methods.validWord(w.lower())
                fileName = url.rsplit('/', 1)[-1]
                imgName, dot, imgType = fileName.rpartition('.')
                if not dot:
                    # No extension: keep the whole file name as the name and
                    # leave the type empty instead of repeating the name.
                    imgName, imgType = fileName, ''
                methods.union(tags, [imgName])
                url = methods.formattedLinks([url], domain)[0]
                temp = {'url': url, 'name': imgName, 'type': imgType}
                methods.union(images, [[temp, tags]])
            # Continue after the position reported for this image.
            page = page[endpos + 1:]
    except Exception:
        # Best-effort scan: stop at the first failure but keep what was
        # collected. KeyboardInterrupt/SystemExit are deliberately not caught.
        pass
    return images
def addToIndex(index, keyword, urlobj):
    """Record *urlobj* under *keyword* in *index*.

    *index* maps keyword -> {url -> urlobj}. When the keyword/url pair is
    new, *urlobj* is stored as-is. When it already exists, the stored
    entry's ``'position'`` value is combined with ``urlobj['position']`` via
    ``methods.union`` and its ``'count'`` is reset to the length of the
    stored ``'position'`` value.

    Args:
        index: Mapping of keyword to per-URL entries; modified in place.
        keyword: Key under which the entry is filed.
        urlobj: Mapping with at least ``'url'``; ``'position'`` is read when
            the entry already exists.

    Returns:
        None.
    """
    postings = index.setdefault(keyword, {})
    pageUrl = urlobj['url']
    if pageUrl not in postings:
        postings[pageUrl] = urlobj
        return
    entry = postings[pageUrl]
    methods.union(entry['position'], urlobj['position'])
    entry['count'] = len(entry['position'])
def getLinks(page):
    """Collect link URLs from *page*.

    Repeatedly asks ``getNextTarget`` for the next ``(url, endpos)`` pair and
    drops the consumed prefix of *page* after each hit. Every URL accepted by
    ``methods.validURL`` is handed to ``methods.union(links, [url])``.

    Scanning is best-effort: any ordinary exception ends the scan and the
    links gathered so far are returned.

    Args:
        page: Page text to scan.

    Returns:
        The list of collected URLs.
    """
    links = []
    try:
        while True:
            url, endpos = getNextTarget(page)
            if not url:
                break
            if methods.validURL(url):
                methods.union(links, [url])
            # Continue after the position reported for this link.
            page = page[endpos + 1:]
    except Exception:
        # Best-effort scan: stop at the first failure but keep what was
        # collected. KeyboardInterrupt/SystemExit are deliberately not caught.
        pass
    return links
def addListToIndex(index, keyword, url):
    """File *url* under *keyword* in a keyword -> list index.

    A keyword seen for the first time gets a new one-element list. For a
    keyword that already exists, ``[url]`` is combined with the stored value
    through ``methods.union``.

    Args:
        index: Mapping of keyword to a list of URLs; modified in place.
        keyword: Key under which the URL is filed.
        url: Value to record.

    Returns:
        None.
    """
    if keyword not in index:
        index[keyword] = [url]
        return
    methods.union(index[keyword], [url])