def test(sites, keywords):
    '''Build keyword descriptions for the test pages under nowe_filmy/test.

    sites    -- sequence of page file names (e.g. 'film.htm') located in
                the nowe_filmy/test directory.
    keywords -- list of initial keywords handed to fillDescription.

    Every page is read with readSource, its markup is replaced by spaces
    and whitespace runs are collapsed. fillDescription is then called
    repeatedly on the page's description dict; whatever it returns is added
    to the known keywords and the call is repeated until it returns
    nothing. Keywords discovered on one page are kept for the next pages.

    Returns a dict mapping the page name without its extension to the
    description dict filled for that page.
    '''
    sitesPaths = [os.path.join('nowe_filmy', 'test', site) for site in sites]
    # All sources are read up front, before any page is processed.
    siteTexts = [readSource(path) for path in sitesPaths]
    print('*' * 80)
    print('TEST')

    # Compiled once: the same two patterns are applied to every page.
    tagPattern = re.compile(r'<[^<>]*>')
    spacePattern = re.compile(r'\s+')

    filmDescriptions = {}
    newKeywords = []
    for site, text in zip(sites, siteTexts):
        plainText = spacePattern.sub(' ', tagPattern.sub(' ', text))
        print('search site: %s' % site)
        # Strip the real extension instead of blindly cutting four
        # characters, so '.html' (or extension-less) names stay intact.
        filmName = os.path.splitext(site)[0]
        description = filmDescriptions[filmName] = {}
        foundKeywords = fillDescription(description, plainText,
                                        keywords + newKeywords)
        while foundKeywords:
            newKeywords += foundKeywords
            foundKeywords = fillDescription(description, plainText,
                                            keywords + newKeywords)
    return filmDescriptions
def retrieveFrom(name, fullDescr):
    '''Return the path of the described object inside the source `name`.

    The source is read and parsed, the best common path of the visible
    keywords of fullDescr is looked up with keywordsPath, and then as many
    trailing elements as getBestPathShift reports are removed from it.
    The list returned by keywordsPath is shortened in place and returned.
    '''
    soup = bs(readSource(name))
    usedKeywordPaths, bestPath = keywordsPath(soup, getKeywords(fullDescr))
    remaining = getBestPathShift(fullDescr, bestPath, usedKeywordPaths)
    # Drop one trailing path element per unit of shift (none if shift <= 0).
    while remaining > 0:
        bestPath.pop()
        remaining -= 1
    return bestPath
def findObject(name, descr):
    '''Locate the object described by descr in the source `name`.

    name is whatever readSource accepts (an url or a file name). On success
    the result is a tuple (subPaths, subtree): subPaths holds the element
    paths with the leading part shared through bestPath cut off, subtree is
    the html subtree found at bestPath. When the lookup raises IndexError
    the object is considered missing and None is returned.
    '''
    soup = bs(readSource(name))
    try:
        elementsFullPaths, bestPath = findPath(soup, descr)
        # Keep the last element of bestPath as the first element of
        # every relative path.
        offset = len(bestPath) - 1
        subPaths = []
        for fullPath in elementsFullPaths:
            subPaths.append(fullPath[offset:])
    except IndexError:
        return None
    subtree = soupFromPath(soup, bestPath)
    return subPaths, subtree
# findPredecessor(sources, names): tally the words that appear directly in
# front of the given names in the given sources and return the cleaned-up
# best candidate.
#
# What the code below does, in order of appearance:
#   * every source is read with readSource(); markup matching '<[^<>]*>' is
#     replaced by a space and whitespace runs are collapsed to one space;
#   * for every name, findIndexes(plainText, name) supplies positions and
#     the text in front of each position is given to findThreeLastWords();
#   * when the first returned word ends with a comma, the text is passed
#     through omitPrevious(partialText, 4) and the words are recomputed;
#   * counters in the predecessors dict (sub-dicts 'unigram', 'bigram',
#     'trigram') are incremented for words already present, and
#     fillPredecessors(predecessors, words) is called in the else branch;
#   * chooseBestName(predecessors) picks the result, showPredecessors()
#     is called, and cleanName(bestName) is returned.
#
# NOTE(review): the indentation of this function has been lost (the whole
# body sits on one line), so it cannot be determined from here whether the
# trailing "else:" belongs to "if word in predecessors[key]" or to one of
# the enclosing "for" loops. The code is deliberately left untouched;
# restore the original layout before changing the logic.
# NOTE(review): Python 2 syntax (print statement, dict.iterkeys()).
def findPredecessor(sources, names): siteTexts = [readSource(source) for source in sources] predecessors = { 'unigram': {}, 'bigram': {}, 'trigram': {} } for (i, text) in enumerate(siteTexts): nohtmlText = re.sub('<[^<>]*>', ' ', text) plainText = re.sub('\s+', ' ', nohtmlText) print 'search site:', sources[i] for name in names: print 'search name:', name nameIndexes = findIndexes(plainText, name) for index in nameIndexes: partialText = plainText[:index] words = findThreeLastWords(predecessors, partialText) if words[0][-1] == ',': textWithOmitted = omitPrevious(partialText, 4) words = findThreeLastWords(predecessors, textWithOmitted) for word in words: for key in predecessors.iterkeys(): if word in predecessors[key]: predecessors[key][word] += 1 else: fillPredecessors(predecessors, words) bestName = chooseBestName(predecessors) showPredecessors(predecessors) return cleanName(bestName)
def elementInSoup(subSoup, element, usedPaths):
    '''Look up one description element inside subSoup.

    element is a dict read through the keys 'keyVisible', 'key' and 'path'.
    Returns a 3-tuple (result, found, paths):
      * ({'key': ..., 'value': ...}, True, usedPaths + matched path) when a
        text node matching element['key'] exists and its path matches
        element['path'];
      * (None, False, usedPaths) when no such text node exists or the
        paths do not match.
    usedPaths itself is never modified.
    '''
    # Debug trace of the element being looked up.
    print('XXXXXXXXXXXXXXXXXXXXXX')
    print(element)

    if not element['keyVisible']:
        # NOTE(review): hard-coded result for elements without a visible
        # key -- looks like a placeholder, confirm before relying on it.
        return ({'key': u'Reżyseria', 'value': 'Al Bundy'}, True, usedPaths)

    subPath = getPath(subSoup)
    # NOTE(review): the key is compiled as a regular expression, so regex
    # metacharacters inside a key are interpreted, not matched literally.
    findSoup = subSoup.find(text=re.compile(element['key']))
    if findSoup is None:
        return None, False, usedPaths

    # Named foundPath so the module-level findPath function is not shadowed.
    foundPath = getPath(findSoup)[:len(subPath)]
    if matchPath(element['path'], foundPath):
        found = {'key': element['key'], 'value': findSoup.text}
        return found, True, usedPaths + foundPath

    # Runtime message (Polish): "the paths do not match".
    print('Sciezki sie nie zgadzaja')
    return None, False, usedPaths


def visitRestTree(fullDescr, subSoup, usedPaths):
    '''Return a fixed element together with subSoup and usedPaths unchanged.

    NOTE(review): fullDescr is ignored and the element is hard-coded --
    looks like a placeholder implementation, confirm.
    '''
    return ({'key': u'Reżyseria', 'value': 'Al Bundy'}, subSoup, usedPaths)


def retrieveFrom(name, fullDescr):
    '''Return the path of the described object inside the source `name`.

    The source is read and parsed, the best common path of the visible
    keywords of fullDescr is looked up with keywordsPath, and then as many
    trailing elements as getBestPathShift reports are removed from it.
    The list returned by keywordsPath is shortened in place and returned.
    '''
    content = readSource(name)
    soup = bs(content)
    keywords = getKeywords(fullDescr)
    usedKeywordPaths, bestPath = keywordsPath(soup, keywords)
    shift = getBestPathShift(fullDescr, bestPath, usedKeywordPaths)
    # A shift of 0 or -1 leaves the path untouched.
    for _ in range(shift):
        bestPath.pop()
    return bestPath


def getKeywords(fullDescr):
    '''Return the 'key' of every element of fullDescr whose 'keyVisible' is true.'''
    return [el['key'] for el in fullDescr if el['keyVisible']]


def keywordsPath(soup, keywords):
    '''Find the paths of the given keywords in soup.

    Each keyword is wrapped in a synthetic description element (key
    '<index>_____', value = keyword) and handed to findPath.

    Returns (usedKeywordsPaths, bestPath): a dict mapping every keyword
    whose path list is not empty to that list, and the best path reported
    by findPath.
    '''
    descrEl = [{'key': '%d_____' % i, 'value': keyword}
               for i, keyword in enumerate(keywords)]
    elementPaths, bestPath = findPath(soup, descrEl)
    usedKeywordsPaths = {}
    for i, keyword in enumerate(keywords):
        if elementPaths[i] != []:
            usedKeywordsPaths[keyword] = elementPaths[i]
    return (usedKeywordsPaths, bestPath)


def getBestPathShift(fullDescr, keywordsBestPath, keywordsPaths):
    '''Compute how many trailing elements to drop from the keywords' best path.

    The 'path' entries of all description elements whose key occurs in
    keywordsPaths are folded with matchPath into their common prefix,
    trailing wildcard-index elements are removed with cutAllMatch, and the
    length of what remains minus one is returned.

    When no description key occurs in keywordsPaths the common prefix is
    taken to be empty and -1 is returned (the same value an all-wildcard
    match yields) instead of failing on an empty sequence.

    keywordsBestPath is accepted for interface compatibility and not used.
    '''
    descrPaths = [descr['path'] for descr in fullDescr
                  if descr['key'] in keywordsPaths]
    if descrPaths:
        matchingPath = descrPaths[0]
        for path in descrPaths[1:]:
            matchingPath = matchPath(matchingPath, path)
    else:
        matchingPath = []
    print('mp: %s' % (matchingPath,))
    cutPath = cutAllMatch(matchingPath)
    shift = len(cutPath) - 1
    print('Matching path shift = %s' % shift)
    return shift


def matchPath(path1, path2, strict=False):
    '''Return the common prefix of two paths.

    A path is a sequence of (name, index) pairs. Pairs are compared
    position by position with check(); comparison stops at the first
    position where either the names or the indexes disagree. Unless strict
    is set, '_' acts as a wildcard and is kept in the result.
    '''
    matchingPath = []
    for p1, p2 in zip(path1, path2):
        nameOk, name = check(p1[0], p2[0], strict)
        indexOk, index = check(p1[1], p2[1], strict)
        if not (nameOk and indexOk):
            break
        matchingPath.append((name, index))
    return matchingPath


def cutAllMatch(path):
    '''Return a copy of path without its trailing elements whose index is '_'.'''
    end = len(path)
    while end > 0 and path[end - 1][1] == '_':
        end -= 1
    return path[:end]


def check(x1, x2, strict=False):
    '''Compare two path components.

    Returns (matches, value). In non-strict mode a '_' on either side
    matches anything and the value is '_'; otherwise the components must be
    equal and the value is x1.
    '''
    if not strict and (x1 == '_' or x2 == '_'):
        return (True, '_')
    return (x1 == x2, x1)


if __name__ == '__main__':
    # Demo run: learn a merged description from two sample pages and use it
    # to extract data from a third one.
    # NOTE(review): findObject returns None when the object is not found;
    # the tuple unpacking below would then raise TypeError.
    # NOTE(review): statement nesting was reconstructed from a
    # whitespace-collapsed source -- verify the print loop layout.
    wstydPaths, wstydSubSoup = findObject('filmy\\wstyd.htm', wstydDes)
    wstydFullDescr = prepareDescription(wstydSubSoup, wstydDes, wstydPaths)
    #cubaPaths, cubaSubSoup = findObject('filmy\\cuba_isla_of_music.htm', cubaDes)
    #cubaFullDescr = prepareDescription(cubaSubSoup, cubaDes, cubaPaths)
    ostatniaPaths, ostatniaSubSoup = findObject('filmy\\ostatnia_milosc_na_ziemi.htm', ostatniaDes)
    ostatniaFullDescr = prepareDescription(ostatniaSubSoup, ostatniaDes, ostatniaPaths)

    descriptions = [wstydFullDescr, ostatniaFullDescr]
    mergedDescription = merge(descriptions)
    for d in mergedDescription:
        print('------')
        for k, v in d.items():
            print('%s : %s' % (k, v))
        print('------')
    print('--------------')

    retrievedPath = retrieveFrom('filmy\\zapiski.htm', mergedDescription)
    print(retrievedPath)
    cnt = readSource('filmy\\zapiski.htm')
    soup = bs(cnt)
    subSoup = soupFromPath(soup, retrievedPath)
    print('get data')
    print(getData(subSoup, mergedDescription))
def getSoup(name):
    '''Read the source identified by name and return it parsed by bs.'''
    return bs(readSource(name))