from bs4 import BeautifulSoup
from urllib.parse import unquote

def getWikiOutlinks(sourcewiki, html, outlinksDict):
	#extract Wikipedia outlinks from html and tally them in outlinksDict
	if( len(html) == 0 ):
		return outlinksDict

	try:
		soup = BeautifulSoup(html, 'html.parser')
		anchorTags = soup.find_all('a')

		for tag in anchorTags:

			if( tag.has_attr('href') == False ):
				continue

			#keep only links that point to Wikipedia
			if( tag['href'].find('wikipedia') == -1 ):
				continue

			link = tag['href']
			link = unquote(link)

			outlinksDict.setdefault(link, 0)
			outlinksDict[link] += 1
	except:
		genericErrorInfo()

	return outlinksDict
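#A minimal usage sketch (not part of the original module): read one saved
#HTML page and tally its Wikipedia outlinks. The filename 'sample.html' and
#the source label 'sample-source' are hypothetical placeholders; the same
#dict can be passed across multiple calls to accumulate counts over a collection.
def _exampleWikiOutlinkCount():

	outlinksDict = {}
	with open('sample.html', 'r') as infile:
		html = infile.read()

	outlinksDict = getWikiOutlinks('sample-source', html, outlinksDict)
	#outlinksDict now maps each decoded Wikipedia URI to its frequency
	return outlinksDict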
def searchKwordWindowsOpt(text, k, left, right, skipBothFlag=False):
	#split text into consecutive k-word windows, then count how many
	#windows contain both the left and right terms
	tokens = []
	try:
		tokens = getTokenizer(text)
	except:
		genericErrorInfo()

	counts = {'left': 0, 'both': 0}
	kWordWindows = []

	for i in range( len(tokens) ):

		#start a new window every k tokens
		if( i % k == 0 ):
			kWordWindows.append([])

		kWordWindows[-1].append( tokens[i] )

	#'left' records the total number of k-word windows produced
	counts['left'] = len(kWordWindows)

	if( skipBothFlag == False ):
		for win in kWordWindows:
			if( left in win and right in win ):
				counts['both'] += 1

	return counts
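#A minimal sketch of how searchKwordWindowsOpt might be called (assumed
#usage, not taken from the original code); it presumes getTokenizer()
#returns a flat list of word tokens. With k = 5 the text is split into
#5-word windows, 'left' reports the window count, and 'both' reports how
#many windows contain both query terms.
def _exampleWindowSearch():

	text = 'the quick brown fox jumps over the lazy dog near the river bank'
	counts = searchKwordWindowsOpt(text, k=5, left='fox', right='dog')
	#counts == {'left': <total windows>, 'both': <windows containing both terms>}
	return counts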
def getHTMLPaths():
	#read the list of local HTML file paths, one path per line
	#(lines are returned with their trailing newline characters)
	pathnames = []
	try:
		infile = open('wiki-small-html-files.txt', 'r')
		pathnames = infile.readlines()
		infile.close()
	except:
		genericErrorInfo()

	return pathnames
def getKwordWindowsOpt(text, k):
	#tokenize text and split the tokens into consecutive k-word windows
	tokens = []
	try:
		tokens = getTokenizer(text)
	except:
		genericErrorInfo()

	kWordWindows = []
	for i in range( len(tokens) ):

		#start a new window every k tokens
		if( i % k == 0 ):
			kWordWindows.append([])

		kWordWindows[-1].append( tokens[i] )

	return kWordWindows
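#A minimal sketch of getKwordWindowsOpt usage (assumed, not from the
#original code); this presumes getTokenizer() returns a flat list of word
#tokens. With k = 3, a 7-word sentence yields windows of sizes 3, 3, and 1.
def _exampleKwordWindows():

	windows = getKwordWindowsOpt('one two three four five six seven', k=3)
	#expected shape: [['one','two','three'], ['four','five','six'], ['seven']]
	return windows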