예제 #1
0
def getWikiOutlinks(sourcewiki, html, outlinksDict):
    """Tally the Wikipedia links found in an HTML document.

    Every ``<a>`` tag that has an ``href`` containing the substring
    ``'wikipedia'`` is URL-unquoted, and the count stored for that link in
    ``outlinksDict`` is incremented by one.

    Args:
        sourcewiki: Not read by this function; kept so existing callers
            keep working.
        html: HTML markup to scan. ``None`` or an empty value means there
            is nothing to parse.
        outlinksDict: Mapping of link -> occurrence count. It is updated in
            place and also returned.

    Returns:
        The same ``outlinksDict`` object that was passed in. If parsing
        fails part-way, the counts added before the failure are kept.
    """
    # Truthiness instead of len(): also covers None, which len() rejects.
    if not html:
        return outlinksDict

    try:
        soup = BeautifulSoup(html, 'html.parser')

        for tag in soup.find_all('a'):
            if not tag.has_attr('href'):
                continue

            href = tag['href']
            if 'wikipedia' not in href:
                continue

            link = unquote(href)
            outlinksDict[link] = outlinksDict.get(link, 0) + 1
    except Exception:
        # Best effort: report the failure and return what was collected.
        # `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        genericErrorInfo()

    return outlinksDict
예제 #2
0
def searchKwordWindowsOpt(text, k, left, right, skipBothFlag=False):
    """Count the k-token windows of *text* and those holding both terms.

    The tokens produced by ``getTokenizer(text)`` are split into
    consecutive, non-overlapping windows of ``k`` tokens (the last window
    may be shorter).

    Args:
        text: Input handed to ``getTokenizer``.
        k: Window size in tokens. ``k == 0`` raises ``ZeroDivisionError``
            as soon as there is at least one token.
        left: First term to look for inside each window.
        right: Second term to look for inside each window.
        skipBothFlag: When true, the co-occurrence scan is skipped and
            ``'both'`` stays 0.

    Returns:
        A dict with two keys: ``'left'`` holds the total number of windows,
        and ``'both'`` holds the number of windows containing both *left*
        and *right*.
    """
    tokens = []
    try:
        tokens = getTokenizer(text)
    except Exception:
        # Best effort: a tokenizer failure is reported and treated as
        # "no tokens". KeyboardInterrupt/SystemExit are not swallowed.
        genericErrorInfo()

    windows = []
    for position, token in enumerate(tokens):
        # Open a fresh window at every k-th token.
        if position % k == 0:
            windows.append([])
        windows[-1].append(token)

    counts = {'left': len(windows), 'both': 0}

    if not skipBothFlag:
        counts['both'] = sum(
            1 for window in windows if left in window and right in window
        )

    return counts
예제 #3
0
def getHTMLPaths():
    """Read the list of HTML file paths from 'wiki-small-html-files.txt'.

    The file is opened relative to the current working directory.

    Returns:
        A list with one entry per line of the file, each keeping its
        trailing newline (as returned by ``readlines()``). An empty list
        is returned when the file cannot be read; the failure is reported
        through ``genericErrorInfo()``.
    """
    pathnames = []

    try:
        # The context manager closes the file even if readlines() raises.
        with open('wiki-small-html-files.txt', 'r') as infile:
            pathnames = infile.readlines()
    except Exception:
        # Best effort: report and fall back to the empty list.
        genericErrorInfo()

    return pathnames
def getKwordWindowsOpt(text, k):
    """Split the tokens of *text* into consecutive windows of k tokens.

    Args:
        text: Input handed to ``getTokenizer``.
        k: Window size in tokens. ``k == 0`` raises ``ZeroDivisionError``
            as soon as there is at least one token.

    Returns:
        A list of lists of tokens. Windows do not overlap and keep the
        token order; the last window may hold fewer than ``k`` tokens.
        An empty list is returned when tokenizing fails or yields nothing.
    """
    tokens = []
    try:
        tokens = getTokenizer(text)
    except Exception:
        # Best effort: a tokenizer failure is reported and treated as
        # "no tokens". KeyboardInterrupt/SystemExit are not swallowed.
        genericErrorInfo()

    windows = []
    for position, token in enumerate(tokens):
        # Open a fresh window at every k-th token.
        if position % k == 0:
            windows.append([])
        windows[-1].append(token)

    return windows