# Feature plugin: flags documents that contain at least one <strong> tag.
from bs4 import BeautifulSoup
from __main__ import funcs, attrNames

# Column name for the single value appended by addStrongAttr.
attrNames.append('hasStrong')


def addStrongAttr(info, fileObject, index):
    """Append whether the document contains a <strong> tag to *info*.

    Args:
        info: List of attribute values for one document; mutated in place
            (exactly one bool is appended).
        fileObject: Markup source passed straight to BeautifulSoup
            (presumably an open HTML file -- confirm against the caller).
        index: Not used by this plugin; present so the signature matches
            the other functions stored in ``funcs``.

    Returns:
        None.
    """
    # NOTE(review): no parser is named, so bs4 picks among the parsers that
    # happen to be installed; the parse tree may differ between machines.
    soup = BeautifulSoup(fileObject)
    # find() returns None when nothing matches and stops at the first hit,
    # so there is no need to count every tag just to test for existence.
    info.append(soup.find('strong') is not None)


# Register the extractor with the driver script's plugin list.
funcs.append(addStrongAttr)
# Feature plugin: fifty randomly corrupted copies of the "important" label,
# followed by the label itself.
from __main__ import funcs, attrNames
import random
import sys
import os

# Make the sibling ``data`` directory importable so getRawData can be found.
# abspath() is applied first because dirname() of a directory-less __file__
# is '', and plain string concatenation then produced the root-level path
# '/../data/' instead of a path relative to this file.
sys.path.append(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'data'))
from getRawData import importantFileNum

# One column per noise level, then the label column. The order here must
# match the order in which addNoisyAndImportanceAttr appends its values.
for i in range(50, 100):
    attrNames.append('noisy' + str(i))
attrNames.append('important')


def addNoisyAndImportanceAttr(info, fileObj, index):
    """Append 50 noisy label copies and then the true label to *info*.

    The label is ``index < importantFileNum``. For each ``i`` in 50..99 the
    appended value equals the label when ``random.randint(1, 100) < i`` and
    is its negation otherwise, so higher ``i`` gives a less noisy column.

    Args:
        info: List of attribute values for one document; mutated in place
            (51 bools are appended).
        fileObj: Not used by this plugin; present so the signature matches
            the other functions stored in ``funcs``.
        index: Position of the document; compared with ``importantFileNum``
            to derive the label.

    Returns:
        None.
    """
    important = bool(index < importantFileNum)
    for i in range(50, 100):
        # The comparison holds for i - 1 of the 100 equally likely draws.
        agrees = random.randint(1, 100) < i
        info.append(important if agrees else not important)
    info.append(important)


# Register the extractor with the driver script's plugin list.
funcs.append(addNoisyAndImportanceAttr)
# Feature plugin: flags documents that contain at least one <i> tag.
from bs4 import BeautifulSoup
from __main__ import funcs, attrNames

# Column name for the single value appended by addItalicAttr.
attrNames.append('hasItalic')


def addItalicAttr(info, fileObject, index):
    """Append whether the document contains an <i> tag to *info*.

    Args:
        info: List of attribute values for one document; mutated in place
            (exactly one bool is appended).
        fileObject: Markup source passed straight to BeautifulSoup
            (presumably an open HTML file -- confirm against the caller).
        index: Not used by this plugin; present so the signature matches
            the other functions stored in ``funcs``.

    Returns:
        None.
    """
    # NOTE(review): no parser is named, so bs4 picks among the parsers that
    # happen to be installed; the parse tree may differ between machines.
    soup = BeautifulSoup(fileObject)
    # find() returns None when nothing matches and stops at the first hit,
    # which replaces the earlier count-everything loop.
    firstItalic = soup.find('i')
    info.append(firstItalic is not None)


# Register the extractor with the driver script's plugin list.
funcs.append(addItalicAttr)
# Feature plugin: flags documents that contain at least one <b> tag.
from bs4 import BeautifulSoup
from __main__ import funcs, attrNames

# Column name for the single value appended by addBoldAttr.
attrNames.append('hasBold')


def addBoldAttr(info, fileObject, index):
    """Append whether the document contains a <b> tag to *info*.

    Args:
        info: List of attribute values for one document; mutated in place
            (exactly one bool is appended).
        fileObject: Markup source passed straight to BeautifulSoup
            (presumably an open HTML file -- confirm against the caller).
        index: Not used by this plugin; present so the signature matches
            the other functions stored in ``funcs``.

    Returns:
        None.
    """
    # NOTE(review): no parser is named, so bs4 picks among the parsers that
    # happen to be installed; the parse tree may differ between machines.
    soup = BeautifulSoup(fileObject)
    # Existence check only: find() yields None when there is no <b> tag,
    # and avoids walking every match just to compare a counter with zero.
    info.append(soup.find('b') is not None)


# Register the extractor with the driver script's plugin list.
funcs.append(addBoldAttr)
# Feature plugin: four threshold flags on the number of <a> tags.
from bs4 import BeautifulSoup
from __main__ import funcs, attrNames

# Four columns; addLinkNumAttr must append four values in this same order.
attrNames.append('hasLinks')
attrNames.append('moreThan10Links')
attrNames.append('moreThan100Links')
attrNames.append('moreThan1000Links')


def addLinkNumAttr(info, fileObject, index):
    """Append four link-count flags to *info*.

    The flags are, in order: more than 0, more than 10, more than 100 and
    more than 1000 <a> tags in the document.

    Args:
        info: List of attribute values for one document; mutated in place
            (four bools are appended, one per registered column name).
        fileObject: Markup source passed straight to BeautifulSoup
            (presumably an open HTML file -- confirm against the caller).
        index: Not used by this plugin; present so the signature matches
            the other functions stored in ``funcs``.

    Returns:
        None.
    """
    # NOTE(review): no parser is named, so bs4 picks among the parsers that
    # happen to be installed; the parse tree may differ between machines.
    soup = BeautifulSoup(fileObject)
    linkCount = len(soup.find_all('a'))
    # One value per registered column. Appending only the first two flags
    # would leave the row two values short of the four names added above.
    for threshold in (0, 10, 100, 1000):
        info.append(linkCount > threshold)


# Register the extractor with the driver script's plugin list, as the other
# plugins do; without this the four column names would have no producer.
funcs.append(addLinkNumAttr)
# Feature plugin: flags documents that contain at least one <em> tag.
from bs4 import BeautifulSoup
from __main__ import funcs, attrNames

# Column name for the single value appended by addEmphasisAttr.
attrNames.append('hasEmphasis')


def addEmphasisAttr(info, fileObject, index):
    """Append whether the document contains an <em> tag to *info*.

    Args:
        info: List of attribute values for one document; mutated in place
            (exactly one bool is appended).
        fileObject: Markup source passed straight to BeautifulSoup
            (presumably an open HTML file -- confirm against the caller).
        index: Not used by this plugin; present so the signature matches
            the other functions stored in ``funcs``.

    Returns:
        None.
    """
    # NOTE(review): no parser is named, so bs4 picks among the parsers that
    # happen to be installed; the parse tree may differ between machines.
    soup = BeautifulSoup(fileObject)
    # find() gives None when the tag is absent, so a single lookup answers
    # the yes/no question that the counting loop used to answer.
    hasEmphasis = soup.find('em') is not None
    info.append(hasEmphasis)


# Register the extractor with the driver script's plugin list.
funcs.append(addEmphasisAttr)
# Feature plugin: three threshold flags on the number of <img> tags.
from bs4 import BeautifulSoup
from __main__ import funcs, attrNames

# Three columns; addPhotoNumAttr appends three values in this same order.
attrNames.append('hasPhotos')
attrNames.append('moreThan10Photos')
attrNames.append('moreThan100Photos')


def addPhotoNumAttr(info, fileObject, index):
    """Append three image-count flags to *info*.

    The flags are, in order: more than 0, more than 10 and more than 100
    <img> tags in the document.

    Args:
        info: List of attribute values for one document; mutated in place
            (three bools are appended, one per registered column name).
        fileObject: Markup source passed straight to BeautifulSoup
            (presumably an open HTML file -- confirm against the caller).
        index: Not used by this plugin; present so the signature matches
            the other functions stored in ``funcs``.

    Returns:
        None.
    """
    # NOTE(review): no parser is named, so bs4 picks among the parsers that
    # happen to be installed; the parse tree may differ between machines.
    soup = BeautifulSoup(fileObject)
    photoCount = len(soup.find_all('img'))
    for threshold in (0, 10, 100):
        info.append(photoCount > threshold)


# Register the extractor with the driver script's plugin list. The column
# names above are registered unconditionally, so the function that fills
# them has to be registered too, as every other plugin does.
funcs.append(addPhotoNumAttr)
patterns = [] for i in range(0, titleMatchNum): pattern = '' while True: for i in range(0, 9): alphab = random.choice(alphabs) pattern = pattern + alphab if random.randint(0,1) == 0: break if pattern not in patterns: patterns.append(pattern) break for i in range(0, titleMatchNum): attrNames.append('titleMatchRegex:' + patterns[i]) def addTitleMatchAttr(info, fileObject, index): soup = BeautifulSoup(fileObject) title = '' if soup.title: title = soup.title.string if title: for i in range(0, titleMatchNum): m = re.search(patterns[i], title) if m: info.append(True) else: info.append(False) else: