def getText(url):
    """Fetch *url* and return its extracted text as a stripped unicode string.

    The response body is read in full, decoded as UTF-8, and passed to
    ``extMainText`` (defined elsewhere; presumably it extracts the main
    content of the page -- confirm against its definition).  Leading and
    trailing whitespace is stripped from the result.

    Args:
        url: Location to open with ``urllib.urlopen``.

    Returns:
        The stripped result of ``extMainText`` applied to the decoded page.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
        Errors raised while opening or reading the URL propagate unchanged.
    """
    response = urllib.urlopen(url)
    try:
        raw = response.read()
    finally:
        # Close explicitly so the connection is released even if read() fails.
        response.close()

    page = unicode(raw, "utf-8")
    text = extMainText(page).strip()
    # NOTE: a former heuristic that cut the text off at the first occurrence
    # of its longest run of two or more whitespace characters is disabled;
    # the full stripped text is returned.
    return text
def getText(url):
    """Download *url*, decode it as UTF-8, and return its extracted text.

    The decoded page is handed to ``extMainText`` (defined elsewhere;
    presumably a main-content extractor -- confirm against its definition)
    and the result is returned with surrounding whitespace stripped.

    Args:
        url: Location to open with ``urllib.urlopen``.

    Returns:
        The stripped result of ``extMainText`` applied to the decoded page.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
        Errors raised while opening or reading the URL propagate unchanged.
    """
    handle = urllib.urlopen(url)
    try:
        body = handle.read()
    finally:
        # Always release the underlying connection, including when read()
        # raises part-way through the download.
        handle.close()

    decoded = unicode(body, "utf-8")
    return extMainText(decoded).strip()