import re

# Match absolute http(s) URLs on the ics.uci.edu domain.
# Dots are escaped so '.' is a literal dot, not the regex "any character"
# wildcard (the unescaped pattern also matched e.g. "http://statics-uci-edu").
re_matcher = re.compile(r"^https?://.*ics\.uci\.edu")


def get_links(html):
    """Return the href values of all <a> tags in *html* whose href
    matches the module-level ``re_matcher`` pattern.

    :param html: HTML document as a string (or bytes) parseable by
        BeautifulSoup's ``html.parser`` backend.
    :return: list of href strings, in document order.
    """
    soup = BeautifulSoup(html, "html.parser")
    # find_all is the modern bs4 name; findAll is a deprecated alias.
    return [a.get('href')
            for a in soup.find_all('a', attrs={'href': re_matcher})]


def hasdigit(token):
    """Return True if *token* contains at least one digit character."""
    for ch in token:
        if ch.isdigit():
            return True
    return False


# Module-level stopword checker shared by check_token below.
# Stopwords comes from the project's `stopwords` module; presumably it
# exposes is_stop(token) -> bool — confirm against that module.
stopw = Stopwords()


def check_token(token):
    return not stopw.is_stop(token) and not hasdigit(
        token) and len(token) > 1 and len(token) < 20


def add_token(token):
    """Stub — currently does nothing with *token*.

    NOTE(review): presumably intended to record the token in an index
    or frequency table; confirm intended behavior before relying on it.
    """
    pass


# Matches any single character that is NOT a lowercase letter, a digit,
# or an apostrophe (raw string: the pattern contains no escapes, so the
# compiled regex is identical).
nonalphanum = re.compile(r"[^0-9a-z']")


def tokenize_text(intext):
# --- Beispiel #2 (paste-artifact marker: a second, separate example begins here) ---
import atexit
import os

from zope.component import provideUtility
from zope.component.interfaces import IFactory
from zope.component.testing import setUp
from zopyx.txng3.core.interfaces import IParser, IStopwords, IThesaurus
from zopyx.txng3.core.lexicon import LexiconFactory
from zopyx.txng3.core.storage import StorageWithTermFrequencyFactory
from zopyx.txng3.core.thesaurus import GermanThesaurus

from index import Index
from parsers.english import EnglishParser
from splitter import SplitterFactory
from stopwords import Stopwords

# Set up the zope.component registry and register the TextIndexNG3
# utilities (splitter, parser, stopwords, lexicon, storage, thesaurus)
# that the indexing code resolves by interface + name.
setUp()
_utilities = (
    (SplitterFactory, IFactory, 'txng.splitters.default'),
    (EnglishParser(), IParser, 'txng.parsers.en'),
    (Stopwords(), IStopwords, 'txng.stopwords'),
    (LexiconFactory, IFactory, 'txng.lexicons.default'),
    (StorageWithTermFrequencyFactory, IFactory, 'txng.storages.default'),
    (GermanThesaurus, IThesaurus, 'txng.thesaurus.de'),
)
for _component, _iface, _name in _utilities:
    provideUtility(_component, _iface, _name)

# Best-effort interactive shell history: load ~/.pyhist now and arrange
# for it to be saved on interpreter exit.
try:
    import atexit
    import os
    import readline

    histfile = os.path.expanduser('~/.pyhist')
    readline.read_history_file(histfile)
    atexit.register(readline.write_history_file, histfile)
except (ImportError, OSError):
    # No readline module (e.g. Windows) or no/unreadable history file yet.
    # Narrowed from a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt.
    pass


class Text: