Example #1
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from bs4 import BeautifulSoup

# wikiApi, wikiParser and htmltags are project-level helpers assumed to be
# defined elsewhere in the original module.
bigram_measures = BigramAssocMeasures()


def collocations(query, lang):
    langMap = {'es': 'Spanish', 'en': 'English'}
    stemmer = nltk.stem.snowball.SnowballStemmer(langMap[lang].lower())  # unused here
    j = wikiApi.get_article(query, lang)
    wordDict = {}  # unused here
    corpus = ''
    # Concatenate the plain text of every returned page into a single corpus.
    for page in j:
        wikitext = wikiParser(j[page]['content']).text
        bfSoup = ' '.join(BeautifulSoup(wikitext, 'html.parser').find_all(text=True))
        corpus = corpus + " " + bfSoup
    tokens = nltk.wordpunct_tokenize(corpus)
    assert tokens
    # Keep bigrams that co-occur within a 20-token window at least 4 times,
    # ignoring stopwords, HTML tag names and very short tokens.
    finder = BigramCollocationFinder.from_words(tokens, window_size=20)
    finder.apply_freq_filter(4)
    ignored_words = nltk.corpus.stopwords.words('english')
    ignored_words.extend(htmltags)
    finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in ignored_words)
    # Rank candidates by log-likelihood ratio, keep the top 500, and drop
    # pairs that already appear in the opposite order.
    a = finder.nbest(bigram_measures.likelihood_ratio, 500)
    final = []
    for k in a:
        if k in final or (k[1], k[0]) in final:
            continue
        final.append(k)
    return final
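The core of this example is NLTK's BigramCollocationFinder. The toy sketch below runs the same window / frequency-filter / likelihood-ratio pipeline on a hard-coded sentence, so it needs neither wikiApi nor the stopwords corpus; the sample text and the window size of 5 are made up for illustration.

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# Toy corpus instead of Wikipedia article text (illustrative only).
tokens = nltk.wordpunct_tokenize(
    "new york is a big city , and new york never sleeps ; "
    "the harbour of new york is very big")
finder = BigramCollocationFinder.from_words(tokens, window_size=5)
finder.apply_word_filter(lambda w: len(w) < 3)  # drop punctuation and short words
# Top pairs by log-likelihood ratio; ('new', 'york') should rank near the top.
print(finder.nbest(BigramAssocMeasures().likelihood_ratio, 3))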
Example #2
import re

import nltk
from unidecode import unidecode

# wikiApi and wikiParser are project-level helpers assumed to be defined
# elsewhere in the original module.


def main(query, lang):
    langMap = {'es': 'Spanish', 'en': 'English'}
    stemmer = nltk.stem.snowball.SnowballStemmer(langMap[lang].lower())
    j = wikiApi.get_article(query, lang)
    wordDict = {}
    for page in j:
        t = wikiParser(j[page]['content'])
        # Count the stemmed section headers of each page.
        for header in t.headers:
            try:
                stemmedHeader = stemmer.stem(header)
            except Exception as e:
                # Fall back to an ASCII transliteration if stemming fails.
                print(e)
                header = unidecode(header)
                stemmedHeader = stemmer.stem(header)
            if stemmedHeader in wordDict:
                wordDict[stemmedHeader]['count'] += 1
            else:
                wordDict[stemmedHeader] = {'count': 1, 'form': stemmedHeader}
        text = t.text
        print(type(text))  # debug output
        # PunktWordTokenizer was removed from recent NLTK releases;
        # wordpunct_tokenize is used here as a close stand-in.
        tokens = [
            k.split('|')[0] for k in nltk.wordpunct_tokenize(text)
            if re.match('[a-zA-Z]', k)
        ]
        # Drop stopwords for the article's language before counting.
        words = [
            w.lower() for w in tokens if w.lower() not in
            nltk.corpus.stopwords.words(langMap[lang].lower())
        ]
        print(len(words))  # debug output
        # Tally the stemmed form of every remaining word, keeping one surface
        # form per stem.
        for w in words:
            try:
                st = stemmer.stem(w)
            except Exception:
                # Retry on an ASCII transliteration if stemming fails.
                w = unidecode(w)
                st = stemmer.stem(w)
            if st in wordDict:
                wordDict[st]['count'] += 1
            else:
                wordDict[st] = {'count': 1, 'form': w}
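The counting loop above reduces to stemming each non-stopword token and tallying occurrences per stem while remembering one surface form. Below is a minimal self-contained sketch of that pattern on a made-up English sentence; it assumes only that NLTK and its stopwords corpus (nltk.download('stopwords')) are available.

import nltk
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

wordDict = {}
for w in nltk.wordpunct_tokenize("Cats keep chasing other cats around the garden"):
    w = w.lower()
    if not w.isalpha() or w in stopwords:
        continue
    st = stemmer.stem(w)
    if st in wordDict:
        wordDict[st]['count'] += 1   # same stem seen before: bump the count
    else:
        wordDict[st] = {'count': 1, 'form': w}   # first occurrence: remember the surface form
print(wordDict)   # e.g. {'cat': {'count': 2, 'form': 'cats'}, ...}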