Example #1
def collocations(query, lang):
    langMap = {'es': 'Spanish', 'en': 'English'}
    stemmer = nltk.stem.snowball.SnowballStemmer(langMap[lang].lower())
    j = wikiApi.get_article(query, lang)
    wordDict = {}
    corpus = ''
    # Build a single plain-text corpus from every page returned for the query.
    for page in j:
        wikitext = wikiParser(j[page]['content']).text
        bfSoup = ' '.join(BeautifulSoup(wikitext).findAll(text=True))
        corpus = corpus + " " + bfSoup
    tokens = nltk.wordpunct_tokenize(corpus)
    assert tokens
    # Score bigrams that co-occur within a 20-token window and appear at least 4 times.
    finder = BigramCollocationFinder.from_words(tokens, window_size=20)
    finder.apply_freq_filter(4)
    # Note: the stopword list is always English here, even when lang is 'es'.
    ignored_words = nltk.corpus.stopwords.words('english')
    ignored_words.extend(htmltags)
    finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in ignored_words)
    a = finder.nbest(bigram_measures.likelihood_ratio, 500)
    # Keep the 500 strongest pairs by likelihood ratio, dropping mirrored
    # duplicates such as (b, a) when (a, b) is already present.
    final = []
    for k in a:
        if k in final or (k[1], k[0]) in final:
            continue
        final.append(k)
    return final
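
The snippets on this page omit their module-level setup. The sketch below lists the imports and globals Example #1 appears to rely on; wikiApi and wikiParser are project-specific modules, and the htmltags value shown is purely illustrative, not the project's actual list.

# Assumed module-level setup (a sketch, not the project's real header).
import re
import nltk
from nltk.collocations import BigramCollocationFinder
from bs4 import BeautifulSoup   # older revisions likely used: from BeautifulSoup import BeautifulSoup
from unidecode import unidecode

nltk.download('stopwords')      # run once so nltk.corpus.stopwords is available
bigram_measures = nltk.collocations.BigramAssocMeasures()
htmltags = ['ref', 'http', 'www', 'div', 'span']   # hypothetical markup tokens to filter out
# wikiApi (article fetcher) and wikiParser (markup parser) come from the project itself.

pairs = collocations('Human flu', 'en')   # query taken from the script in Example #3; returns up to 500 word pairs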
Example #2
def main(query, lang):
    langMap = {'es': 'Spanish', 'en': 'English'}
    stemmer = nltk.stem.snowball.SnowballStemmer(langMap[lang].lower())
    j = wikiApi.get_article(query, lang)
    wordDict = {}
    stopwords = nltk.corpus.stopwords.words(langMap[lang].lower())
    for page in j:
        t = wikiParser(j[page]['content'])
        # Count section headers, retrying with an ASCII transliteration when the
        # stemmer cannot handle the original string.
        for header in t.headers:
            try:
                stemmedHeader = stemmer.stem(header)
            except Exception, e:
                print str(e)
                header = unidecode(header)
                stemmedHeader = stemmer.stem(header)
            if stemmedHeader in wordDict:
                wordDict[stemmedHeader]['count'] += 1
            else:
                wordDict[stemmedHeader] = {'count': 1, 'form': stemmedHeader}
        text = t.text
        # Keep alphabetic tokens (dropping wiki link labels after '|') that are
        # not stopwords for the requested language.
        tokens = [
            k.split('|')[0] for k in nltk.PunktWordTokenizer().tokenize(text)
            if re.match('[a-zA-Z]', k)
        ]
        words = [
            w.lower() for w in tokens
            if w.encode('utf-8').lower() not in stopwords
        ]
        # Tally word stems, again falling back to an ASCII transliteration.
        for w in words:
            try:
                st = stemmer.stem(w)
            except Exception, e:
                w = unidecode(w)
                st = stemmer.stem(w)
            if st in wordDict:
                wordDict[st]['count'] += 1
            else:
                wordDict[st] = {'count': 1, 'form': w}
    return wordDict
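
With the return wordDict added above, the tally can be ranked by frequency. Below is a minimal follow-up sketch, not project code; the query string is just the "cold - flu" example from the usage note in Example #3, and the top-20 cut-off is arbitrary.

# Rank stems by how often they occurred (a usage sketch).
wordDict = main('cold - flu', 'en')
topStems = sorted(wordDict.items(), key=lambda kv: kv[1]['count'], reverse=True)[:20]
for st, info in topStems:
    print "%-20s %-20s %d" % (st, info['form'], info['count'])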
Example #3
            out.write("%s,%s\n" % (k.replace('/', '_'), weeklyViews[k]))
        out.close()

        return weeklyViews

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print 'Usage: Wiki "query_string"'
        print 'Note: use "-" for negation. Example "cold - flu"'
        exit()
    query = sys.argv[1]
    lang = sys.argv[2]
    print query
    j = wikiApi.get_article(query, lang)
    print j.keys()
    a = wikiParser(j['Human flu']['content'])
    out = open('wikiMarkup.example', 'w')
    out.write(j["Human flu"]["content"].encode('utf-8'))
    out.close()
    exit()

    print "headers %s", a.headers
    raw_input()
    print "links %s", a.links
    raw_input()
    print "websites %s", a.websiteRef
    raw_input()
    print "sections %s", a.sections
    raw_input()
    print "links %s", a.links
    raw_input()
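
If the early exit() is removed, the prompts above step through the parsed article one field at a time. A compact, non-interactive equivalent, assuming the same wikiParser object a and the attribute names used above:

# Print every parsed field in one pass, without waiting for key presses.
for name in ('headers', 'links', 'websiteRef', 'sections'):
    print "%s: %s" % (name, getattr(a, name))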