Code example #1
# Assumed imports for this snippet (the original file is not shown in full):
import re

import nltk
import requests
from bs4 import BeautifulSoup
from topia.termextract import extract, tag


def terms(url):
    """Fetch a page, strip invisible markup, and extract key terms from it."""
    html = requests.get(url)
    content = html.content.decode("utf-8")
    soup = BeautifulSoup(content, "html.parser")

    # Remove elements whose text should not be part of the visible page text.
    [
        s.extract() for s in soup(
            ['style', 'script', '[document]', 'head', 'title', 'select'])
    ]
    visible_text = soup.getText()
    print(visible_text)

    # Persist the visible text so it can be reloaded through nltk below.
    with open('haha4.txt', 'w') as f:
        f.write(visible_text.encode('utf-8'))

    # Initialize the English POS tagger and build the extractor around it.
    tagger = tag.Tagger('english')
    tagger.initialize()
    extractor = extract.TermExtractor(tagger=tagger)

    # Drop the byte runs matched by this pattern before tagging.
    patt = "((?: [\x00-\x7F] | [\xC0-\xDF][\x80-\xBF] | [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF7][\x80-\xBF]{3}){1,100})"
    s = nltk.data.load('haha4.txt', format='raw').lower()
    s = re.sub(patt, '', s)

    extractor.tagger(s)
    # Keep all the terms, even the "weak" single-occurrence ones.
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)

    print(extractor(s))
    result = []
    for ss in extractor(s):
        # Split multi-word and hyphenated terms into individual words.
        for i in ss[0].split(" "):
            for j in i.split("-"):
                if j not in result:
                    result.append(j)

    print(result)

    with open("words.txt", "a") as myfile:
        for i in result:
            myfile.write(i + "\n")

    return result
Code example #2
File: nlp.py Project: Sandy4321/Word2Vec-Doc2Vec
def extract_keywords(doc, lower=False):
    extractor = extract.TermExtractor()
    extractor.filter = extract.DefaultFilter()

    keywords_list = []
    keywords = extractor(doc)

    for keyword in keywords:
        if lower:
            keywords_list.append(keyword[0].lower())
        else:
            keywords_list.append(keyword[0])

    return keywords_list
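
A minimal usage sketch for this helper (not from the original project): it assumes `from topia.termextract import extract` is imported at module level, and the sample text is invented for illustration.

# Hypothetical usage of extract_keywords above.
from topia.termextract import extract

sample = ("Term extraction pulls candidate terms out of free text. "
          "Term extraction is often the first step in keyword tagging.")
print(extract_keywords(sample, lower=True))
# Prints a plain list of term strings, lower-cased; multi-word terms
# such as "term extraction" come back as single strings.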
Code example #3
File: topia_test.py Project: souley/edison
def keyterms(text, language='english'):
    # initialize the tagger with the required language
    tagger = tag.Tagger(language)
    tagger.initialize()

    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    #    s = nltk.data.load('corpora/operating/td1.txt',format = 'raw')
    extractor.tagger(text)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    return extractor(text)
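
For orientation, a rough usage sketch (not part of the original project): the extractor returns (term, occurrences, strength) tuples, which is why later examples in this list index into positions 1 and 2 when scoring terms. The imports and sample sentence below are assumptions.

# Hypothetical usage of keyterms above.
from topia.termextract import extract, tag

sample = ("The term extractor tags the text, and then the term extractor "
          "scores the candidate terms it finds.")
for term, occurrences, strength in keyterms(sample):
    # strength is the number of words making up the term
    print("%s: occurrences=%d, strength=%d" % (term, occurrences, strength))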
Code example #4
    def POST(self):
        import sys
        import re
        import simplejson as json
        from topia.termextract import extract
        extractor = extract.TermExtractor()
        #extractor.filter = extract.permissiveFilter
        extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)

        def term_compare(x, y):
            # Python 2 cmp-style comparator: rank terms by occurrences + 2 * strength,
            # highest score first.
            if y[1] + y[2] * 2 > x[1] + x[2] * 2:
                return 1
            elif y[1] == x[1] and y[2] == x[2]:
                return 0
            else:  # x < y
                return -1

        input = web.input(callback=None)
        content = input.context.lower()
        content = content.replace(u"\u201c",
                                  '"').replace(u"\u201d", '"').replace(
                                      u"\u2018",
                                      "'").replace(u"\u2019",
                                                   "'").replace(u"\u2026", "")
        list = sorted(extractor(content), term_compare)
        list = list[:50]
        for i in range(len(list) - 1, -1, -1):
            if len(list[i][0]) == 1 or list[i][2] > 2 or (
                    list[i][0].find("http") >= 0) or not re.search(
                        '[a-z]', list[i][0]) or re.search('[0-9]', list[i][0]):
                list.remove(list[i])
            else:
                # keep only the stripped term string
                list[i] = list[i][0].strip()
        callback = input.callback
        pattern = r'[^a-zA-Z0-9 ]'
        for i in range(len(list) - 1, -1, -1):
            if re.search(pattern, list[i]):
                list.remove(list[i])
        if (len(sys.argv) > 2):
            length = int(sys.argv[2])
            if (len(list) > length):
                list = list[:length]
        list = json.dumps(list, indent=4)
        if callback and re.match('^[a-zA-Z0-9._\[\]]+$', callback):
            return callback + '(' + list + ')'
        else:
            return list
Code example #5
def terms(url):
    # Uses the same imports as code example #1, plus pickle.
    terms = {}
    url = "http://www." + url
    html = requests.get(url)
    content = html.content.decode("utf-8")
    soup = BeautifulSoup(content, "lxml")

    # Remove elements whose text should not be part of the visible page text.
    [
        s.extract()
        for s in soup(['style', 'script', '[document]', 'head', 'title'])
    ]
    visible_text = soup.getText()

    # Write the visible text to disk and count how often each character occurs.
    f = open('haha4.txt', 'w')
    f2 = open('keys', 'a')
    for i in visible_text:
        f.write(i.encode('utf-8'))
        if i not in terms:
            terms[i] = 1
        else:
            terms[i] = terms[i] + 1
    pickle.dump(terms, f2)
    f2.close()
    f.close()

    # Initialize the English POS tagger and build the extractor around it.
    tagger = tag.Tagger('english')
    tagger.initialize()
    extractor = extract.TermExtractor(tagger=tagger)

    # Tag the saved text and keep all the terms, even the "weak" ones.
    s = nltk.data.load('haha4.txt', format='raw')
    extractor.tagger(s)
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # print(extractor(s))

    return terms
Code example #6
    def __init__(self):
        self.extractor = extract.TermExtractor()
        self.extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=3)
Code example #7
File: parser.py Project: emanuil-tolev/cl
def parser(path='',
           blurb='',
           scale=3,
           minoccur=2,
           omitscores=False,
           boostphrases=False,
           size=0,
           raw=False):
    size = int(request.values.get('size', size))
    minoccur = int(request.values.get('minoccur', minoccur))
    scale = int(request.values.get('scale', scale))
    if 'omitscores' in request.values and \
            request.values['omitscores'].lower() not in ['false', '0', 'no']:
        omitscores = True
    if 'boostphrases' in request.values and \
            request.values['boostphrases'].lower() not in ['false', '0', 'no']:
        boostphrases = True

    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=minoccur)

    if path:
        url = path
    else:
        url = request.values.get('url', '')

    if url and not url.startswith('http://') and not url.startswith(
            'https://'):
        url = 'http://' + url

    if url:
        #try:
        # NOTE: GETTING THIS TO WORK PIP INSTALL SPYNNER WHICH REQUIRES
        # A BUNCH OF OTHER STUFF (I AM NOT SURE EXACTLY WHICH YET):
        # sudo apt-get install libx11 libx11-dev xvfb libxtst-dev libpng-dev
        # https://github.com/makinacorpus/spynner/#dependencies
        # still fails on importing PyQt4 which is on my system but can't
        # get it into my virtual env. Rely on non-js page content for now.
        #import spynner
        #browser = spynner.Browser()
        #browser.create_webview(True)
        #browser.load(url, load_timeout=60)
        #c = browser._get_html()
        #browser.close()
        #except:
        loc = app.config.get('BASE_URL', 'http://cottagelabs.com')
        if url.startswith(loc):
            lloc = url.replace(loc, '')
            if lloc == '': lloc = '/index'
            rec = models.Pages().pull_by_url(lloc)
            if rec is not None:
                c = rec.data.get('content', '')
            else:
                c = ''
        else:
            g = requests.get(url)
            c = g.text
    else:
        c = request.values.get('blurb', blurb)

    if c:
        content = _html_text(c)
        terms = extractor(content)
        result = {}
        # Each extracted term is a (term, occurrences, strength) tuple, where
        # strength is the number of words in the term.
        for t, o, l in terms:
            if boostphrases:
                # Boost multi-word phrases by adding their word count.
                ct = o + l
            else:
                ct = o
            result[t.lower()] = ct * scale

        res = [
            {"term": i[0], "score": i[1]}
            for i in sorted(result.items(), key=lambda x: x[1], reverse=True)
            if len(i[0].replace(' ', '')) > 2
            and i[0].replace(' ', '') not in url
            and i[0][0] in string.ascii_letters
            and i[0][-1] in string.ascii_letters
        ]

        if omitscores: res = [i['term'] for i in res]
        if size != 0: res = res[:size]

    else:
        res = []

    if raw:
        return res
    else:
        resp = make_response(json.dumps(res))
        resp.mimetype = "application/json"
        return resp
Code example #8
File: huify.py Project: NetBUG/huifier
#coding=utf-8
from bs4 import BeautifulSoup
import html2text
import requests
from topia.termextract import extract
from topia.termextract import tag

tagger = tag.Tagger('english')
tagger.initialize()

# create the extractor with the tagger
extractor = extract.TermExtractor(tagger=tagger)
extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=3)
url = "http://habrahabr.ru"
doc = requests.get(url).content
doc = BeautifulSoup(doc, from_encoding="utf-8")
[s.extract() for s in doc(u'script')]
[s.extract() for s in doc(u'style')]
doc = doc.get_text()
#doc = html2text.html2text(doc)
kw = extractor(doc)
''' test: huify(u'эту неделю').encode('utf-8') == u'эту xyеделю' '''


def huify(expr):
    word = expr
    if expr.rfind(' ') > -1:
        word = expr[expr.rindex(' ') + 1:]
    vowels = set(u'aeiouyаеиоуыяюё')
    mz = len(word)
    for vowel in vowels:
Code example #9
import os

# Imports assumed from the rest of the original file (only an excerpt is shown here):
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark_cassandra import CassandraSparkContext
from stop_words import get_stop_words
from topia.termextract import extract
'''
Spark test script
 
- How to run:
	spark-submit --master local[4] test.py
 
- contains examples to learn how to use the Spark Python API
 
'''
 
 
stop_words = get_stop_words('english')
stop_words.append("https ://t")
extractor = extract.TermExtractor()
extractor.filter = extract.DefaultFilter()
if __name__ == "__main__":
	# command line arguments?
	# Spark configuration
	conf = SparkConf().setAppName("Tweet Data by City")
	sc = CassandraSparkContext(conf=conf)
	sqlContext = SQLContext(sc)
 
	# read files
	rawTweets = sqlContext.read.json("../tweets.json")
	
	# split the data into two sets
	# rawTweets.geo is of type 'Column', which has the isNotNull() and isNull() functions to check for nullity
	geoTweets = rawTweets.filter(rawTweets.geo.isNotNull())
	placeTweets = rawTweets.filter(rawTweets.geo.isNull())
 
Code example #10
def extract_keyword(text):
    extractor = extract.TermExtractor()
    # Single-word terms must occur at least twice; the very large noLimitStrength
    # keeps multi-word terms from being accepted on word count alone.
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2,
                                             noLimitStrength=65535)
    keywords = sorted(extractor(text))
    return keywords
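
And a short, hypothetical call for this last helper, again assuming `from topia.termextract import extract` in the original module and using invented sample text:

# Hypothetical usage of extract_keyword above.
from topia.termextract import extract

sample = ("The payment gateway validates every payment. A broken payment "
          "gateway rejects every payment it receives.")
for term, occurrences, strength in extract_keyword(sample):
    print("%s (occurrences=%d, strength=%d)" % (term, occurrences, strength))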