Ejemplo n.º 1
0
 def getTags(self):
     """Return the list of tags describing ``self.PASSED_STRING``.

     When ``self.PASSED_STRING`` ends in ".pdf" it is first replaced (on the
     instance) by the text that ``pydfconversion.getPDFText`` extracts from
     that file.  The text is then sent to DatumBox and the tags are
     collected in this order: the topic classification, "Educational" or
     "Not Educational", the readability assessment, and every extracted
     keyword.  Any tag equal to one of the common words listed below, or to
     a single lowercase letter, is dropped before returning.

     Returns:
         list: the remaining tags, in the order described above.
     """
     # Very common words and single letters that only add clutter as tags.
     # Kept as a set so each membership test is a single lookup.
     common_words = frozenset((
         "time person year way day thing man world life hand part child "
         "eye woman place work week case point company group problem fact "
         "be have do say get make go know take see come think look want "
         "give use find tell ask seem feel try leave call "
         "good new first last long great little own other old right big "
         "high different small large next early young important few "
         "public bad same able "
         "to of in for on with at by from up about into over after "
         "beneath under above "
         "the and a that I it not he as you this but his they her she or "
         "an will my one all would there their is "
         "a b c d e f g h i j k l m n o p q r s t u v w x y z"
     ).split())

     datum_box = DatumBox(API_KEY)

     # If PASSED_STRING names a PDF document, evaluate the document's text
     # instead; the extracted text is stored back on the instance.
     if self.PASSED_STRING.endswith(".pdf"):
         self.PASSED_STRING = pydfconversion.getPDFText(self.PASSED_STRING)
     text = self.PASSED_STRING

     # First tag: the topic reported by the classifier.
     tags = [datum_box.topic_classification(text)]

     # Second tag: ask the API once and branch on the answer.
     if datum_box.is_educational(text):
         tags.append("Educational")
     else:
         tags.append("Not Educational")

     # Third tag: readability / difficulty, followed by all keywords.
     tags.append(datum_box.readability_assessment(text))
     tags.extend(datum_box.keyword_extract(text))

     # Build a new list rather than removing while iterating, and return
     # only after every tag has been checked.
     return [tag for tag in tags if tag not in common_words]
Ejemplo n.º 2
0
from DatumBox import DatumBox

# Demo: send one short sentence to DatumBox and print the extracted keywords.
# NOTE(review): the API key is committed in source; consider loading it from
# configuration instead.
API_KEY = "2a13913dda346761765020c1f66e34f8"
datum_box = DatumBox(API_KEY)
keywords = datum_box.keyword_extract("I hate my cat and love my dog")
print(keywords)

Ejemplo n.º 3
0
# DatumBox API key used to build the client below.
# NOTE(review): the key is committed in source; consider loading it from
# configuration instead.
API_KEY = "2a13913dda346761765020c1f66e34f8"

import unittest
from DatumBox import DatumBox, DatumBoxError


# Sample tweet with positive wording (not used by the lines shown here).
positive_tweet = """Also, @xDaniielle has understood she can't beat me playing GTA. So instead, she's grabbing a copy too. I have an amazing girlfriend :D"""

#The below tweet was not written by me, I would never say such a horrid thing.
negative_tweet = """Gah! I hate programming. Been pissing me off all day. Time to go sit on the sofa in a huff with a beer :("""

#Begin tests
datum_box = DatumBox(API_KEY)

# Run the Twitter sentiment analysis on the negative sample tweet.
result = datum_box.twitter_sentiment_analysis(negative_tweet)



# Show whatever the API wrapper returned.
print(result)
Ejemplo n.º 4
0
import requests
from bs4 import BeautifulSoup
from DatumBox import DatumBox

# NOTE(review): the API key is committed in source; consider loading it from
# configuration instead.
API_KEY = "2a13913dda346761765020c1f66e34f8"
datum_box = DatumBox(API_KEY)

# Download the article and parse the returned HTML.
url = "http://social.yourstory.com/2013/09/how-nit-warangal-lakshya-foundation-bridged-gap-alumni-and-students/"
r = requests.get(url)
soup = BeautifulSoup(r.content)

# Concatenate the text of every post-content <div> and show it.
g_data = soup.find_all("div", {"class": "ys_post_content text"})
s = "".join(item.text for item in g_data)
print(s)

# Extract keywords from the ASCII-only form of the article text.
x = datum_box.keyword_extract(s.encode('ascii', 'ignore'))
keys = ["NIT", "RECW", "WARANGAL"]
for i in x:
    ascii_keyword = i.encode('ascii', 'ignore')
    print(ascii_keyword)
    # Keywords that match one of the watched names are printed a second time.
    if ascii_keyword.upper() in keys:
        print(ascii_keyword)
		


Ejemplo n.º 5
0
from DatumBox import DatumBox
# Shared DatumBox client.
# NOTE(review): the API key is committed in source; consider loading it from
# configuration instead.
datum_box = DatumBox("2a13913dda346761765020c1f66e34f8")
#import networkx as nx
#import matplotlib.pyplot as plt
import urllib2
import re
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# Pattern matching a "<...>" run that contains no ">" inside it.
TAG_RE = re.compile(r"<[^>]+>")

# Ask NLTK's downloader for the 'punkt' resource when the module is loaded.
nltk.download('punkt')
stemmer = nltk.stem.porter.PorterStemmer()

# Maps the code point of every character in string.punctuation to None.
remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))


def fetch_page(siteURL):
    # create a variable which will hold our desired web page as a string
    site = siteURL
    # create the approprriate headers for our http request so that we wont run
    # into any 403 forbidden errors. All of this will be available at the tutorial
    # page that I will link to in the description below
    hdr = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
Ejemplo n.º 6
0
import requests
from bs4 import BeautifulSoup
from DatumBox import DatumBox

# NOTE(review): the API key is committed in source; consider loading it from
# configuration instead.
API_KEY = "2a13913dda346761765020c1f66e34f8"
datum_box = DatumBox(API_KEY)

# Fetch the article page and hand the raw bytes to BeautifulSoup.
url = "http://social.yourstory.com/2013/09/how-nit-warangal-lakshya-foundation-bridged-gap-alumni-and-students/"
r = requests.get(url)
soup = BeautifulSoup(r.content)

# Gather the text of each matching <div> and glue the pieces together.
g_data = soup.find_all("div", {"class": "ys_post_content text"})
pieces = [item.text for item in g_data]
s = "".join(pieces)
print(s)

# Keyword extraction runs on the text with non-ASCII characters dropped.
x = datum_box.keyword_extract(s.encode('ascii', 'ignore'))
keys = ["NIT", "RECW", "WARANGAL"]
for i in x:
    plain = i.encode('ascii', 'ignore')
    print(plain)
    # A keyword whose upper-cased form is in `keys` is printed again.
    if plain.upper() in keys:
        print(plain)
Ejemplo n.º 7
0
# csvW=csv.writer(csvFile)
# for item in outputFile:
#     csvW.writerow(item)
#
# csvFile.close()





# Run DatumBox sentiment analysis on the title and the content of every row in
# `outputFile` (defined earlier, outside the lines shown here).
# Per the original author's note the API answers positive, negative or neutral.
# The title results are collected in `eTitle`, the content results in `eBody`.

from DatumBox import DatumBox
# NOTE(review): the API key is committed in source; consider loading it from
# configuration instead.
datum_box = DatumBox('d3ce53ca1cead4e08490df097c890967')
eTitle=[]
eBody=[]
for item in outputFile:
    # Index 1 is read as the title and index 3 as the content of the row.
    title=item[1]
    content=item[3]
    eTitle.append(datum_box.sentiment_analysis(title))
    eBody.append(datum_box.sentiment_analysis(content))




# Zip the analysed titles and bodies together with `dates` and `urls`
# (both defined outside the lines shown here).
emoted=zip(dates,urls, eTitle, eBody)
# NOTE(review): `e` is not closed anywhere in the visible lines - confirm that
# it is closed further down.
e=open('sentimentOut.csv', 'wb')
eWriter=csv.writer(e)
Ejemplo n.º 8
0
import sys
import requests
import re

from DatumBox import DatumBox


# Compiled once at import time instead of on every call.
# DOTALL lets "." cross line breaks, so a <script>/<style> block (or a tag)
# that spans several lines is still matched; IGNORECASE also catches
# upper-case spellings such as <SCRIPT> or <STYLE>.
_SCRIPT_BLOCK_RE = re.compile(r'<script.*?/script>', re.DOTALL | re.IGNORECASE)
_STYLE_BLOCK_RE = re.compile(r'<style.*?/style>', re.DOTALL | re.IGNORECASE)
_ANY_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def stripTags(text):
    """Return *text* with script blocks, style blocks and tags blanked out.

    Each ``<script>...</script>`` block, each ``<style>...</style>`` block
    and each remaining ``<...>`` tag is replaced by a single space, in that
    order, so the contents of script and style blocks never reach the
    output.

    Args:
        text: the markup to clean.

    Returns:
        The cleaned text; an empty input yields an empty string.
    """
    # Whole blocks go first; otherwise only their tags would be removed and
    # the script/style source would be left behind as text.
    text = _SCRIPT_BLOCK_RE.sub(' ', text)
    text = _STYLE_BLOCK_RE.sub(' ', text)
    text = _ANY_TAG_RE.sub(' ', text)

    return text


# Fetch the page named by the first command-line argument and keep its body
# as UTF-8 encoded bytes.
request = requests.get(sys.argv[1])
content = request.text.encode('utf-8')
#content = '<script>sddd</script><style sdfsafs>sfsdfsfs</style>my text<be>betkk</be>'
print(content)

# Remove the markup and show what is left.
raw_content = stripTags(content)
print(raw_content)

# Send the cleaned text to DatumBox's Twitter sentiment endpoint.
# NOTE(review): the API key is committed in source; consider loading it from
# configuration instead.
db = DatumBox('9eb37f7399b4d074c5b83358f24ba626')
res = db.twitter_sentiment_analysis(raw_content)
print(res)
Ejemplo n.º 9
0
import requests
import re

from DatumBox import DatumBox

# Compiled once at import time instead of on every call.
# DOTALL lets "." cross line breaks, so a <script>/<style> block (or a tag)
# that spans several lines is still matched; IGNORECASE also catches
# upper-case spellings such as <SCRIPT> or <STYLE>.
_SCRIPT_BLOCK_RE = re.compile(r'<script.*?/script>', re.DOTALL | re.IGNORECASE)
_STYLE_BLOCK_RE = re.compile(r'<style.*?/style>', re.DOTALL | re.IGNORECASE)
_ANY_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def stripTags(text):
  """Return *text* with script blocks, style blocks and tags blanked out.

  Each ``<script>...</script>`` block, each ``<style>...</style>`` block and
  each remaining ``<...>`` tag is replaced by a single space, in that order,
  so the contents of script and style blocks never reach the output.

  Args:
    text: the markup to clean.

  Returns:
    The cleaned text; an empty input yields an empty string.
  """
  # Whole blocks go first; otherwise only their tags would be removed and the
  # script/style source would be left behind as text.
  text = _SCRIPT_BLOCK_RE.sub(' ', text)
  text = _STYLE_BLOCK_RE.sub(' ', text)
  text = _ANY_TAG_RE.sub(' ', text)

  return text


import sys  # sys.argv is used below but this script never imported sys

# The page URL must be given as the first command-line argument.
if len(sys.argv) < 2:
    sys.exit("usage: pass the URL of the page to analyse as the first argument")

# Download the page and keep its body as UTF-8 encoded bytes.
request = requests.get(sys.argv[1])
content = request.text
#content = '<script>sddd</script><style sdfsafs>sfsdfsfs</style>my text<be>betkk</be>'
content = content.encode('utf-8')
print(content)

# Remove the markup and show what is left.
raw_content = stripTags(content)
print(raw_content)

# Send the cleaned text to DatumBox's Twitter sentiment endpoint.
# NOTE(review): the API key is committed in source; consider loading it from
# configuration instead.
db = DatumBox('9eb37f7399b4d074c5b83358f24ba626')
res = db.twitter_sentiment_analysis(raw_content)
print(res)


Ejemplo n.º 10
0
            </div>
        </div>
        <div id="about">
            <h2> About Us</h2>
            <p> Something blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop</p>        
        </div>
    </div>

  </body>

</html>
"""


#Begin tests
# Client built with API_KEY (defined outside the lines shown here).
datum_box = DatumBox(API_KEY)
# Client built with a deliberately invalid key, used for the error-path test.
bad_datum_box = DatumBox("This-API-key-is-not-valid-(hopefully)")
    

class TestSentimentAnalysis(unittest.TestCase):
    # Exercises sentiment_analysis through the module-level clients
    # `datum_box` and `bad_datum_box`.

    def test_positive_review(self):
        # The positive sample review must be labelled "positive".
        label = datum_box.sentiment_analysis(positive_review)
        self.assertEqual(label, "positive")

    def test_negative_review(self):
        # The negative sample review must be labelled "negative".
        label = datum_box.sentiment_analysis(negative_review)
        self.assertEqual(label, "negative")

    def test_bad_api_key(self):
        # The client built with the invalid key must raise DatumBoxError.
        with self.assertRaises(DatumBoxError):
            bad_datum_box.sentiment_analysis(positive_review)
        
class TestTwitterSentimentAnalysis(unittest.TestCase):
Ejemplo n.º 11
0
import urllib
from bs4 import BeautifulSoup
import csv
import urllib2
import re
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet
from itertools import chain
from nltk.corpus import stopwords
from DatumBox import DatumBox
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
# Shared DatumBox client.
# NOTE(review): the API key is committed in source; consider loading it from
# configuration instead.
datum_box = DatumBox("2a13913dda346761765020c1f66e34f8")

# Pattern matching a "<...>" run that contains no ">" inside it.
TAG_RE = re.compile(r"<[^>]+>")

# Ask NLTK's downloader for the 'punkt' resource when the module is loaded.
nltk.download('punkt')
stemmer = nltk.stem.porter.PorterStemmer()

# Maps the code point of every character in string.punctuation to None.
remove_punctuation_map = {ord(char): None for char in string.punctuation}

# NLTK's English stop-word list, fetched once and kept for reuse.
cachedStopWords = stopwords.words("english")


def remove_tags(text):
    """Return *text* with every match of ``TAG_RE`` deleted."""
    return re.sub(TAG_RE, '', text)


def stem_tokens(tokens):
    """Return a list holding ``stemmer.stem(token)`` for each token, in order."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems


'''remove punctuation, lowercase, stem'''
Ejemplo n.º 12
0
from DatumBox import DatumBox
# NOTE(review): the API key is committed in source; consider loading it from
# configuration instead.
API_KEY = "454ec357b72e7d0c06cac8df90bb8862"
datum_box = DatumBox(API_KEY)
# Extract keywords from a long sample paragraph about the HAL/S language and
# print whatever the API wrapper returns.
print datum_box.keyword_extract("HAL/S was designed not to include some constructs that are thought to be the cause of errors. For instance, there is no support for dynamic memory allocation. The language provides special support for real-time execution environments. Some features, such as \"GOTO\" were provided chiefly to ease mechanical translations from other languages.[3] On the Preface page of the HAL/S Language Specification,[4] it says, fundamental contributions to the concept and implementation of MAC were made by Dr. J. Halcombe Laning of the Draper Laboratory. \"HAL\" was suggested as the name of the new language by Ed Copps, a founding director of Intermetrics, to honor Hal Laning, a colleague at MIT. A proposal for a NASA standard ground-based version of HAL named HAL/G for \"ground\" was proposed, but the coming emergence of the soon to be named Ada programming language contributed to Intermetrics' lack of interest in continuing this work. Instead, Intermetrics would place emphasis on what would be the \"Red\" finalist which would not be selected.HAL/S is a mostly free-form language: statements may begin anywhere on a line and may spill over the next lines, and multiple statements may be fitted onto the same line if required. However, non-space characters in the first column of a program line may have special significance. For instance, the letter \'C\' in the first column indicates that the whole line is a comment and should be ignored by the compiler. One particularly interesting feature of HAL/S is that it supports, in addition to a normal single line text format, an optional three-line input format in which three source code lines are used for each statement. In this format, the first and third lines are usable for superscripts (exponents) and subscripts (indices). The multi-line format was designed to permit writing of HAL/S code that is similar to mathematical notation.")
Ejemplo n.º 13
0
#!/usr/bin/python
import praw
from DatumBox import DatumBox
import time

user_agent = "A continuous sentiment analysis of the front-page"
r = praw.Reddit(user_agent=user_agent)
# NOTE(review): the Reddit credentials and the DatumBox key below are
# committed in source; move them to configuration and rotate them.
r.login('LowLanding', 'Lespaul94', disable_warning=True)

datum_box = DatumBox('c3ac3635add7e6d13ce0218ff033674f')

# IDs of submissions already analyzed; a set keeps the lookup cheap as the
# script keeps running.
already_done = set()

# Poll the top submissions forever, analysing each submission only once so
# that repeated polls do not spend API calls on the same posts.
while True:
    submissions = r.get_top()
    for submission in submissions:
        if submission.id in already_done:
            continue
        op_text = submission
        print("\n")
        print(op_text)
        print("Sentiment: {}".format(datum_box.sentiment_analysis(op_text)))
        print("Topic: {}".format(datum_box.topic_classification(op_text)))
        # Recorded only after both calls returned, so a submission whose
        # analysis failed is not marked as done.
        already_done.add(submission.id)
    print("Sleeping")
    time.sleep(20)
Ejemplo n.º 14
0
from DatumBox import DatumBox

# NOTE(review): the API key is committed in source; consider loading it from
# configuration instead.
datum_box = DatumBox("4e9f0a6e14a83e38d9d9fac895c84e68")
#print datum_box.twitter_sentiment_analysis("I love my cat")
# Classify a short bag of words and print the topic the API reports.
topic = datum_box.topic_classification(
    "datumbox api wrapper text python code commit sign")
print(topic)
Ejemplo n.º 15
0
            </div>
        </div>
        <div id="about">
            <h2> About Us</h2>
            <p> Something blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop</p>        
        </div>
    </div>

  </body>

</html>
"""


#Begin tests
# Client built with API_KEY (defined outside the lines shown here).
datum_box = DatumBox(API_KEY)
# Client built with a deliberately invalid key.
bad_datum_box = DatumBox("This-API-key-is-not-valid-(hopefully)")
    

# One-off call against the Twitter sentiment endpoint; the return value is
# discarded.
datum_box.twitter_sentiment_analysis("I am the smartest man on earth!")



'''

class TestSentimentAnalysis(unittest.TestCase):
   
    def test_positive_review(self):
        self.assertEqual(datum_box.sentiment_analysis(positive_review), "positive")
        
    def test_negative_review(self):
Ejemplo n.º 16
0
def get_topic_category(api_key, text):
    """Classify *text* with a DatumBox client built from *api_key*.

    A new ``DatumBox`` client is created on every call.

    Returns:
        Whatever ``DatumBox.topic_classification`` returns for *text*.
    """
    return DatumBox(api_key).topic_classification(text)