def getTags(self):
    """Generate descriptive tags for ``self.PASSED_STRING`` via the DatumBox API.

    Returns a list containing, in order: a topic classification, an
    "Educational"/"Not Educational" flag, a readability assessment, and the
    extracted keywords -- with ~100 very common English words and all single
    letters filtered out to reduce clutter.
    """
    tags = []
    datum_box = DatumBox(API_KEY)

    # If PASSED_STRING is the name of a PDF document, fetch the text from the
    # pdf document and set the string to be evaluated to the pdf's contents.
    if self.PASSED_STRING[-4:] == ".pdf":
        self.PASSED_STRING = pydfconversion.getPDFText(self.PASSED_STRING)

    # First tag: one of the DatumBox topic categories.
    tags.append(datum_box.topic_classification(self.PASSED_STRING))

    # Second tag: "Educational" or "Not Educational".  The original called
    # is_educational() twice (if/elif), doubling the API usage; one call and
    # a plain else is equivalent.
    if datum_box.is_educational(self.PASSED_STRING):
        tags.append("Educational")
    else:
        tags.append("Not Educational")

    # Third tag: readability/difficulty of the text.
    tags.append(datum_box.readability_assessment(self.PASSED_STRING))

    # Remaining tags: extracted keywords.  (The original bound the keyword
    # list to a variable named `list`, shadowing the builtin.)
    tags.extend(datum_box.keyword_extract(self.PASSED_STRING))

    # ~100 most commonly used English words, plus every single letter, to be
    # removed from the returned tags.  Set membership replaces the original
    # chain of ~150 `or` comparisons.
    stop_words = frozenset([
        "time", "person", "year", "way", "day", "thing", "man", "world",
        "life", "hand", "part", "child", "eye", "woman", "place", "work",
        "week", "case", "point", "company", "group", "problem", "fact",
        "be", "have", "do", "say", "get", "make", "go", "know", "take",
        "see", "come", "think", "look", "want", "give", "use", "find",
        "tell", "ask", "seem", "feel", "try", "leave", "call", "good",
        "new", "first", "last", "long", "great", "little", "own", "other",
        "old", "right", "big", "high", "different", "small", "large",
        "next", "early", "young", "important", "few", "public", "bad",
        "same", "able", "to", "of", "in", "for", "on", "with", "at", "by",
        "from", "up", "about", "into", "over", "after", "beneath", "under",
        "above", "the", "and", "a", "that", "I", "it", "not", "he", "as",
        "you", "this", "but", "his", "they", "her", "she", "or", "an",
        "will", "my", "one", "all", "would", "there", "their", "is",
    ]) | frozenset("abcdefghijklmnopqrstuvwxyz")

    # Build a new filtered list instead of calling tags.remove() while
    # iterating over tags: removing during iteration skips the element that
    # follows each removal, so consecutive stop words were left in place.
    return [tag for tag in tags if tag not in stop_words]
from DatumBox import DatumBox

# DatumBox credentials.  NOTE(review): the API key is hard-coded and checked
# into source control.
API_KEY = "2a13913dda346761765020c1f66e34f8"

# Quick manual smoke test of the keyword-extraction endpoint.
client = DatumBox(API_KEY)
print(client.keyword_extract("I hate my cat and love my dog"))
import unittest

from DatumBox import DatumBox, DatumBoxError

# DatumBox credentials used by the ad-hoc check below.
API_KEY = "2a13913dda346761765020c1f66e34f8"

# Sample tweets used as sentiment fixtures.
positive_tweet = """Also, @xDaniielle has understood she can't beat me playing GTA. So instead, she's grabbing a copy too. I have an amazing girlfriend :D"""

#The below tweet was not written by me, I would never say such a horrid thing.
negative_tweet = """Gah! I hate programming. Been pissing me off all day. Time to go sit on the sofa in a huff with a beer :("""

#Begin tests: run the negative tweet through Twitter sentiment analysis and
#show the label the API returns.
datum_box = DatumBox(API_KEY)
print(datum_box.twitter_sentiment_analysis(negative_tweet))
import requests
from bs4 import BeautifulSoup
from DatumBox import DatumBox

API_KEY = "2a13913dda346761765020c1f66e34f8"
datum_box = DatumBox(API_KEY)

# Fetch the article page and parse it.
url="http://social.yourstory.com/2013/09/how-nit-warangal-lakshya-foundation-bridged-gap-alumni-and-students/"
response = requests.get(url)
soup = BeautifulSoup(response.content)

# Concatenate the text of every matching content <div>.
s = ""
for block in soup.find_all("div", {"class": "ys_post_content text"}):
    s += block.text
print(s)

# Extract keywords; echo each keyword once, and echo it again if it matches
# the watch list.
x = datum_box.keyword_extract(s.encode('ascii', 'ignore'))
keys = ["NIT", "RECW", "WARANGAL"]
for keyword in x:
    ascii_kw = keyword.encode('ascii', 'ignore')
    print(ascii_kw)
    if ascii_kw.upper() in keys:
        print(ascii_kw)
from DatumBox import DatumBox

# DatumBox API client (hard-coded key).
datum_box = DatumBox("2a13913dda346761765020c1f66e34f8")

#import networkx as nx
#import matplotlib.pyplot as plt
import urllib2
import re
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# Regex matching any HTML/XML tag, for stripping markup from fetched pages.
TAG_RE = re.compile(r"<[^>]+>")

# Ensure the NLTK 'punkt' tokenizer data is available before use.
nltk.download('punkt')

stemmer = nltk.stem.porter.PorterStemmer()

# Translation table mapping every punctuation code point to None (deletion),
# suitable for str.translate().
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

# NOTE(review): fetch_page() is truncated in this chunk -- the `hdr` dict
# literal below is cut off mid-definition, so the function is incomplete as
# seen here.
def fetch_page(siteURL):
    # create a variable which will hold our desired web page as a string
    site = siteURL
    # create the approprriate headers for our http request so that we wont run
    # into any 403 forbidden errors. All of this will be available at the tutorial
    # page that I will link to in the description below
    hdr = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
import requests
from bs4 import BeautifulSoup
from DatumBox import DatumBox

# DatumBox client for the keyword extraction below.
API_KEY = "2a13913dda346761765020c1f66e34f8"
datum_box = DatumBox(API_KEY)

# Download the article and collect every content <div>.
url = "http://social.yourstory.com/2013/09/how-nit-warangal-lakshya-foundation-bridged-gap-alumni-and-students/"
page = requests.get(url)
soup = BeautifulSoup(page.content)
divs = soup.find_all("div", {"class": "ys_post_content text"})

# Join the text of all matching divs into one string and show it.
s = "".join(div.text for div in divs)
print(s)

# Run keyword extraction; print every keyword, and print it a second time
# when it matches the watch list.
keys = ["NIT", "RECW", "WARANGAL"]
x = datum_box.keyword_extract(s.encode('ascii', 'ignore'))
for kw in x:
    encoded = kw.encode('ascii', 'ignore')
    print(encoded)
    if encoded.upper() in keys:
        print(encoded)
# csvW=csv.writer(csvFile) # for item in outputFile: # csvW.writerow(item) # # csvFile.close() #using DatumBox API, passing in each article title and article content from the zipped list for sentiment analysis #positive, negative or neutral will be returned #sentiment for title and content is appended to alist from DatumBox import DatumBox datum_box = DatumBox('d3ce53ca1cead4e08490df097c890967') eTitle=[] eBody=[] for item in outputFile: title=item[1] content=item[3] eTitle.append(datum_box.sentiment_analysis(title)) eBody.append(datum_box.sentiment_analysis(content)) #zip together sentiment analyzed titles and body with the date emoted=zip(dates,urls, eTitle, eBody) e=open('sentimentOut.csv', 'wb') eWriter=csv.writer(e)
import sys
import re


def stripTags(text):
    """Return *text* with <script>/<style> blocks and all HTML tags removed.

    Each removed region is replaced by a single space.  The script/style
    patterns use re.DOTALL so blocks spanning multiple lines are stripped
    (the original patterns silently left any multi-line <script>/<style>
    block in place, because '.' does not match newlines by default) and
    re.IGNORECASE so <SCRIPT>/<Style> variants also match.
    """
    scripts = re.compile(r'<script.*?/script>', re.DOTALL | re.IGNORECASE)
    css = re.compile(r'<style.*?/style>', re.DOTALL | re.IGNORECASE)
    tags = re.compile(r'<.*?>', re.DOTALL)
    text = scripts.sub(' ', text)
    text = css.sub(' ', text)
    text = tags.sub(' ', text)
    return text


if __name__ == "__main__":
    # Imported lazily so stripTags() can be imported/reused without these
    # third-party dependencies installed.
    import requests
    from DatumBox import DatumBox

    # Fetch the URL given on the command line, strip its markup, and run
    # the plain text through Twitter sentiment analysis.
    request = requests.get(sys.argv[1])
    content = request.text
    #content = '<script>sddd</script><style sdfsafs>sfsdfsfs</style>my text<be>betkk</be>'
    content = content.encode('utf-8')
    print(content)
    raw_content = stripTags(content)
    print(raw_content)
    db = DatumBox('9eb37f7399b4d074c5b83358f24ba626')
    res = db.twitter_sentiment_analysis(raw_content)
    print(res)
import sys  # FIX: the original omitted this import but used sys.argv below (NameError)
import re


def stripTags(text):
    """Return *text* with <script>/<style> blocks and all HTML tags removed.

    Each removed region is replaced by a single space.  re.DOTALL makes the
    script/style patterns match across newlines (the original missed any
    multi-line <script>/<style> block) and re.IGNORECASE matches
    <SCRIPT>/<Style> variants.
    """
    scripts = re.compile(r'<script.*?/script>', re.DOTALL | re.IGNORECASE)
    css = re.compile(r'<style.*?/style>', re.DOTALL | re.IGNORECASE)
    tags = re.compile(r'<.*?>', re.DOTALL)
    text = scripts.sub(' ', text)
    text = css.sub(' ', text)
    text = tags.sub(' ', text)
    return text


if __name__ == "__main__":
    # Imported lazily so stripTags() is importable without these
    # third-party dependencies installed.
    import requests
    from DatumBox import DatumBox

    # Fetch the command-line URL, strip markup, and run the remaining text
    # through Twitter sentiment analysis.
    request = requests.get(sys.argv[1])
    content = request.text
    #content = '<script>sddd</script><style sdfsafs>sfsdfsfs</style>my text<be>betkk</be>'
    content = content.encode('utf-8')
    print(content)
    raw_content = stripTags(content)
    print(raw_content)
    db = DatumBox('9eb37f7399b4d074c5b83358f24ba626')
    res = db.twitter_sentiment_analysis(raw_content)
    print(res)
</div> </div> <div id="about"> <h2> About Us</h2> <p> Something blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop</p> </div> </div> </body> </html> """
# NOTE(review): the HTML above is the tail of a triple-quoted fixture string
# whose opening quotes -- along with API_KEY, positive_review and
# negative_review referenced below -- are defined earlier in the original
# file, outside this chunk.

#Begin tests
datum_box = DatumBox(API_KEY)
# Client built with a deliberately invalid key, used to assert that the
# wrapper raises DatumBoxError instead of returning a result.
bad_datum_box = DatumBox("This-API-key-is-not-valid-(hopefully)")


class TestSentimentAnalysis(unittest.TestCase):
    # Live-API tests: each case calls the real DatumBox endpoint.
    def test_positive_review(self):
        self.assertEqual(datum_box.sentiment_analysis(positive_review), "positive")

    def test_negative_review(self):
        self.assertEqual(datum_box.sentiment_analysis(negative_review), "negative")

    def test_bad_api_key(self):
        # An invalid key should surface as DatumBoxError.
        self.assertRaises(DatumBoxError, bad_datum_box.sentiment_analysis, positive_review)


# NOTE(review): this class is truncated -- its body continues past the end of
# the visible source.
class TestTwitterSentimentAnalysis(unittest.TestCase):
import urllib
from bs4 import BeautifulSoup
import csv
import urllib2
import re
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet
from itertools import chain
from nltk.corpus import stopwords
from DatumBox import DatumBox
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# DatumBox API client used for the analyses below (hard-coded key).
datum_box = DatumBox("2a13913dda346761765020c1f66e34f8")

# Matches any HTML/XML tag so markup can be stripped from text.
TAG_RE = re.compile(r"<[^>]+>")

# Make sure the NLTK 'punkt' tokenizer models are present.
nltk.download('punkt')

stemmer = nltk.stem.porter.PorterStemmer()

# Maps every punctuation code point to None, for use with str.translate().
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

# English stopword list, fetched once and reused.
cachedStopWords = stopwords.words("english")


def remove_tags(text):
    """Return *text* with every HTML/XML tag deleted."""
    return TAG_RE.sub('', text)


def stem_tokens(tokens):
    """Porter-stem each token, returning a new list."""
    return list(map(stemmer.stem, tokens))


'''remove punctuation, lowercase, stem'''
from DatumBox import DatumBox

# DatumBox credentials.  NOTE(review): the API key is hard-coded in source.
API_KEY = "454ec357b72e7d0c06cac8df90bb8862"
datum_box = DatumBox(API_KEY)

# Manual smoke test: run a long passage about the HAL/S programming language
# through the keyword-extraction endpoint and print the result (Python 2
# print statement).
print datum_box.keyword_extract("HAL/S was designed not to include some constructs that are thought to be the cause of errors. For instance, there is no support for dynamic memory allocation. The language provides special support for real-time execution environments. Some features, such as \"GOTO\" were provided chiefly to ease mechanical translations from other languages.[3] On the Preface page of the HAL/S Language Specification,[4] it says, fundamental contributions to the concept and implementation of MAC were made by Dr. J. Halcombe Laning of the Draper Laboratory. \"HAL\" was suggested as the name of the new language by Ed Copps, a founding director of Intermetrics, to honor Hal Laning, a colleague at MIT. A proposal for a NASA standard ground-based version of HAL named HAL/G for \"ground\" was proposed, but the coming emergence of the soon to be named Ada programming language contributed to Intermetrics' lack of interest in continuing this work. Instead, Intermetrics would place emphasis on what would be the \"Red\" finalist which would not be selected.HAL/S is a mostly free-form language: statements may begin anywhere on a line and may spill over the next lines, and multiple statements may be fitted onto the same line if required. However, non-space characters in the first column of a program line may have special significance. For instance, the letter \'C\' in the first column indicates that the whole line is a comment and should be ignored by the compiler. One particularly interesting feature of HAL/S is that it supports, in addition to a normal single line text format, an optional three-line input format in which three source code lines are used for each statement. In this format, the first and third lines are usable for superscripts (exponents) and subscripts (indices). 
The multi-line format was designed to permit writing of HAL/S code that is similar to mathematical notation.")
#!/usr/bin/python import praw from DatumBox import DatumBox import time user_agent = "A continuous sentiment analysis of the front-page" r = praw.Reddit(user_agent=user_agent) r.login('LowLanding', 'Lespaul94', disable_warning=True) datum_box = DatumBox('c3ac3635add7e6d13ce0218ff033674f') # Array for submissions already analyzed already_done = [] while True: submissions = r.get_top() for submission in submissions: op_text = submission print "\n" print op_text print "Sentiment: {}".format(datum_box.sentiment_analysis(op_text)) print "Topic: {}".format(datum_box.topic_classification(op_text)) print "Sleeping" time.sleep(20)
from DatumBox import DatumBox

# Manual smoke test of the topic-classification endpoint.
client = DatumBox("4e9f0a6e14a83e38d9d9fac895c84e68")

#print client.twitter_sentiment_analysis("I love my cat")
print(client.topic_classification(
    "datumbox api wrapper text python code commit sign"))
</div> </div> <div id="about"> <h2> About Us</h2> <p> Something blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop blah blah doo doo doo oop booop boopity boopoop</p> </div> </div> </body> </html> """
# NOTE(review): the HTML above is the tail of a triple-quoted fixture string
# whose opening quotes -- along with API_KEY and the review fixtures -- are
# defined earlier in the original file, outside this chunk.

#Begin tests
datum_box = DatumBox(API_KEY)
bad_datum_box = DatumBox("This-API-key-is-not-valid-(hopefully)")
# Ad-hoc smoke call against the live API; its return value is discarded.
datum_box.twitter_sentiment_analysis("I am the smartest man on earth!")

# NOTE(review): the ''' below opens a string literal that effectively
# comments out the test classes; it is not closed within this chunk, so the
# remainder of the visible source is string content, reproduced verbatim.
''' class TestSentimentAnalysis(unittest.TestCase): def test_positive_review(self): self.assertEqual(datum_box.sentiment_analysis(positive_review), "positive") def test_negative_review(self):
def get_topic_category(api_key, text):
    """Classify *text* into a DatumBox topic category.

    Parameters
    ----------
    api_key : str
        DatumBox API key used to authenticate the request.
    text : str
        The document to classify.

    Returns
    -------
    The topic label returned by DatumBox's topic_classification endpoint.
    """
    client = DatumBox(api_key)
    return client.topic_classification(text)