def add_sentiments(data, field='headline', boukinator_path=None, danew_path=None):
    """Append sentiment-score columns for the text in ``data[field]``.

    ``data`` is modified in place and nothing is returned. Columns are written
    in this order: ``boukes`` (only when ``boukinator_path`` is truthy),
    ``recessie``, ``polyglot``, ``pattern`` and ``DANEW`` (only when
    ``danew_path`` is truthy).
    """
    text = data[field]

    def polygloter(t):
        # A ZeroDivisionError raised while reading the polarity is logged and
        # scored as 0 instead of aborting the whole column.
        try:
            polarity = Text(t, hint_language_code='NL').polarity
        except ZeroDivisionError:
            logging.warning("Polgyglot failed: Divide by zero")
            polarity = 0
        return polarity

    def pattern_polarity(t):
        return sentiment(t)[0]

    if boukinator_path:
        logging.info('adding Boukes et al')
        boukes_classifier = Boukinator(boukinator_path)

        def boukes_score(t):
            return boukes_classifier.classify(t)['score']

        data['boukes'] = text.map(boukes_score)

    logging.info('adding "recessie" classifier')
    # Messages containing the word 'recessie' (EN: recession) count as
    # negative (-1).
    mentions_recession = text.str.contains('recessie')
    data['recessie'] = mentions_recession.map(float) * -1

    logging.info('adding Polyglot')
    data['polyglot'] = text.map(polygloter)

    logging.info('adding Pattern')
    data['pattern'] = text.map(pattern_polarity)

    if danew_path:
        logging.info('adding DANEW')
        danew_classifier = DANEW(danew_path)

        def danew_score(t):
            return danew_classifier.classify(t)['score']

        data['DANEW'] = text.map(danew_score)
def test_sentiment(self):
    """Check the polarity sign and the accuracy baseline of ``nl.sentiment``."""
    # Assert < 0 for negative adjectives and > 0 for positive adjectives.
    # assertGreater/assertLess report both operands when they fail.
    self.assertGreater(nl.sentiment("geweldig")[0], 0)
    self.assertLess(nl.sentiment("verschrikkelijk")[0], 0)
    # Assert the accuracy of the sentiment analysis.
    # Given are the scores for 3,000 book reviews.
    # The baseline should increase (not decrease) when the algorithm is modified.
    from pattern.db import Datasheet
    from pattern.metrics import test
    corpus = os.path.join(PATH, "corpora", "polarity-nl-bol.com.csv")
    reviews = [(review, int(score) > 0)
               for score, review in Datasheet.load(corpus)]
    A, P, R, F = test(lambda review: nl.positive(review), reviews)
    self.assertGreater(A, 0.80)
    self.assertGreater(P, 0.77)
    self.assertGreater(R, 0.85)
    self.assertGreater(F, 0.81)
    # Parenthesised single-argument form prints the same text on Python 2
    # and is valid syntax on Python 3.
    print("pattern.nl.sentiment()")
def test_sentiment(self):
    """Check the polarity sign and the accuracy baseline of ``nl.sentiment``."""
    # Assert < 0 for negative adjectives and > 0 for positive adjectives.
    # assertGreater/assertLess report both operands when they fail.
    self.assertGreater(nl.sentiment("geweldig")[0], 0)
    self.assertLess(nl.sentiment("verschrikkelijk")[0], 0)
    # Assert the accuracy of the sentiment analysis.
    # Given are the scores for 3,000 book reviews.
    # The baseline should increase (not decrease) when the algorithm is modified.
    from pattern.db import Datasheet
    from pattern.metrics import test
    corpus = os.path.join(PATH, "corpora", "polarity-nl-bol.com.csv")
    reviews = [(review, int(score) > 0)
               for score, review in Datasheet.load(corpus)]
    A, P, R, F = test(lambda review: nl.positive(review), reviews)
    self.assertGreater(A, 0.80)
    self.assertGreater(P, 0.77)
    self.assertGreater(R, 0.85)
    self.assertGreater(F, 0.81)
    # Parenthesised single-argument form prints the same text on Python 2
    # and is valid syntax on Python 3.
    print("pattern.nl.sentiment()")
def FindOpinion(message, spellnl, nlnlp):
    """Label ``message`` as 'Positive', 'Negative' or 'Neutral'.

    The message is tokenized with ``wordtokenizer``, every token is passed
    through ``spellnl.correction``, and the first value returned by
    ``sentiment`` for the re-joined text is compared against +/-0.3.
    """
    corrected = [spellnl.correction(token)
                 for token in wordtokenizer(message, nlnlp)]
    score = sentiment(' '.join(corrected))[0]
    if score > 0.3:
        return 'Positive'
    if score < -0.3:
        return 'Negative'
    return 'Neutral'
def repr_stats(st):
    """Print a human-readable report of the text statistics held by ``st``.

    ``st`` must provide every attribute read below (``p_count``,
    ``tokenized_text_nostop``, ``sentences``, ...). Nothing is returned; the
    report is written to stdout.

    Every ``print`` takes one parenthesised argument, so the output is the
    same on Python 2 while the code is also valid Python 3.
    """
    # print(u"Input object: %s" % st.text)
    print(u"Nr of paragraphs: %i" % st.p_count)
    print(u"Nr of whitespaces: %i" % st.nr_of_whitespaces)
    print(u"Nr of alphanumeric characters: %i" % st.nr_of_alpanumeric)
    print(u"Tokenized words: %s" %
          (u",".join(st.tokenized_text_extended[0:20]) + u'....'))
    print(u"Tokenized (clean)words: %s" %
          (u",".join(st.tokenized_text_short[0:20]) + u'....'))
    print(u"Tokenized (nostop)words: %s" %
          (u",".join(st.tokenized_text_nostop[0:20]) + u'....'))
    print(u"Nr of total tokens: %i" % len(st.tokenized_text_extended))
    print(u"Nr of total (clean)tokens: %i" % len(st.tokenized_text_short))
    print(u"Nr of total (nostop)tokens: %i" % len(st.tokenized_text_nostop))

    from collections import Counter
    c = Counter(st.tokenized_text_nostop)
    print(u"Top ten nostop tokens:")
    # %s formatting keeps non-ASCII tokens intact; str(token) would raise
    # UnicodeEncodeError for them on Python 2.
    print(u"".join(u"\t%s freq: %i\n" % (token, freq)
                   for token, freq in c.most_common(10)))

    print(u"Usefull to 'useless' tokens ratio: %2.2f%%" % st.usefull_tokens_ratio)
    print(u"Avg word length (based on nostop): %2.2f" % st.avg_wordlen)
    print(u"Whitespace to text ratio: %2.2f%%" % st.whitespace_to_text_ratio)
    print(u"Upcase-starting words ratio (based on nostop): %2.2f%%" % st.upcase_ratio)
    print(u"Noise to text ratio: %2.2f%%" % st.noise_to_text_ratio)
    print(u"Nr of sentences: %i" % st.nr_of_sentences)
    print(u"Ratio nr of (nostop)tokens in dutch dict: %2.2f%%" % st.dict_check)

    # Average polarity/subjectivity over the sentences whose polarity is
    # non-zero; sentences scored exactly 0 are left out of both averages.
    polarity_total = 0
    subjectivity_total = 0
    total_sentences = 0
    for s in st.sentences:
        polarity, subjectivity = sentiment(s)
        if polarity != 0:
            polarity_total += polarity
            subjectivity_total += subjectivity
            total_sentences += 1
    if total_sentences > 0:
        polarity_total = polarity_total / total_sentences
        subjectivity_total = subjectivity_total / total_sentences
    print(u"Avg sentiment (polarity/subjectivity): %2.2f, %2.2f" %
          (polarity_total, subjectivity_total))

    print(u"Clusters (nostop)tokens:")
    from pprint import pprint
    pprint(cluster.get_clusters(st.sentences))

    print(u"Ngrams:")
    from nltk.util import ngrams
    # Only the first ten trigrams are shown.
    pprint(list(ngrams(st.tokenized_text_nostop, 3))[:10])
    print(u"==========================================\n\n")
def add_sentiments(data, field):
    """Add the missing ``<field>_<classifier>`` sentiment columns to ``data``.

    A column that already exists is left untouched, so the function can be
    re-run on a partially scored frame. Columns are considered in this order:
    polyglot, pattern, DANEW, boukes, sentistrength (only when its client can
    be imported) and recessie.

    Args:
        data: frame holding the text column ``field``; modified in place.
        field: name of the text column to score; also the column-name prefix.

    Returns:
        The same ``data`` object.
    """
    from polyglot.text import Text
    from pattern.nl import sentiment
    from scripts.DANEW import DANEW
    from scripts.boukinator import Boukinator
    try:
        from resources.sentistrength.senti_client import multisent
    except ImportError:
        logger.warning("Cannot import sentistrenght, skipping!")
        multisent = None

    def polygloter(t):
        try:
            return Text(t, hint_language_code='NL').polarity
        except Exception:
            # Best effort: a text Polyglot cannot score is logged and scored
            # 0. ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            logger.exception("Polgyglot failed")
            return 0

    if '%s_polyglot' % field not in data.columns:
        logger.info('adding Polyglot')
        data['%s_polyglot' % field] = data[field].map(polygloter)

    if '%s_pattern' % field not in data.columns:
        logger.info('adding Pattern')
        data['%s_pattern' % field] = data[field].map(lambda t: sentiment(t)[0])

    if '%s_DANEW' % field not in data.columns:
        logger.info('adding DANEW')
        danew = DANEW()
        data['%s_DANEW' % field] = data[field].map(
            lambda t: danew.classify(t)['score'])

    if '%s_boukes' % field not in data.columns:
        logger.info('adding Boukes et al')
        boukinator = Boukinator()
        data['%s_boukes' % field] = data[field].map(
            lambda t: boukinator.classify(t)['score'])

    if multisent and '%s_sentistrength' % field not in data.columns:
        logger.info('adding Sentistrength')
        sentistrength = multisent(language='NL')
        # The score is the sum of the 'positive' and 'negative' values.
        data['%s_sentistrength' % field] = [
            int(s['positive']) + int(s['negative'])
            for s in sentistrength.run_batch(data[field])]

    if '%s_recessie' % field not in data.columns:
        logger.info('adding "recessie" classifier')
        # classify messages containing the word 'recessie' (EN: recession)
        # as negative (-1)
        data['%s_recessie' % field] = (
            data[field].str.contains('recessie').map(float) * -1)

    return data
def process(self, document_field, *args, **kwargs):
    '''Return the Pattern polarity and subjectivity of ``document_field``.

    Args:
        document_field: the text to analyse.
        **kwargs: must contain ``language``, one of 'nl', 'en', 'fr', 'it'.

    Returns:
        dict with the keys 'polarity' and 'subjectivity'.

    Raises:
        Exception: when ``language`` is missing or unsupported, or when the
            matching ``pattern.<language>`` module cannot be imported.
    '''
    import importlib

    supported = ('nl', 'en', 'fr', 'it')
    specify_language = (
        "Specify a language, for example language='nl'. We support nl, en, fr, and it"
    )
    try:
        language = kwargs['language']
    except KeyError:
        raise Exception(specify_language)
    if language not in supported:
        raise Exception(specify_language)

    # One dynamic import replaces four identical per-language branches; only
    # import problems are translated, other errors propagate unchanged.
    try:
        sentiment = importlib.import_module('pattern.' + language).sentiment
    except (ImportError, AttributeError):
        raise Exception(
            "Unavailable because you don't have the pattern library installed"
        )

    sent = sentiment(document_field)
    return {'polarity': sent[0], 'subjectivity': sent[1]}
def process(self, document_field, *args, **kwargs):
    """Return the Pattern polarity and subjectivity of ``document_field``.

    Args:
        document_field: the text to analyse.
        **kwargs: must contain ``language``, one of "nl", "en", "fr", "it".

    Returns:
        dict with the keys "polarity" and "subjectivity".

    Raises:
        Exception: when ``language`` is missing or unsupported, or when the
            matching ``pattern.<language>`` module cannot be imported.
    """
    import importlib

    supported = ("nl", "en", "fr", "it")
    specify_language = (
        "Specify a language, for example language='nl'. We support nl, en, fr, and it"
    )
    try:
        language = kwargs["language"]
    except KeyError:
        raise Exception(specify_language)
    if language not in supported:
        raise Exception(specify_language)

    # One dynamic import replaces four identical per-language branches; only
    # import problems are translated, other errors propagate unchanged.
    try:
        sentiment = importlib.import_module("pattern." + language).sentiment
    except (ImportError, AttributeError):
        raise Exception(
            "Unavailable because you don't have the pattern library installed"
        )

    sent = sentiment(document_field)
    return {"polarity": sent[0], "subjectivity": sent[1]}
cursor.execute(query) weeks = [] # Deze gaan we nu verwerken for (w, aanmeldingen, opmerking_array, titel_array) in cursor: pos, neg, neutraal = 0, 0, 0 if opmerking_array is not None: opmerkingen = opmerking_array.split(",") titels = titel_array.split(",") titels.extend(["", "", "", ""]) # there are some empty cells in the database for i in range(len(opmerkingen)): # we plakken de TITEL en OPMERKING aan elkaar vast en bepalen daar van het sentiment sentiment_analysis = sentiment(titels[i] + " " + opmerkingen[i])[0] if sentiment_analysis > 0.1: # sentiment is Positief pos += 1 elif sentiment_analysis < -0.1: # sentiment is Negatief neg += 1 else: neutraal += 1 # Het is ZEER moeilijk om het sentiment binnen de nederlandse taal te bepalen daarom heb ik neutraal ook toegevoegd als mogelijkheid else: opmerkingen = [] weeks.append([w, aanmeldingen, len(opmerkingen), pos, neg, neutraal]) # Tabel print("WEEK, \t AANME \t NIEUWS\t POS \t NEG \t NEUTRAAL") for week in weeks: print "{0} \t {1} \t {2} \t {3} \t {4} \t {5}".format(
def get_sentiment(text):
    """Score ``text`` with both Pattern and Polyglot.

    Returns:
        dict with the keys "pattern" (first value returned by ``sentiment``)
        and "polyglot" (the ``polarity`` of a Polyglot ``Text`` built with a
        Dutch language hint).
    """
    # The result must not be bound to the name ``sentiment``: assigning to it
    # makes it a local for the whole function, so the call ``sentiment(text)``
    # would raise UnboundLocalError before the dict is ever built.
    scores = {
        "pattern": sentiment(text)[0],
        "polyglot": Text(text, hint_language_code='nl').polarity,
    }
    return scores
# Import the sentiment analysis function from the pattern module
from pattern.nl import sentiment

# Sample texts for experimenting with the classifier.
neutralText = "Het is een vrij rustige dag."
positiveText = "Het product is mij zeer goed bevallen."
# Alternative negative samples:
# negativeText = "Het product voldeed niet aan mijn verwachtingen."
# negativeText = "Je reageert niet op een manier die mij gelukkig stemt"
negativeText = "Diepbedroeft over de zeer gebrekkige service van deze onervaren werknemer, het ontslag van deze persoon zou mij zeer vrolijk maken"
# negativeText = "Ik ben niet te spreken over de service. Ik verwacht dan ook als compensatie mijn geld terug. Dit is stom"
contradictingText = "Dit is een voorbeeld van een overwegend positieve text over een negatief verhaal"

# Classify the text. The function returns 2 values.
# sentiment(text) returns (polarity, subjectivity).
sentimentAnalyse = sentiment(negativeText)
# Parenthesised single-argument print: same output on Python 2, valid on
# Python 3.
print(sentimentAnalyse)

# A text is quickly classified as negative. Because of that, we need to
# use a threshold to classify the text as positive, negative, or neutral.
# Without thresholds this would be:
# predictedSentiment = 'positive#' if sentimentAnalyse[0] > 0 else 'negative'
predictedSentiment = 'neutral'
if sentimentAnalyse[0] > 0.4:
    predictedSentiment = 'positive'
elif sentimentAnalyse[0] < -0.2:
    predictedSentiment = 'negative'

predictionSummary = "The provided text is classified as " + str(
    predictedSentiment) + ' which a score of ' + str(sentimentAnalyse[0])
def sentimentScore(text):
    """Return the first value that ``sentiment`` yields for ``text``."""
    scores = sentiment(text)
    return scores[0]
<<<<<<< HEAD party_sentiment = defaultdict(lambda : defaultdict(lambda : defaultdict(list))) ======= party_sentiment = defaultdict(lambda : defaultdict(list)) >>>>>>> e3934acbce5ba4326fddabba2ab9df8e3b56bcad infile = codecs.open(args.i,"r","utf-8") for line in infile.readlines(): tokens = line.strip().split("\t") <<<<<<< HEAD try: day = tokens[0] party = tokens[1] text = tokens[2] senti = sentiment(text) party_sentiment[party][day]["reg"].append(senti) if not re.search(' rt ',text,re.IGNORECASE) and not re.search('^rt',text,re.IGNORECASE): # print text party_sentiment[party][day]["rt"].append(senti) except IndexError: continue for party in party_sentiment.keys(): for day in sorted(party_sentiment[party].keys()): sentiments = party_sentiment[party][day]["reg"] sentiments_filt = party_sentiment[party][day]["rt"] # print len(sentiments),len(sentiments_filt) # print sentiments print party, day, round(sum([ float(i[0]) for i in sentiments ])/float(len(sentiments)),2), round(np.std(sentiments),2), round(sum([ float(i[0]) for i in sentiments_filt ])/float(len(sentiments_filt)),2), round(np.std(sentiments_filt),2)
from pattern.nl import sentiment

# Score one Dutch example sentence and show the raw result.
_example_score = sentiment('Een onwijs spannend goed boek!')
print(_example_score)
def sentimentScore(text, debug=False):
    """Return the first value that ``sentiment`` yields for ``text``.

    ``debug`` is accepted but not used by this function.
    """
    scores = sentiment(text)
    return scores[0]
def get_polarity(self):
    """Return the polarity ``sentiment`` assigns to this object's lemmas.

    The ``lemma`` of every element in ``self.tokens.elements()`` is joined
    with single spaces and scored as one text; the subjectivity half of the
    result is discarded.
    """
    joined_lemmas = " ".join([token.lemma for token in self.tokens.elements()])
    polarity, _subjectivity = sentiment(joined_lemmas)
    return polarity
def process_details(prod, params, force_refresh=False, cache_time=CACHE_TIME):
    """Aggregate the cached tweets for ``prod`` into one detail-view payload.

    Args:
        prod: keyword handed to ``tweety.get_keyword`` and matched against
            the ``keywords`` field of ``newsdb``.
        params: extra keyword arguments for the tweet query.
            ``params["start"]`` and ``params["end"]`` are also parsed with
            ``time_format`` to bound the news query.
        force_refresh: forwarded to ``cache``.
        cache_time: forwarded to ``cache``.

    Returns:
        dict with the keys "tweets", "retweets", "interaction_tweets",
        "num_tweets", "timeSeries", "peaks", "URLs", "photos", "tagCloud",
        "locations", "centerloc", "graph", "news" and "polarity".

    Side effects:
        Stores an hour-rounded "datetime" on every tweet dict and queues two
        ``mark_as_spam`` tasks (obscene tweets and NSFW images).
    """
    # Fix: honour the caller's cache_time instead of always passing the
    # module constant.
    tweets = cache(tweety.get_keyword,
                   prod,
                   force_refresh=force_refresh,
                   cache_time=cache_time,
                   **params)

    tweetList = []
    unique_tweets = {}
    interaction_tweets = []
    retweets = {}
    imagesList = []
    URLList = []
    word_cloud_dict = Counter()
    tsDict = Counter()
    mapLocations = []
    spam_list = []
    image_tweet_id = {}
    nodes = {}
    edges = []

    for tw in tweets:
        tweet = tw["tweet"]
        lemmas = [t["lemma"] for t in tw["tokens"]]
        texts = [t["text"].lower() for t in tw["tokens"]]  # unlemmatized words
        words = list(set(lemmas + texts))  # to check for obscene words

        dt = datetime.strptime(tweet["created_at"],
                               "%a %b %d %H:%M:%S +0000 %Y")
        tsDict.update([(dt.year, dt.month, dt.day, dt.hour)])
        # round to hour for peak detection; set before the spam check because
        # the peak scan below reads it on every tweet
        tweet["datetime"] = datetime(dt.year, dt.month, dt.day, dt.hour)

        # check for spam
        if any(obscene_words.get(t) for t in words):
            spam_list.append(tweet["id_str"])
            continue

        tweetList.append(tweet["id_str"])
        word_cloud_dict.update(lemmas)
        text = " ".join(texts)
        if text not in unique_tweets:
            unique_tweets[text] = tweet["id_str"]

        # track retweets and their (highest seen) retweet counts
        if "retweeted_status" in tweet:
            rt = tweet["retweeted_status"]
            id_str = rt["id_str"]
            retweet_count = rt["retweet_count"]
            if id_str not in retweets or retweet_count > retweets[id_str]:
                retweets[id_str] = retweet_count

        # interaction graph: retweet, mention and reply edges
        user_id_str = tweet["user"]["id_str"]
        if "retweeted_status" in tweet:
            rt_user_id_str = tweet["retweeted_status"]["user"]["id_str"]
            if rt_user_id_str not in nodes:
                nodes[rt_user_id_str] = tweet["retweeted_status"]["user"][
                    "screen_name"]
            if user_id_str not in nodes:
                nodes[user_id_str] = tweet["user"]["screen_name"]
            edges.append({
                "source": rt_user_id_str,
                "target": user_id_str,
                "value": "retweet"
            })

        if "user_mentions" in tweet["entities"]:
            if tweet["entities"]["user_mentions"]:
                interaction_tweets.append(tweet["id_str"])
            for obj in tweet["entities"]["user_mentions"]:
                if obj["id_str"] not in nodes:
                    nodes[obj["id_str"]] = obj["screen_name"]
                if user_id_str not in nodes:
                    nodes[user_id_str] = tweet["user"]["screen_name"]
                edges.append({
                    "source": user_id_str,
                    "target": obj["id_str"],
                    "value": "mention"
                })

        if tweet["in_reply_to_user_id_str"]:
            interaction_tweets.append(tweet["id_str"])
            if tweet["in_reply_to_user_id_str"] not in nodes:
                nodes[tweet["in_reply_to_user_id_str"]] = tweet[
                    "in_reply_to_screen_name"]
            if user_id_str not in nodes:
                nodes[user_id_str] = tweet["user"]["screen_name"]
            edges.append({
                "source": user_id_str,
                "target": tweet["in_reply_to_user_id_str"],
                "value": "reply"
            })

        # media, urls and coordinates are optional parts of a tweet
        try:
            for obj in tweet["entities"]["media"]:
                image_url = obj["media_url_https"]
                image_tweet_id[image_url] = tweet["id_str"]
                imagesList.append(image_url)
        except KeyError:
            pass

        try:
            for obj in tweet["entities"]["urls"]:
                url = obj["expanded_url"]
                if url is not None:
                    URLList.append(url)
        except KeyError:
            pass

        try:
            if tweet["coordinates"] is not None:
                if tweet["coordinates"]["type"] == "Point":
                    coords = tweet["coordinates"]["coordinates"]
                    mapLocations.append({"lng": coords[0], "lat": coords[1]})
        except KeyError:
            pass

    mark_as_spam.apply_async((spam_list, ), queue="web")

    def is_stop_word(token):
        t = token.lower()
        return (len(t) <= 1) or (t.startswith("https://")
                                 or t.startswith("http://")) or (t in stop_words)

    word_cloud = [{"text": token, "count": count}
                  for (token, count) in word_cloud_dict.most_common()
                  if not is_stop_word(token)]

    # sentiment analysis on wordcloud
    polarity, subjectivity = sentiment(" ".join(word_cloud_dict.elements()))

    # hourly time series from the first to the last hour that has a tweet;
    # hours without tweets get count 0 (a Counter returns 0 for missing keys)
    ts = []
    if tsDict:  # stays empty when there are 0 tweets
        temp = datetime(*min(tsDict))
        last_hour = datetime(*max(tsDict))
        while temp <= last_hour:
            ts.append({
                "year": temp.year,
                "month": temp.month,
                "day": temp.day,
                "hour": temp.hour,
                "count": tsDict[(temp.year, temp.month, temp.day, temp.hour)]
            })
            temp += timedelta(hours=1)

    # peak detection on time series: a list with the indexes of the peaks in
    # ts. An empty series has no peaks, so the detector is not called for it.
    if ts:
        y = np.array([t["count"] for t in ts])
        peaks = peakutils.indexes(y, thres=0.6, min_dist=1).tolist()
    else:
        peaks = []

    # peak explanation: the most used words in tweets in the peak
    # the peak indices are sorted in ascending order
    if peaks:
        peak_index = 0
        new_peak = True
        # One Counter per peak up front, so a peak the scan below never
        # reaches yields an empty explanation instead of a KeyError.
        peak_data = {i: Counter() for i in range(len(peaks))}
        for tw in tweets:
            tweet = tw["tweet"]
            if new_peak:
                p = ts[peaks[peak_index]]
                dt = datetime(p["year"], p["month"], p["day"], p["hour"])
                new_peak = False
            if tweet["datetime"] < dt:
                continue
            elif tweet["datetime"] == dt:
                lemmas = [token["lemma"] for token in tw["tokens"]]
                peak_data[peak_index].update(lemmas)
            else:
                new_peak = True
                peak_index += 1
                if peak_index == len(peaks):
                    break
        # up to seven non-stop-word lemmas per peak, most common first
        peaks = [(p, ", ".join(
            islice((lemma for lemma, _ in peak_data[i].most_common()
                    if not is_stop_word(lemma)), 7)))
                 for (i, p) in enumerate(peaks)]

    if mapLocations:
        lng = 0
        lat = 0
        for loc in mapLocations:
            lng += loc["lng"]
            lat += loc["lat"]
        avLoc = {
            "lng": lng / len(mapLocations),
            "lat": lat / len(mapLocations)
        }
    else:
        # fixed fallback centre when no tweet carries coordinates
        avLoc = {"lng": 5, "lat": 52}

    # keep at most 16 images; images rated NSFW are reported as spam instead
    images = []
    nsfw_list = []
    for (url, count) in Counter(imagesList).most_common():
        if len(images) >= 16:
            break
        nsfw_prob, status = get_nsfw_prob(url)
        if status == 200 and nsfw_prob > 0.8:
            nsfw_list.append(image_tweet_id[url])
        elif status == 200:
            images.append({"link": url, "occ": count})
    mark_as_spam.apply_async((nsfw_list, ), queue="web")

    urls = [{"link": url, "occ": count}
            for (url, count) in Counter(URLList).most_common()]

    # limit number of nodes/edges
    edges = random.sample(edges, min(len(edges), 250))
    connected_nodes = set([e["source"] for e in edges] +
                          [e["target"] for e in edges])

    graph = {"nodes": [], "edges": []}
    for node in connected_nodes:
        graph["nodes"].append({"id": nodes[node]})
    for edge in edges:
        graph["edges"].append({
            "source": nodes[edge["source"]],
            "target": nodes[edge["target"]],
            "value": edge["value"]
        })

    unique_ids = list(unique_tweets.values())

    # retweet ids sorted from most to least retweeted. Built without
    # ``zip(*...)`` so that "no retweet with a positive count" gives an empty
    # list instead of a ValueError while unpacking.
    ranked_retweets = sorted((item for item in retweets.items() if item[1] > 0),
                             key=lambda x: x[1],
                             reverse=True)
    if ranked_retweets:
        retweet_ids = tuple(id_str for id_str, _ in ranked_retweets)
    else:
        retweet_ids = []

    start = datetime.strptime(params["start"], time_format)
    end = datetime.strptime(params["end"], time_format)
    items = newsdb.find(
        {
            "keywords": prod,
            "pubdate": {
                "$gte": start,
                "$lt": end
            }
        },
        projection={
            "title": True,
            "pubdate": True,
            "description": True,
            "flag": True,
            "source": True,
            "link": True,
            "nid": True,
            "_id": False
        })
    news = sorted(items, key=lambda x: x["pubdate"], reverse=True)

    data = {
        "tweets": unique_ids,
        "retweets": retweet_ids,
        "interaction_tweets": interaction_tweets,
        "num_tweets": len(tweetList),
        "timeSeries": ts,
        "peaks": peaks,
        "URLs": urls,
        "photos": images,
        "tagCloud": word_cloud,
        "locations": mapLocations,
        "centerloc": avLoc,
        "graph": graph,
        "news": news,
        "polarity": polarity
    }
    return data
def determine_sentiment(body):
    """
    Return the average sentiment of the sentences in ``body``.

    The text is split with ``sent_tokenize``; each sentence is scored with the
    pattern module's ``sentiment`` and the first value of each result is
    averaged. Sentiment values may range from -1 to 1.
    """
    polarities = []
    for sentence in sent_tokenize(body):
        polarities.append(sentiment(sentence)[0])
    return np.mean(polarities)
def determine_subjectivity(body):
    """
    Return the average subjectivity of the sentences in ``body``.

    The text is split with ``sent_tokenize``; each sentence is scored with the
    pattern module's ``sentiment`` and the second value of each result is
    averaged. Subjectivity values may range from 0 to 1.
    """
    subjectivities = []
    for sentence in sent_tokenize(body):
        subjectivities.append(sentiment(sentence)[1])
    return np.mean(subjectivities)
import mysql.connector
from pattern.nl import sentiment


def xstr(s):
    """Return ``s``, or an empty string when ``s`` is falsy (e.g. NULL)."""
    return s or ""


def _ascii_text(value):
    """Return ``value`` as text with every non-ASCII character dropped.

    Decoding the encoded bytes again keeps the result a text string; calling
    ``str()`` on the bytes would produce a literal "b'...'" on Python 3.
    """
    return xstr(value).encode('ascii', 'ignore').decode('ascii')


# One connection streams the rows, the second one writes the scores back.
conn = mysql.connector.connect(user='******', password='******', host='____', database='____', port=12345)
conn2 = mysql.connector.connect(user='******', password='******', host='____', database='____', port=12345)

try:
    cursor = conn.cursor()
    cursor2 = conn2.cursor()

    query = ("SELECT id, titel, tekst FROM ponydb.nieuws;")
    cursor.execute(query)

    # Placeholders let the connector quote the values; the statement is no
    # longer assembled with str.format.
    query2 = ("UPDATE nieuws SET sentiment=%s, subjectiviteit=%s WHERE id=%s")

    # Score title and body together, one news item at a time.
    for (nieuws_id, titel, tekst) in cursor:
        # sentiment(text) returns (polarity, subjectivity).
        sentimentAnalyse = sentiment(_ascii_text(titel) + " " + _ascii_text(tekst))
        cursor2.execute(query2, (sentimentAnalyse[0], sentimentAnalyse[1], nieuws_id))
        conn2.commit()

    cursor2.close()
    cursor.close()
finally:
    # close connections, also when a query or the scoring failed
    conn2.close()
    conn.close()
def insert_lexisnexis(pathwithlnfiles, recursive):
    """
    Parse LexisNexis text exports and insert one document per article.

    Usage: insert_lexisnexis(pathwithlnfiles, recursive)
    pathwithlnfiles = path to a directory where LexisNexis output is stored
    recursive: True  = also search all subdirectories
               False = only look at the directory itself
    In both modes only files whose extension is .txt are read.

    Every article is inserted into ``collection`` as a dict; fields that
    could not be found are set to "NA". Progress and a missing-field report
    are printed. Nothing is returned.
    """
    tekst = {}
    byline = {}
    section = {}
    length = {}
    loaddate = {}
    language = {}
    pubtype = {}
    journal = {}
    journal2 = {}
    pubdate_day = {}
    pubdate_month = {}
    pubdate_year = {}
    pubdate_dayofweek = {}

    if recursive:
        alleinputbestanden = []
        for path, subFolders, files in walk(pathwithlnfiles):
            for naam in files:
                if isfile(join(path, naam)) and splitext(naam)[1].lower() == ".txt":
                    alleinputbestanden.append(join(path, naam))
    else:
        alleinputbestanden = [join(pathwithlnfiles, naam)
                              for naam in listdir(pathwithlnfiles)
                              if isfile(join(pathwithlnfiles, naam))
                              and splitext(naam)[1].lower() == ".txt"]
        print(alleinputbestanden)

    # Compiled once instead of once per input line.
    doc_header = re.compile(r"\s+(\d+) of (\d+) DOCUMENTS")
    date_nl = re.compile(r"\s+(\d{1,2}) (januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december) (\d{4}) (maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)")
    date_us = re.compile(r"\s+(January|February|March|April|May|June|July|August|September|October|November|December) (\d{1,2}), (\d{4})")
    date_en = re.compile(r"\s+(\d{1,2}) (January|February|March|April|May|June|July|August|September|October|November|December) (\d{4}) (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)")
    # Lines starting with one of these (after leading whitespace) are
    # boilerplate and never part of the article text.
    ignored_prefixes = ("Copyright ", "All Rights Reserved")
    newspaper_prefixes = ("AD/Algemeen Dagblad", "De Telegraaf", "Trouw",
                          "de Volkskrant", "NRC Handelsblad", "Metro", "Spits")

    artikel = 0
    for bestand in alleinputbestanden:
        print("Now processing %s" % bestand)
        with open(bestand, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.replace("\r", " ")
                if line == "\n":
                    continue

                if doc_header.match(line):
                    # A new article starts; the first non-empty line after
                    # the header is stored as its secondary source name.
                    artikel += 1
                    tekst[artikel] = ""
                    # Consumes lines from the same iterator as the outer
                    # loop and simply ends at EOF (f.next() raised there).
                    for nextline in f:
                        if nextline.strip() != "":
                            journal2[artikel] = nextline.strip()
                            break
                    continue

                if artikel == 0:
                    # Content before the first document header belongs to no
                    # article; appending it used to fail with a KeyError.
                    continue

                matchObj2 = date_nl.match(line)
                matchObj3 = date_us.match(line)
                matchObj4 = date_en.match(line)
                stripped = line.lstrip()

                if line.startswith("BYLINE"):
                    byline[artikel] = line.replace("BYLINE: ", "").rstrip("\n")
                elif line.startswith("SECTION"):
                    section[artikel] = line.replace("SECTION: ", "").rstrip("\n")
                elif line.startswith("LENGTH"):
                    # NOTE: rstrip takes a character set, not a suffix; it
                    # removes trailing characters out of " woorden".
                    length[artikel] = line.replace("LENGTH: ", "").rstrip("\n").rstrip(" woorden")
                elif line.startswith("LOAD-DATE"):
                    loaddate[artikel] = line.replace("LOAD-DATE: ", "").rstrip("\n")
                elif matchObj2:
                    pubdate_day[artikel] = matchObj2.group(1)
                    pubdate_month[artikel] = str(MAAND[matchObj2.group(2)])
                    pubdate_year[artikel] = matchObj2.group(3)
                    pubdate_dayofweek[artikel] = matchObj2.group(4)
                elif matchObj3:
                    # This date format carries no day of the week.
                    pubdate_day[artikel] = matchObj3.group(2)
                    pubdate_month[artikel] = str(MAAND[matchObj3.group(1)])
                    pubdate_year[artikel] = matchObj3.group(3)
                    pubdate_dayofweek[artikel] = "NA"
                elif matchObj4:
                    pubdate_day[artikel] = matchObj4.group(1)
                    pubdate_month[artikel] = str(MAAND[matchObj4.group(2)])
                    pubdate_year[artikel] = matchObj4.group(3)
                    pubdate_dayofweek[artikel] = matchObj4.group(4)
                elif line.startswith("LANGUAGE"):
                    language[artikel] = line.replace("LANGUAGE: ", "").rstrip("\n")
                elif line.startswith("PUBLICATION-TYPE"):
                    pubtype[artikel] = line.replace("PUBLICATION-TYPE: ", "").rstrip("\n")
                elif line.startswith("JOURNAL-CODE"):
                    journal[artikel] = line.replace("JOURNAL-CODE: ", "").rstrip("\n")
                elif stripped.startswith(ignored_prefixes):
                    pass
                elif stripped.startswith(newspaper_prefixes):
                    pass
                else:
                    tekst[artikel] = tekst[artikel] + " " + line.rstrip("\n")

    print("Done! %s articles added." % artikel)

    # Each entry is (label for the report, parsed values per article number).
    checked_fields = [("journal", journal), ("journal2", journal2),
                      ("loaddate", loaddate), ("pubdate_day", pubdate_day),
                      ("pubdate_month", pubdate_month),
                      ("pubdate_year", pubdate_year),
                      ("pubdate_dayofweek", pubdate_dayofweek),
                      ("section", section), ("language", language),
                      ("byline", byline), ("length", length), ("tekst", tekst)]
    if len(set(len(values) for _, values in checked_fields)) > 1:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!")
        print("Ooooops! Not all articles seem to have data for each field. These are the numbers of fields that where correctly coded (and, of course, they should be equal to the number of articles, which they aren't in all cases.")
        for label, values in checked_fields:
            print("%s %s" % (label, len(values)))
        print("!!!!!!!!!!!!!!!!!!!!!!!!!")
        print("")
        print("Anyhow, we're gonna proceed and set those invalid fields to 'NA'. However, you should be aware of this when analyzing your data!")
    else:
        print("No missing values encountered.")

    suspicious = 0
    for nr in range(1, artikel + 1):
        # dict.get supplies the "NA" placeholder without a bare except.
        art_source = journal.get(nr, "NA")
        art_source2 = journal2.get(nr, "NA")
        art_loaddate = loaddate.get(nr, "NA")
        art_pubdate_day = pubdate_day.get(nr, "NA")
        art_pubdate_month = pubdate_month.get(nr, "NA")
        art_pubdate_year = pubdate_year.get(nr, "NA")
        art_pubdate_dayofweek = pubdate_dayofweek.get(nr, "NA")
        art_section = section.get(nr, "NA")
        art_language = language.get(nr, "NA")
        art_length = length.get(nr, "NA")
        art_text = tekst.get(nr, "NA")
        art_byline = byline.get(nr, "NA")

        try:
            tone = sentiment(art_text)
            art_polarity = str(tone[0])
            art_subjectivity = str(tone[1])
        except Exception:
            # Best effort: an article that cannot be scored is still stored.
            art_polarity = "NA"
            art_subjectivity = "NA"

        # here, we are going to add an extra field for texts that probably are no "real" articles
        # first criterion: stock exchange notations and similar lists:
        tokens = art_text.replace(",", "").replace(".", "").split()
        numeric_tokens = sum(1 for token in tokens if token.isdigit())
        # if more than 16% of the tokens are numbers, then suspicious = True.
        art_suspicious = numeric_tokens > .16 * len(tokens)
        if art_suspicious:
            suspicious += 1

        art = {"source": art_source.lower(), 'source2': art_source2.lower(), "loaddate": art_loaddate,
               "pubdate_day": art_pubdate_day, "pubdate_month": art_pubdate_month,
               "pubdate_year": art_pubdate_year, "pubdate_dayofweek": art_pubdate_dayofweek,
               "section": art_section.lower(), "language": art_language.lower(), 'length': art_length,
               "text": art_text, "byline": art_byline, "from-database": "lexisnexis",
               "suspicious": art_suspicious, "polarity": art_polarity, "subjectivity": art_subjectivity}
        collection.insert(art)

    print("\nInserted %s articles, out of which %s might not be real articles, but, e.g., lists of stock shares. " % (len(tekst), suspicious))
# -*- coding: utf-8 -*-
# Some of the strings that get scraped require utf-8
# pip install pattern
from pattern.nl import sentiment

# Sample texts for experimenting with the classifier.
neutralText = "Het is een vrij rustige dag."
positiveText = "Het product is mij zeer goed bevallen."
negativeText = "Diepbedroeft over de zeer gebrekkige service van deze onervaren werknemer."
text = "De Indiase diplomate die het middelpunt vormde van een hoogoplopende ruzie tussen de VS en India, heeft de VS verlaten. De Amerikaanse autoriteiten vroegen Devyani Khobragade, plaatsvervangend consul in New York, te vertrekken nadat ze officieel in staat van beschuldiging was gesteld wegens visumfraude. Het Indiase ministerie van Buitenlandse Zaken maakte kort daarop bekend dat de diplomate wordt overgeplaatst naar het departement in New Delhi. OnderbetaaldEen rechtbank in New York oordeelde gisteren dat Khobragade valse informatie had gegeven bij de aanvraag van een visum voor haar Indiase huishoudelijke hulp. Ook zou ze hebben verdoezeld dat deze hulp minder betaald kreeg dan het Amerikaanse wettelijk minimumloon. Khobragade's arrestatie en tijdelijke opsluiting, vorige week, wekten grote verontwaardiging in India. Boze Indiƫrs eisten een boycot van Amerikaanse producten in India."

# Classify the text. The function returns 2 values.
# sentiment(text) returns (polarity, subjectivity).
sentimentAnalyse = sentiment(" 10 Verschillen")
# Parenthesised single-argument print: same output on Python 2, valid on
# Python 3.
print(sentimentAnalyse)

# A text is quickly classified as negative. Because of that, we need to
# use a threshold to classify the text as positive, negative, or neutral.
# Without thresholds this would be:
# predictedSentiment = 'positive#' if sentimentAnalyse[0] > 0 else 'negative'
predictedSentiment = 'neutral'
if sentimentAnalyse[0] > 0.4:
    predictedSentiment = 'positive'
elif sentimentAnalyse[0] < -0.2:
    predictedSentiment = 'negative'

predictionSummary = "The provided text is classified as " + str(predictedSentiment) + ' which a score of ' + str(sentimentAnalyse[0])
print(predictionSummary)
def getFeatureValues(indexDictKeywords, indexDictWords, indexDictUser, indexDictDateTweet, indexDictDateEvent, maxList, perDict, indexDictTypes, otherFeatures, missing_value):
    """Build one feature list per event found in the module-level ``data``.

    Reads the globals ``data`` (iterable of tab-separated event lines) and
    ``labeled_data`` (whether column 8 holds a label). Events labelled 'NA'
    are skipped when the data is labeled.

    Each feature list has ``maxList`` slots. The slots written directly here
    are 0 (event score), 1 (keyword occurrences in tweets), 2 (total words),
    3 (average words per tweet), 12 (polarity + 1), 13 (subjectivity),
    14 (number of tweets) and 15 onwards (periodicity features); the
    remaining slots are filled by the helper functions through the
    ``indexDict*`` arguments. For labeled data the label minus 1 is appended
    as the last element.

    ``otherFeatures`` is accepted but not used in this function.

    NOTE(review): the nesting below was reconstructed from a source whose
    line breaks were lost; in particular confirm that the date statistics
    (slot 3 and ``addDateInformation``) belong inside the per-tweet branch.

    Returns:
        list of feature lists, one per processed event.
    """
    finalList = []
    featureList = maxList * [0] ## create list with zero's
    categories = ['Sport','Politiek','Uitzending','Publieksevenement','Software','Bijzondere dag','Sociale actie','Celebrity nieuws','Reclame','Overig']
    total_db_list = []
    anchors = buildAnchorHash() ## get the anchors for DBpedia extraction

    ## add the actual feature values
    for idx,line in enumerate(data):
        if idx % 100 == 0:
            ## progress report every 100 events
            print idx,'van de', len(data)
        splitLine = line.split('\t')

        ## check if we skip the event (annotated as non-event, only possible for labeled data)
        if labeled_data:
            if splitLine[8] == 'NA':
                skip_event = True
            else:
                skip_event = False
        else:
            splitLine.insert(0,'ID') ## unlabeled data does not have an ID, add something irrelevant
            skip_event = False

        if not skip_event: ## check if it actually was an event (and not a non-event)
            featureList = maxList * [0]

            ## get all event information
            dateEvent, eventScore, dateEventString, keywords, keywordsFixed, keywordScores, allTweets = get_event_information(splitLine, categories)

            ## add 1 (positive) at the right place in the feature-file using the dictionary
            featureList[0] = eventScore ## first feature is event score
            featureList = add_to_feature_dict(indexDictDateEvent, dateEventString, featureList)
            for keyword in keywordsFixed:
                keyword = unicode(keyword, 'utf-8')
                featureList = add_to_feature_dict(indexDictKeywords, keyword, featureList)

            allTweetsText = ''

            ## loop over all tweets to obtain the right values for the tweet-features
            for tweet in allTweets:
                ## a tweet is "user,date,text"; the text itself may contain commas
                splitTweet = tweet.strip().split(',')
                if len(splitTweet) > 2:
                    user = splitTweet[0].strip() ## add user information
                    dateTweetString = splitTweet[1].strip()
                    ## only accept rows with a short user field and a dashed date field
                    if len(user) < 16 and '-' in dateTweetString:
                        neededTweet = ",".join(splitTweet[2:]).split()
                        finalTweet = [x.strip() for x in neededTweet if len(x) > 1 or x.isalpha() or x.isdigit()] ## delete everything that is only 1 character and non-letter/digit
                        allTweetsText += ' ' + (" ".join(finalTweet))
                        keywordsInTweet = 0
                        for word in finalTweet:
                            add_word = unicode(word, 'utf-8')
                            featureList = add_to_feature_dict(indexDictWords, add_word, featureList) ## add information about the word (note: can be binary or total number of occurences, right now it is total)
                            if word in keywordsFixed:
                                keywordsInTweet += 1

                        featureList[1] += keywordsInTweet ## add how often a keyword occured
                        featureList[2] += len(finalTweet) ## keep track of total number of words as a feature
                        user = unicode(user, 'utf-8')
                        featureList = add_to_feature_dict(indexDictUser, user, featureList) ## add username as same way as bag-of-words

                        ## add date information to the featurelist
                        featureList = add_to_feature_dict(indexDictDateTweet, dateTweetString, featureList)
                        dateTweet = datetime.datetime.strptime(splitTweet[1].strip(),"%Y-%m-%d")
                        beforeAfter, diff, diffTotal, absDiffTotal = getDateInformation(dateTweet, dateEvent)
                        totalTweets = len(allTweets)
                        featureList[3] = round(float(featureList[2]) / float(totalTweets),1) ## add average number of words per tweet
                        featureList = addDateInformation(featureList, beforeAfter, diffTotal, absDiffTotal, totalTweets) ## add all information regarding dates

            ## the sentiment is computed once over the text of all accepted tweets
            senti = sentiment(allTweetsText) ## add sentiment and subjective information
            featureList[12] = senti[0] + 1 ## add +1 due to some classifiers not able to handle negative numbers (polarity)
            featureList[13] = senti[1] ## subjectivity
            featureList[14] = len(allTweets) ## add number of tweets

            per = getPeriodicityFeatures(keywordsFixed, keywordScores, dateEvent, perDict, missing_value) ## periodicity features
            for x in range(0, len(per)): ## add the features
                featureList[x+15] = per[x]

            featureList = getDbpediaFeatures(keywords, indexDictTypes, featureList, anchors) ## DBpedia features

            if labeled_data:
                featureList.append(int(splitLine[8]) -1) ## add the label as number

            finalList.append(featureList) ## keep track of the final featureList over all events

    return finalList