import requests
from bs4 import BeautifulSoup
from pattern.en import sentiment
from pattern.db import Datasheet, pd

counter = 0  # global row id for the output CSV (assumed to be initialised at module level)


def scrape_news_text(news_url):
    global counter
    news_html = requests.get(news_url).content
    # print(news_html)

    # Convert the HTML to a BeautifulSoup object.
    news_soup = BeautifulSoup(news_html, 'lxml')
    # paragraphs = [par.text for par in news_soup.find_all('p')]
    # news_text = '\n'.join(paragraphs)

    date_object = news_soup.find(itemprop="datePublished")
    news_object = news_soup.find("div", {"id": "articleText"})
    if date_object is None:
        return " "
    if news_object is None:
        return " "

    news_date = date_object.get_text()
    news_text = news_object.text
    print(news_url)

    try:
        # We'll store the scraped rows in a Datasheet.
        # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
        # In the first column, we'll store a unique id for each row.
        # The pd() function returns the parent directory of this script + any given path.
        table = Datasheet.load(pd("nasdaq2.csv"))
    except:
        table = Datasheet()

    news_sentiment = sentiment(news_text)
    print(news_sentiment)
    table.append([counter, news_date, news_url, news_sentiment])
    table.save(pd("nasdaq2.csv"))
    counter += 1
    return news_text
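# A possible driver for scrape_news_text(), sketched here for illustration only:
# the URLs below are placeholders, not real article links, and the function is
# assumed to be called repeatedly so that rows accumulate in nasdaq2.csv.
article_urls = [
    "http://www.example.com/markets/article-1",
    "http://www.example.com/markets/article-2",
]
for url in article_urls:
    text = scrape_news_text(url)
    if text.strip():
        print(text[:200])  # preview the first 200 characters of the article body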
def load_domains(self):
    sources_path = pd('data', 'source_data.csv')
    domain_file = Datasheet.load(sources_path, headers=True)
    for row in domain_file:
        url = row[1]
        cats = row[2:]
        self.cat_dict[url] = cats
def main():
    logging.basicConfig(level=logging.INFO)
    argparser = ArgumentParser(description=__doc__)
    argparser.add_argument("-t", "--trainset", action="store", default=None,
                           help="Path to training data [default: %(default)s]")
    argparser.add_argument("-m", "--model", action="store",
                           help="Path to model")
    argparser.add_argument("-d", "--dump", action="store_true",
                           help="Pickle trained model? [default: False]")
    argparser.add_argument("-v", "--verbose", action="store_true", default=False,
                           help="Verbose [default: quiet]")
    argparser.add_argument("-c", "--classify", action="store", default=None,
                           help="Path to data to classify [default: %(default)s]")
    argparser.add_argument("-s", "--save", action="store", default='output.csv',
                           help="Path to output file [default: output.csv]")
    args = argparser.parse_args()

    clf = SensationalismClassifier(train_data=args.trainset,
                                   model=args.model,
                                   dump=args.dump,
                                   debug=args.verbose)

    if args.classify:
        OUTPUT_PATH = args.save
        if clf.debug:
            tick = time()
        to_classify = Datasheet.load(args.classify)
        classified_data = clf.classify(to_classify)
        output = Datasheet(classified_data)
        output.save(pd(OUTPUT_PATH))
        if clf.debug:
            sys.stderr.write("\nProcessed %d items in %0.2fs"
                             % (len(classified_data), time() - tick))
def parse(path):
    # 1) Parse the Excel sheet at the given path (xlsx()).
    # 2) Map the list of lists to a list of dicts (assoc()).
    # 3) If a column contains splitable values (e.g., "1,2,3"),
    # 4) split the values in that column.
    rows = list(assoc(xlsx(pd(path))))  # 1 + 2
    for k in rows[0].keys():
        if splitable(col(rows, k)):     # 3
            for r in rows:
                r[k] = split(r[k])      # 4
    return rows
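# splitable(), col() and split() are not defined in this excerpt.
# A minimal sketch of what they might look like, assuming comma-separated cell
# values; the names and signatures simply follow the calls in parse() above.
def col(rows, k):
    # All values of column k across the list of row dicts.
    return [r[k] for r in rows]

def splitable(values, separator=","):
    # One possible test: every non-empty value in the column contains the separator.
    values = [v for v in values if v not in (None, "")]
    return bool(values) and all(separator in str(v) for v in values)

def split(value, separator=","):
    # "1,2,3" -> ["1", "2", "3"]
    return [v.strip() for v in str(value).split(separator)]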
def load_domains(self):
    """Loads domain information."""
    sources_path = pd('data', 'source_data.csv')
    domain_file = Datasheet.load(sources_path, headers=True)
    for row in domain_file:
        url = row[1]
        # If the last column contains quoted text, it is not a category; drop it.
        if '"' in str(row[-1]):
            cats = row[2:-1]
        else:
            cats = row[2:]
        self.cat_dict[url] = cats
def enrolTwitter(thread, location):
    # This function searches for the thread based on the specified location, cleans the results,
    # and enrols the "cleaned" data into a CSV-based file for analysis.

    # Marker (RT) set up for removal (improves sentiment analysis results).
    remove_list = ['RT']
    i = None  # initializing placeholder value to store Tweet IDs (ensures uniqueness)

    for j in range(2):
        # Count controls the number of streams returned at one time.
        # RTs count as unique tweets, as long as the handler ID is unique.
        print "Iteration Number", (j + 1)  # for humans to understand lulz
        print
        for tweet in twitter.search(thread, geo=geocode(location)[:2], start=i, count=5, cached=False):
            # Add the tweet only if its ID doesn't exist already.
            if len(table) == 0 or tweet.id not in index:
                # A series of sentence-level filters designed for Twitter handles:
                # decompose the tweet into words and weed out items in remove_list.
                dcomp_tweet = (tweet.text).split()
                # Remove RTs from the tweet.
                recombined_tweet = ' '.join([k for k in dcomp_tweet if k not in remove_list])
                # Remove hashtag-related content.
                stringwithouthash = re.sub(r'#\w+ ?', '', recombined_tweet)
                # Remove http/s-related content.
                stringwithouthttp = re.sub(r'http\S+', '', stringwithouthash)
                # Remove @-related content.
                finalstring = re.sub(r'@\w+ ?', '', stringwithouthttp)
                print finalstring
                # Analyze the string's sentiment value.
                polarityVal, subjVal = checkSentiment(finalstring)
                # Analyze the string's certainty.
                modalVal = checkModality(finalstring)
                table.append([tweet.id, finalstring, location,
                              polarityVal, subjVal, modalVal, thread])
                index.add(tweet.id)
            # Continue mining for older tweets (varied by j value) in the second iteration.
            i = tweet.id

    # Commit saves to the parent directory of the Python file.
    print
    print
    print
    print "Harvest complete - saving to:", os.getcwd()
    table.save(pd("analysis.csv"), headers=False)
    print
    print "Total unique entries in table:", len(table)
    print
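# checkSentiment() and checkModality() are not defined in this excerpt.
# Given the pattern.en imports used elsewhere in this codebase, they are presumably
# thin wrappers along these lines (names and signatures assumed, not the original code):
from pattern.en import sentiment, modality, parse, Sentence

def checkSentiment(text):
    # sentiment() returns a (polarity, subjectivity)-tuple:
    # polarity in [-1.0, +1.0], subjectivity in [0.0, 1.0].
    polarityVal, subjVal = sentiment(text)
    return polarityVal, subjVal

def checkModality(text):
    # modality() expects a parsed Sentence and returns a certainty score in [-1.0, +1.0].
    return modality(Sentence(parse(text, lemmata=True)))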
def getTweetSecureLoad(self, topic):
    # This method retrieves tweets containing the given keywords from Twitter.
    self.search_topic = topic
    print 'CLASS (Twitter_PatternPKG) - Twitter Secure Initial Load - Topic: ' + self.search_topic
    self.search_topic = topic + ' film'
    try:
        # We'll store tweets in a Datasheet.
        # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
        # In the first column, we'll store a unique id for each tweet.
        # We only want to add the latest tweets, i.e., those we haven't seen yet.
        # With an index on the first column we can quickly check if an id already exists.
        # The pd() function returns the parent directory of this script + any given path.
        table = Datasheet.load(pd(self.FILE_STORAGE))
        # index = set(table.columns[0])
        index = set(table.columns[4])  # index on the text column
    except:
        table = Datasheet()
        index = set()

    engine = Twitter(language="en")

    # With Twitter.search(cached=False), a "live" request is sent to Twitter:
    # we get the most recent results instead of those in the local cache.
    # Keeping a local cache can also be useful (e.g., while testing)
    # because a query is instant when it is executed the second time.
    prev = None
    #searchThisSubjects = search_topic

    # Put headers.
    table.append(["tweet_id", "tweet_date", "InputSubject", "Tweet_text"])

    #for oneSubject in searchThisSubjects:
    oneSubject = self.search_topic
    tweet_list_Json = []  # list of JSONs
    tweet_list = []
    try:
        for i in range(1):
            for tweet in engine.search(oneSubject, start=prev, count=8, cached=False):
                if 'http' in tweet.text:
                    posi = tweet.text.index('http')
                    tweet.text = tweet.text[0:posi - 1]
                # Only add the tweet to the table if it doesn't already exist.
                if len(table) == 0 or tweet.text not in index:
                    table.append([tweet.id, tweet.date, oneSubject, tweet.text])
                    index.add(tweet.text)
                    tweet_list.append([tweet.id, tweet.date, oneSubject, tweet.text])
                    tweet.text = filter(lambda x: x in string.printable, tweet.text)  # remove non-printable characters
                    tweet.text = tweet.text.replace('"', '')   # remove quotes
                    tweet.text = tweet.text.replace('\n', '')  # remove newlines
                    # Remove the artificial ' film' suffix before building the JSON.
                    tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject.replace(' film', ''), tweet.text)
                    tweet_list_Json.append(tweetJson)
                    #print tweetJson
                    # BUILD A JSON
                    # http://stackoverflow.com/questions/14547916/how-can-i-loop-over-entries-in-json
                    # BUILD A LIST OF DICTIONARIES
                    # http://stackoverflow.com/questions/2733813/iterating-through-a-json-object
                # Continue mining older tweets in the next iteration.
                prev = tweet.text
    except Exception:
        print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - ERROR!! - ([twitter_patternPkg_connector] getTweetSecureLoad)'
        print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - ERROR!! (film: ' + oneSubject + ')'
        pass

    # Create a .csv in pattern/examples/01-web/
    # table.save(pd("OD_CK1_Source4_Tweeter_InitialLoad.csv"))
    print "CLASS (Twitter_PatternPKG) - Total Secure Twitter Load: " + str(len(table)) + '\n'
    #print json.dumps(tweet_list)
    # return tweet_list
    return tweet_list_Json
# coding: utf-8
from pattern.web import Twitter
from pattern.db import Database, SQLITE
from pattern.db import pd
from pattern.db import field, pk, INTEGER, UNIQUE, STRING
from sqlite3 import IntegrityError

team = ['#galo', '#Galo', '#Atletico-MG', '#atletico mineiro']

twitter = Twitter()
db = Database(pd('tweets.db'))

if not "tweets" in db:
    db.create("tweets", fields=(
        pk(),
        field('code', INTEGER, UNIQUE),
        field('text', STRING(140))
    ))

# Query Twitter for each team hashtag.
for hashtag in team:
    for tweet in twitter.search(hashtag):
        try:
            db.tweets.append(code=tweet.id, text=tweet.text)
        except IntegrityError:
            pass

# Print the tweets stored in the database.
for data in db.tweets.filter():
    print data[2]
    print '-' * 30
from pattern.web import URL
from pattern.web import DOM
from pattern.web import plaintext
from pattern.web import Newsfeed
from pattern.db import Datasheet
from pattern.db import pd

feeds = {
    'boorish': 'http://feeds.feedburner.com/daily-star-Real-Life',
    'dramatic': 'http://feeds.feedburner.com/daily-star-Latest-News',
    'geeky': 'http://feeds.feedburner.com/daily-star-Tech',
    'dubious': 'http://feeds.feedburner.com/daily-star-Weird-News',
    'vulgar': 'http://feeds.feedburner.com/daily-star-Love-Sex',
}

PATH = pd('..', 'data', 'news2.csv')  # pd() = parent directory of this script

try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[0])
except:
    csv = Datasheet()
    seen = set()

for genre, url in feeds.items():
    for r in Newsfeed().search(url, cached=False):
        if r.url not in seen:
            print r.title
            print
            try:
                src = URL(r.url).download(cached=True)
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Twitter, cache
from pattern.en import sentiment  # polarity and subjectivity: is the text positive/negative, and how much of an opinion is it?
from pattern.en import modality   # modality: degree of certainty, from fact to opinion
from pattern.db import Datasheet, pprint, pd, SUM, AVG, STDEV, INTEGER, STRING

import re        # regex library for processing sentences
import datetime  # system date and time
import time      # for the sleep function

# Patch 1.0b - matplotlib display.
import numpy as np
import matplotlib.pyplot as plt

# Open a CSV (comma-separated values) file for identifying and storing unique tweets.
try:
    table = Datasheet.load(pd("analysis.csv"), headers=False)
    index = set(table.columns[0])
except:
    # If the file doesn't exist yet, start a new table with the expected fields.
    table = Datasheet(fields=[("id", INTEGER), ("content", STRING), ("location", STRING),
                              ("polarity", STRING), ("subjectivity", STRING), ("modality", STRING)])
    index = set()

# Consumer keys and GPG-related content purged.

# Twitter object, to search for precise stream-based information.
twitter = Twitter(license=None, throttle=0.5, language='en')
# The script produces "good-evil.csv", a dataset of 18,000+ tweets
# of which we know that people are discussing a good or an evil character.
# We can use it as training material to create a classifier that predicts
# good or evil for tweets that mention unknown characters.

# First we prepare the training data:

import re

URL = re.compile(r"https?://[^\s]+")            # http://www.emrg.be
REF = re.compile(r"@[a-z0-9_./]+", flags=re.I)  # @tom_de_smedt

from pattern.db import Datasheet, pd
from pattern.vector import SVM, kfoldcv

train = []
for name, alignment, tweet in Datasheet.load(pd("good-evil.csv")):
    tweet = URL.sub("http://", tweet)  # Anonymize URLs.
    tweet = REF.sub("@friend", tweet)  # Anonymize usernames.
    train.append((ngram_vector(tweet, 5), alignment))

# ------------------------------------------------------------------------------------
# Let's look at the statistical accuracy of the classifier:

print kfoldcv(SVM, train, folds=3)
print

# This returns an (accuracy, precision, recall, F1-score, stdev)-tuple.
# The F1-score is the most important.
# An SVM trained on our data would be 94.6% accurate in knowing good from evil
# (this is a suspiciously high accuracy).
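# ngram_vector() is not defined in this excerpt; it would need to be defined before
# the training loop above. A minimal stand-in, assuming it turns a tweet into a bag
# of character n-grams (a dict-like object is what pattern.vector classifiers accept):
from collections import Counter

def ngram_vector(s, n=3):
    # e.g., ngram_vector("hello", 3) -> {"hel": 1, "ell": 1, "llo": 1}
    s = s.lower()
    return Counter(s[i:i + n] for i in range(len(s) - n + 1))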
from pattern.db import Database, SQLITE
from pattern.db import field, pk, STRING, INTEGER, DATE, NOW
from pattern.db import assoc
from pattern.db import rel
from pattern.db import pd  # pd() = parent directory of current script.

# In this example, we'll build a mini-store:
# with products, customers and orders.
# We can combine the data from the three tables in an invoice query.

# Create a new database.
# Once it is created, you can use Database(name) to access it.
# SQLite will create the database file in the current folder.
# MySQL databases require a username and a password.
# MySQL also requires that you install MySQLdb, see the installation instructions at:
# http://www.clips.ua.ac.be/pages/pattern-db
db = Database(pd("store.db"), type=SQLITE)
#db._delete()

# PRODUCTS
# Create the products table if it doesn't exist yet.
# An error will be raised if the table already exists.
# Add sample data.
if not "products" in db:
    # Note: in SQLite, the STRING type is mapped to TEXT (unlimited length).
    # In MySQL, the length matters. Smaller fields have faster lookup.
    schema = (
        pk(),  # Auto-incremental id.
        field("description", STRING(50)),
        field("price", INTEGER)
    )
    db.create("products", schema)
import sys, termios, tty, os, time
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.db import Datasheet, pprint, pd
from pattern.en import sentiment, polarity, subjectivity, positive

try:
    # We'll store tweets in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each tweet.
    # We only want to add the latest tweets, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    # The pd() function returns the parent directory of this script + any given path.
    table = Datasheet.load(pd("tweets.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

prev = '1071765537749917696'
counter = 0
while counter < 1000:
    counter += 1
    time.sleep(60)
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.db import Datasheet, pprint, pd
import random

# This example retrieves tweets containing given keywords from Twitter.

try:
    # We'll store tweets in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each tweet.
    # We only want to add the latest tweets, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    # The pd() function returns the parent directory of this script + any given path.
    table = Datasheet.load(pd("eulogy.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
search_term = 'beat'
prev = None
for i in range(2):
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.db import Datasheet, pprint, pd

# This example retrieves tweets containing given keywords from Twitter.

try:
    # We'll store tweets in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each tweet.
    # We only want to add the latest tweets, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    # The pd() function returns the parent directory of this script + any given path.
    table = Datasheet.load(pd("black.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("#blacklivesmatter",
def parse_rows(path):
    rows = list(assoc(xlsx(pd(path))))  # 1 + 2
    #rows = xlsx(pd(path))              # 1 + 2
    return rows
from pattern.db import Database, SQLITE
from pattern.db import field, pk, STRING, INTEGER, DATE, NOW
from pattern.db import assoc
from pattern.db import rel
from pattern.db import pd  # pd() = parent directory of current script.

# In this example, we'll build a mini-store:
# with products, customers and orders.
# We can combine the data from the three tables in an invoice query.

# Create a new database.
# Once it is created, you can use Database(name) to access it.
# SQLite will create the database file in the current folder.
# MySQL databases require a username and a password.
# MySQL also requires that you install MySQLdb, see the installation instructions at:
# http://www.clips.ua.ac.be/pages/pattern-db
db = Database(pd("store.db"), type=SQLITE)
# db._delete()

# PRODUCTS
# Create the products table if it doesn't exist yet.
# An error will be raised if the table already exists.
# Add sample data.
if not "products" in db:
    # Note: in SQLite, the STRING type is mapped to TEXT (unlimited length).
    # In MySQL, the length matters. Smaller fields have faster lookup.
    schema = (
        pk(),  # Auto-incremental id.
        field("description", STRING(50)),
        field("price", INTEGER))
    db.create("products", schema)
    db.products.append(description="pizza", price=15)
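# The comments above mention combining products, customers and orders in an invoice
# query, but only the products table is created in this excerpt. A sketch of what
# such a query could look like with pattern.db's rel(), assuming customers and orders
# tables where each order row has "customer" and "product" foreign-key fields:
q = db.orders.search(
    fields=[
        "customers.name",        # who placed the order
        "products.description",  # what was ordered
        "products.price"         # at what price
    ],
    relations=[
        rel("customer", "customers.id", "customers"),  # orders.customer -> customers.id
        rel("product", "products.id", "products")      # orders.product  -> products.id
    ]
)
for row in q.rows():
    print row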
# Put the file "SentiWordNet*.txt" in pattern/en/wordnet/
# You can then use Synset.weight() and wordnet.sentiwordnet:
#from pattern.en import wordnet, ADJECTIVE
#print wordnet.synsets("horrible", pos=ADJECTIVE)[0].weight  # Yields a (polarity, subjectivity)-tuple.
#print wordnet.sentiwordnet["horrible"]

# For fine-grained analysis,
# the return value of sentiment() has a special "assessments" property.
# Each assessment is a (chunk, polarity, subjectivity, label)-tuple,
# where chunk is a list of words (e.g., "not very good").
# The label offers additional meta-information.
# For example, its value is MOOD for emoticons:

import sys
from pattern.en import sentiment
from pattern.db import Datasheet, pd

try:
    table = Datasheet.load(pd("../../singleLife.csv"))
    index = set(table.columns[0])
except Exception as e:
    print e
    sys.exit()

for i in range(len(table)):
    text = table[i][1]
    sent = sentiment(text)
    print sent[0], sent[1], text
    table[i].append(sent[0])
    table[i].append(sent[1])

table.save(pd("cool.csv"))
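# The "assessments" property described above is not exercised in the loop;
# a short illustration using the same sentiment() call (the example text is a
# placeholder, and the returned values depend on the lexicon):
s = sentiment("The sequel tries hard to be surreal, but it is seriously boring. :-(")
for assessment in s.assessments:
    # Each assessment is a (chunk, polarity, subjectivity, label)-tuple,
    # e.g., the chunk ["seriously", "boring"], or the emoticon [":-("] labeled MOOD.
    print assessment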
w = Wiktionary(language="en")
f = csv()  # csv() is a short alias for Datasheet().

# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.
# The pd() function returns the parent directory of the current script,
# so pd("given-names.csv") = pattern/examples/01-web/given-names.csv.

for gender in ("male", "female"):
    for ch in ("abcdefghijklmnopqrstuvwxyz"):
        p = w.search("Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize()), cached=True)
        for name in p.links:
            if not name.startswith("Appendix:"):
                f.append((name, gender[0]))
        f.save(pd("given-names.csv"))
        print(ch, gender)

# Create a classifier that predicts gender based on name.

from pattern.vector import SVM, chngrams, count, kfoldcv

class GenderByName(SVM):

    def train(self, name, gender=None):
        SVM.train(self, self.vector(name), gender)

    def classify(self, name):
        return SVM.classify(self, self.vector(name))

    def vector(self, name):
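        # The original body of vector() is cut off in this excerpt.
        # A minimal hypothetical completion, assuming the classifier represents a
        # name as counts of character bigrams (suggested by the chngrams/count
        # imports above), e.g., "emma" -> {"em": 1, "mm": 1, "ma": 1}:
        return count(chngrams(name, n=2))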
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.db import Datasheet, pprint, pd

# This example retrieves tweets containing given keywords from Twitter.

try:
    # We'll store tweets in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each tweet.
    # We only want to add the latest tweets, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    # The pd() function returns the parent directory of this script + any given path.
    table = Datasheet.load(pd("eulogy.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("eulogy", start=prev, count=25, cached=False):
from pattern.web import Twitter

# The pattern.db module has tools to work with data:
# SQLite and MySQL databases, .csv files, date parsers, ...

# The easiest way to store structured data is as a CSV
# ("comma-separated values"), a plain text file where
# each new line is a new row of data, and where columns
# are separated by ",".
# http://www.clips.ua.ac.be/pages/pattern-db#datasheet

from pattern.db import Datasheet
from pattern.db import pd

# pd() returns the parent directory of this script,
# so pd("tweets.csv") refers to a "tweets.csv" file in the same folder as this script.
PATH = pd("tweets.csv")
#print PATH

try:
    # If a .csv file already exists, open that one and append new data to it.
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[0])
except:
    # If a .csv file doesn't exist yet, create a new one.
    csv = Datasheet()
    seen = set()

# The "seen" variable is a set (= list of unique values)
# that contains the values in the first column of the CSV.
# In other words, it contains the id's of the tweets.
# We can use it to check if we have already seen a tweet,
prev = None
print "processing word:", word
for tweet in twitter.search(word, start=prev, cached=False, count=200):
    # print
    # print tweet.text
    # print tweet.author
    # print tweet.date
    # print hashtags(tweet.text)
    # print
    clean_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet.text).split())
    if tweet.id not in index and clean_text not in texts:
        table.append([tweet.id, tweet.text, clean_text, hashtags(tweet.text)])
        index.add(tweet.id)
        texts.add(clean_text)
    prev = tweet.id

# table.save(pd("tweets_threats.csv"))
# pprint(table, truncate=100)
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.db import Datasheet, pprint, pd

# This example retrieves tweets containing given keywords from Twitter.

try:
    # We'll store tweets in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each tweet.
    # We only want to add the latest tweets, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    # The pd() function returns the parent directory of this script + any given path.
    table = Datasheet.load(pd("cool.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("is cooler than", start=prev, count=25, cached=False):
    #(1, 'Radio Centraal'): 'https://redactie.radiocentraal.be/Home/feed/',
    #(1, 'Trouw'): 'https://www.trouw.nl/home/rss.xml',
    #('links', 'Marxisme.be'): 'https://nl.marxisme.be/marxisme-vandaag/feed/',
    #(1, 'Uitpers'): 'http://www.uitpers.be/feed/',
    #(1, 'Krapuul'): 'http://www.krapuul.nl/feed/',
    (-1, 'sceptr.net'): 'https://sceptr.net/feed/',
    (-1, 're-act.be'): 'http://www.krapuul.nl/feed/',
    (-1, 'eunmask.wordpress.com'): 'https://eunmask.wordpress.com/feed/',
    (-1, 'ejbron.wordpress.com'): 'https://ejbron.wordpress.com/feed/'
}

PATH = pd('news.csv')

try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[-2])  # use url as id
except:
    csv = Datasheet()
    seen = set()

for (label, name), url in sources.items():
    try:
        f = Newsfeed()
        f = f.search(url, cached=False)
    except:
        continue
f = csv()  # csv() is a short alias for Datasheet().

# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.
# The pd() function returns the parent directory of the current script,
# so pd("given-names.csv") = pattern/examples/01-web/given-names.csv.

for gender in ("male", "female"):
    for ch in ("abcdefghijklmnopqrstuvwxyz"):
        p = w.search("Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize()), cached=True)
        for name in p.links:
            if not name.startswith("Appendix:"):
                f.append((name, gender[0]))
        f.save(pd("given-names.csv"))
        print(ch, gender)

# Create a classifier that predicts gender based on name.

from pattern.vector import SVM, chngrams, count, kfoldcv

class GenderByName(SVM):

    def train(self, name, gender=None):
        SVM.train(self, self.vector(name), gender)

    def classify(self, name):
        return SVM.classify(self, self.vector(name))

    def vector(self, name):
from pattern.en import parsetree
from pattern.web import Twitter
from pattern.db import Datasheet, pd


def adjectives(s):
    # Return the sorted, unique adjectives (JJ tags) in a string.
    a = set()  # set ~= list of unique values
    t = parsetree(s)
    for sentence in t:
        for word in sentence.words:
            if word.tag and word.tag == "JJ":
                a.add(word.string.lower())
    return list(sorted(a))

#print adjectives("I'm melting! Meeelting! What a wicked and cruel world!")

# ------------------------------------------------------------------------------------
# See tweets.py

csv = Datasheet()
PATH = pd("properties.csv")

try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[0])
except:
    csv = Datasheet()
    seen = set()

twitter = Twitter(language="en", license=None)

for name in celebrities():
    id = None
    for tweet in twitter.search(name, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
from pattern.en import parsetree
from pattern.web import Twitter
from pattern.db import Datasheet, pd


def adjectives(s):
    # Return the sorted, unique adjectives (JJ tags) in a string.
    a = set()
    t = parsetree(s)
    for sentence in t:
        for word in sentence.words:
            if word.tag and word.tag == "JJ":
                a.add(word.string.lower())
    return list(sorted(a))

#print adjectives("I'm melting! Meeelting! What a wicked and cruel world!")

# ------------------------------------------------------------------------------------
# See tweets.py

csv = Datasheet()
PATH = pd("properties.csv")

try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[0])
except:
    csv = Datasheet()
    seen = set()

twitter = Twitter(language="en", license=None)

for name in celebrities():
    id = None
    for tweet in twitter.search(name, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
        'http://rssfeeds.usatoday.com/usatoday-NewsTopStories',
    (0, '', 'real', 'Financial Times'):
        'http://www.ft.com/rss/world',
    (0, '', 'real', 'Associated Press'):
        'http://hosted2.ap.org/atom/APDEFAULT/3d281c11a96b4ad082fe88aa0db04305',
    (0, '', 'real', 'The Diplomat'):
        'http://thediplomat.com/feed/',
    (0, '', 'real', 'United Press International'):
        'http://rss.upi.com/news/news.rss',
    (0, '', 'joke', 'The Onion'):
        'http://www.theonion.com/feeds/rss',
    (4, 'right', 'joke', 'National Report'):
        'http://feeds.feedburner.com/NationalReport',
}

PATH = pd('..', 'data', 'news1.csv')

try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[-2])  # use url as id
except:
    csv = Datasheet()
    seen = set()

for (level, bias, label, name), url in sources.items():
    try:
        f = Newsfeed()
        f = f.search(url, cached=False)
    except:
        continue
# in which each item is a list of column values.
# For example:
# [["trope1", "movie1, movie2, ...", "description"],
#  ["trope2", "movie3, movie4, ...", "description"]
# ]
# The pd() function means "parent directory".
# It points to the folder that contains the script you are looking at.
# So, if you have a "data.csv" file in the same folder as this script,
# you can reference it from this script with pd("data.csv").

tropes = {}  # {trope1: [movie1, movie2, ...], ...}
movies = {}  # {movie1: [trope1, trope2, ...], ...}

# Read each row in the .csv file.
for trope, examples, description in Datasheet.load(pd("tropes.csv")):
    # The example movies that use this trope are separated by a newline (\n).
    # Split the string into a list:
    examples = examples.split("\n")
    # Add each new trope to the tropes dictionary.
    if not trope in tropes:
        tropes[trope] = set()  # set() is like a list, but never contains duplicates.
    # Add each new movie to the movies dictionary.
    for movie in examples:
        if not movie in movies:
            movies[movie] = set()
        movies[movie].add(trope)
        tropes[trope].add(movie)

print len(tropes), "tropes"
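# With the two dictionaries in place, cross-referencing is straightforward.
# A short usage sketch; the titles below are placeholders, so substitute any
# keys that actually occur in tropes.csv:
m1, m2 = "movie1", "movie2"
if m1 in movies and m2 in movies:
    shared = movies[m1] & movies[m2]  # tropes the two movies have in common
    print len(shared), "shared tropes between", m1, "and", m2

# The ten tropes that appear in the most movies:
for trope in sorted(tropes, key=lambda t: len(tropes[t]), reverse=True)[:10]:
    print trope, len(tropes[trope])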
from pattern.web import Facebook
from pattern.db import Datasheet, pd

# This requires a personal license key.
# If you are logged in to Facebook, you can get a license key here:
# http://www.clips.ua.ac.be/pattern-facebook
# (We don't / can't store your information).

# 1) Searching for public status updates.
#    Search for all status updates that contain the word "horrible".

try:
    # We'll store the status updates in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each status update.
    # We only want to add new status updates, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    table = Datasheet.load(pd("opinions.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

fb = Facebook()

# With Facebook.search(cached=False), a "live" request is sent to Facebook:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
for status in fb.search("horrible", count=25, cached=False):
    print("=" * 100)
    print(status.id)
    print(status.text.encode("utf-8"))
# ------------------------------------------------------------------------------------
# This example demonstrates a semantic network of common sense.
# A semantic network is a graph where nodes represent concepts
# and edges (= connections between nodes) represent semantical
# relations (e.g., "is-a", "is-part-of", "is-property-of", ...).

# The data was collected manually and consists of about 10,000
# triples (concept1 -> relation -> concept2).
# The visual tool for adding new triples is online at:
# http://nodebox.net/perception
# The data is bundled in Pattern as a .csv file.

from pattern.graph import Graph, MODULE  # MODULE = path to pattern/graph/, which holds commonsense.csv
from pattern.db import Datasheet, pd

data = pd(MODULE, "commonsense.csv")
data = Datasheet.load(data)

# Create the graph:
g = Graph()
for concept1, relation, concept2, context, weight in data:
    g.add_node(concept1)
    g.add_node(concept2)
    g.add_edge(concept1, concept2, type=relation, weight=min(int(weight) * 0.1, 1.0))

# ------------------------------------------------------------------------------------
# The halo of a node is a semantical representation of a concept.
# The halo is made up of other concepts directly or indirectly related to this concept,
# defining it.
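# The excerpt stops before computing a halo. A minimal sketch of the idea,
# assuming pattern.graph's Node.flatten(), which returns the node plus the nodes
# connected to it up to the given depth; "creepy" is just an example concept id:
node = g["creepy"]            # look up a concept node by id
halo = node.flatten(depth=2)  # the node itself + neighbours up to depth 2
print [n.id for n in halo]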