Example #1
def scrape_news_text(news_url):

    global counter

    news_html = requests.get(news_url).content

    #    print(news_html)
    '''convert html to BeautifulSoup object'''
    news_soup = BeautifulSoup(news_html, 'lxml')
    # soup.find("div", {"id": "articlebody"})
    #    paragraphs = [par.text for par in news_soup.find_all('p')]
    #    news_text = '\n'.join(paragraphs)

    #    print(news_soup.find("div", {"id": "articleText"}))

    date_object = news_soup.find(itemprop="datePublished")
    news_object = news_soup.find("div", {"id": "articleText"})

    if date_object is None:
        return "  "

    if news_object is None:
        return "   "

    news_date = date_object.get_text()
    news_text = news_object.text

    #    print(news_date)
    #    print(news_text)
    print(news_url)

    try:
        # We'll store results in a Datasheet.
        # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
        # The pd() function returns the parent directory of this script + any given path.
        table = Datasheet.load(pd("nasdaq2.csv"))
    except:
        table = Datasheet()

    news_sentiment = sentiment(news_text)

    print(news_sentiment)

    table.append([counter, news_date, news_url, news_sentiment])

    table.save(pd("nasdaq2.csv"))

    counter += 1

    return news_text
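
# A minimal driver sketch (an assumption, not part of the original example):
# scrape_news_text() relies on a module-level `counter` and appends one row
# per article to nasdaq2.csv; the URLs below are purely illustrative.
counter = 0
for url in ["https://example.com/article-1", "https://example.com/article-2"]:
    text = scrape_news_text(url)
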
    def load_domains(self):
        sources_path = pd('data', 'source_data.csv')
        domain_file = Datasheet.load(sources_path, headers=True)
        for row in domain_file:
            url = row[1]
            cats = row[2:]
            self.cat_dict[url] = cats
def main():
    logging.basicConfig(level=logging.INFO)

    argparser = ArgumentParser(description=__doc__)
    argparser.add_argument("-t",
                           "--trainset",
                           action="store",
                           default=None,
                           help=("Path to training data "
                                 "[default: %(default)s]"))
    argparser.add_argument("-m",
                           "--model",
                           action="store",
                           help="Path to model")
    argparser.add_argument("-d",
                           "--dump",
                           action="store_true",
                           help="Pickle trained model? [default: False]")
    argparser.add_argument("-v",
                           "--verbose",
                           action="store_true",
                           default=False,
                           help="Verbose [default: quiet]")
    argparser.add_argument("-c",
                           "--classify",
                           action="store",
                           default=None,
                           help=("Path to data to classify "
                                 "[default: %(default)s]"))
    argparser.add_argument("-s",
                           "--save",
                           action="store",
                           default='output.csv',
                           help=("Path to output file"
                                 "[default = output.csv]"))
    args = argparser.parse_args()

    clf = SensationalismClassifier(train_data=args.trainset,
                                   model=args.model,
                                   dump=args.dump,
                                   debug=args.verbose)

    if args.classify:
        OUTPUT_PATH = args.save

        if clf.debug:
            tick = time()
        to_classify = Datasheet.load(args.classify)
        classified_data = clf.classify(to_classify)
        output = Datasheet(classified_data)
        output.save(pd(OUTPUT_PATH))

        if clf.debug:
            sys.stderr.write("\nProcessed %d items in %0.2fs" %
                             (len(classified_data), time() - tick))
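
# The usual entry-point guard, plus an illustrative invocation (the script
# name below is a placeholder, not taken from the original):
#
#   python classify.py -m model.pkl -c articles.csv -s output.csv -v
if __name__ == "__main__":
    main()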
Example #4
def parse(path):
    # 1) Parse the Excel sheet at the given path (xlsx()).
    # 2) Map the list of lists to list of dicts (assoc()).
    # 3) If a column contains splitable values (e.g., "1,2,3"),
    # 4) split the values in the column.
    rows = list(assoc(xlsx(pd(path))))  # 1 + 2
    for k in rows[0].keys():
        if splitable(col(rows, k)):  # 3
            for r in rows:
                r[k] = split(r[k])  # 4
    return rows
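
# splitable() and split() are not shown in this fragment. A plausible pair of
# helpers matching the "1,2,3" example above (names kept from the snippet,
# behavior assumed):
def splitable(values):
    # True if every non-empty cell in the column contains a comma.
    return all("," in str(v) for v in values if str(v).strip())

def split(value):
    # "1,2,3" -> ["1", "2", "3"], with surrounding whitespace stripped.
    return [part.strip() for part in str(value).split(",")]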
Example #5
def parse(path):
    # 1) Parse the Excel sheet at the given path (xlsx()).
    # 2) Map the list of lists to list of dicts (assoc()).
    # 3) If a column contains splitable values (e.g., "1,2,3"),
    # 4) split the values in the column.
    rows = list(assoc(xlsx(pd(path)))) # 1 + 2
    for k in rows[0].keys():
        if splitable(col(rows, k)):    # 3
            for r in rows:
                r[k] = split(r[k])     # 4
    return rows
Example #6
    def load_domains(self):
        """loads domain information"""
        sources_path = pd('data', 'source_data.csv')
        domain_file = Datasheet.load(sources_path, headers=True)
        for row in domain_file:
            url = row[1]
            # str.find() returns -1 (truthy) when no quote is present,
            # so test for membership instead.
            if '"' in str(row[-1]):
                cats = row[2:-1]
            else:
                cats = row[2:]
            self.cat_dict[url] = cats
Example #7
def enrolTwitter(thread, location):
    # This function searches for the thread based on the specified location and cleans the results.
    # It also writes the "cleaned" data to a CSV file for analysis.
    # Set the marker (RT) for removal (improves sentiment analysis results).
    remove_list = ['RT']
    i = None  # initializing placeholder value to store tweet IDs (ensures uniqueness)
    for j in range(2):
        # Count controls the number of streams returned at one time.
        # RTs count as unique tweets, as long as the handler ID is unique.
        print "Iteration Number", (j + 1)  # human-readable progress output
        print
        for tweet in twitter.search(thread,
                                    geo=geocode(location)[:2],
                                    start=i,
                                    count=5,
                                    cached=False):
            # adds tweet if its ID doesn't exist previously
            if len(table) == 0 or tweet.id not in index:
                # a series of sentence level filters designed for Twitter handles
                # decomposes the tweet into words, and weeds for items in the remove_list
                dcomp_tweet = (tweet.text).split()
                # removes RTs from the tweet
                recombined_tweet = ' '.join(
                    [k for k in dcomp_tweet if k not in remove_list])
                #removes hashtag related content
                stringwithouthash = re.sub(r'#\w+ ?', '', recombined_tweet)
                #removes http/s related content
                stringwithouthttp = re.sub(r'http\S+', '', stringwithouthash)
                #removes @ related content
                finalstring = re.sub(r'@\w+ ?', '', stringwithouthttp)
                print finalstring
                # calls a function to analyze the string's sentiment value
                polarityVal, subjVal = checkSentiment(finalstring)
                # calls a simple function to analyze the string's certainty
                modalVal = checkModality(finalstring)
                table.append([
                    tweet.id, finalstring, location, polarityVal, subjVal,
                    modalVal, thread
                ])
                index.add(tweet.id)
            # Continues mining for older tweets (varied by j value) in the second iteration
            i = tweet.id

    # Commit saves to the parent directory of the Python file
    print
    print
    print
    print "Harvest complete - saving to:", os.getcwd()
    table.save(pd("analysis.csv"), headers=False)
    print

    print "Total unique entries in table:", len(table)
    print
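
# The word filtering and the three regex passes above could be folded into one
# helper; this sketch (not part of the original) reproduces the same cleaning steps:
def clean_tweet(text, remove_list=('RT',)):
    text = ' '.join(w for w in text.split() if w not in remove_list)  # drop RT markers
    text = re.sub(r'#\w+ ?', '', text)   # drop hashtag content
    text = re.sub(r'http\S+', '', text)  # drop links
    text = re.sub(r'@\w+ ?', '', text)   # drop mentions
    return text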
    def getTweetSecureLoad(self, topic):
        # This example retrieves tweets containing given keywords from Twitter.

        self.search_topic = topic
        print 'CLASS (Twitter_PatternPKG) - Twitter Secure Initial Load - Topic: ' + self.search_topic
        self.search_topic = topic + ' film'
        try: 
            # We'll store tweets in a Datasheet.
            # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
            # In the first column, we'll store a unique id for each tweet.
            # We only want to add the latest tweets, i.e., those we haven't seen yet.
            # With an index on the first column we can quickly check if an id already exists.
            # The pd() function returns the parent directory of this script + any given path.

            table = Datasheet.load(pd(self.FILE_STORAGE))
            # index = set(table.columns[0])
            index = set(table.columns[4])   # on the text
            
        except:
            table = Datasheet()
            index = set()

        engine = Twitter(language="en")

        # With Twitter.search(cached=False), a "live" request is sent to Twitter:
        # we get the most recent results instead of those in the local cache.
        # Keeping a local cache can also be useful (e.g., while testing)
        # because a query is instant when it is executed the second time.
        prev = None

        #searchThisSubjects = search_topic

        # put headers
        table.append(["tweet_id", "tweet_date", "InputSubject", "Tweet_text"])

        #for oneSubject in searchThisSubjects:
        oneSubject = self.search_topic
        # oneSubject

        tweet_list_Json = []  # list of JSons
        tweet_list = []
        try:
            for i in range(1):
                for tweet in engine.search(oneSubject, start=prev, count=8, cached=False):
                    if 'http' in tweet.text:
                        posi = tweet.text.index('http')
                        tweet.text = tweet.text[0:posi-1]
                                
                    # Only add the tweet to the table if it doesn't already exist.
                    if len(table) == 0 or tweet.text not in index:
                        table.append([tweet.id, tweet.date, oneSubject, tweet.text])
                        index.add(tweet.text)
                        
                        tweet_list.append([tweet.id, tweet.date, oneSubject, tweet.text])
                        #tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject, tweet.text)
                        #tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject.replace(' film', ''), tweet.text)
                        tweet.text = filter(lambda x: x in string.printable, tweet.text)  # strip non-printable characters
                        tweet.text = tweet.text.replace('"', '')   # strip double quotes
                        tweet.text = tweet.text.replace('\n', '')  # strip newlines
                        tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject.replace(' film', ''), tweet.text)  # drop the artificial ' film' suffix
                        
                        tweet_list_Json.append(tweetJson)
                        #print tweetJson  
                        
                        # BUILD A JSON
                        #http://stackoverflow.com/questions/14547916/how-can-i-loop-over-entries-in-json
                        #BUILD A LIST OF DICTIONARIES                    
                        #http://stackoverflow.com/questions/2733813/iterating-through-a-json-object
                        
                        
                    # Continue mining older tweets in next iteration.
                    prev = tweet.text
    
        except Exception:
            print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - ERROR!! - ([twitter_patternPkg_connector] getTweetSecureLoad)'
            print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - ERROR!!   (film: ' + oneSubject +')' 
            pass
        
        # Create a .csv in pattern/examples/01-web/
        # table.save(pd("OD_CK1_Source4_Tweeter_InitialLoad.csv"))
        print "CLASS (Twitter_PatternPKG) - Total Secure Twitter Load: " +  str(len(table)) + '\n'
        #print json.dumps(tweet_list)
        
        # return tweet_list
        return tweet_list_Json
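
    # Illustrative usage (the enclosing class is not shown in this fragment,
    # so the name below is assumed from the log messages above):
    #
    #   loader = Twitter_PatternPKG()
    #   for tweet_json in loader.getTweetSecureLoad("Dunkirk"):
    #       print tweet_json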
# coding: utf-8

from pattern.web import Twitter
from pattern.db import Database, SQLITE
from pattern.db import pd
from pattern.db import field, pk, INTEGER, UNIQUE, STRING
from sqlite3 import IntegrityError

team = ['#galo', '#Galo', '#Atletico-MG', '#atletico mineiro']

twitter = Twitter()
db = Database(pd('tweets.db'))

if not "tweets" in db:	
	db.create("tweets", fields = (pk(), field('code', INTEGER, UNIQUE), field('text', STRING(140))))

#query in Twitter
for hashtag in team:
	for tweet in twitter.search(hashtag):
		try:
			db.tweets.append(code = tweet.id, text = tweet.text)
		except IntegrityError:
			pass

#Separate tweets in database
for data in db.tweets.filter():
	print data[2]
	print '-'*30

Example #10
from pattern.web import URL
from pattern.web import DOM
from pattern.web import Newsfeed
from pattern.web import plaintext

from pattern.db import Datasheet
from pattern.db import pd

feeds = {
    'boorish': 'http://feeds.feedburner.com/daily-star-Real-Life',
    'dramatic': 'http://feeds.feedburner.com/daily-star-Latest-News',
    'geeky': 'http://feeds.feedburner.com/daily-star-Tech',
    'dubious': 'http://feeds.feedburner.com/daily-star-Weird-News',
    'vulgar': 'http://feeds.feedburner.com/daily-star-Love-Sex',
}

PATH = pd('..', 'data', 'news2.csv')  # pd = parent directory of this script

try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[0])
except:
    csv = Datasheet()
    seen = set()

for genre, url in feeds.items():
    for r in Newsfeed().search(url, cached=False):
        if r.url not in seen:
            print r.title
            print
            try:
                src = URL(r.url).download(cached=True)
Example #11
from pattern.en import sentiment  # sentiment library: polarity (positive/negative) and subjectivity (how much of an opinion it is)
from pattern.en import modality  # modality: degree of certainty between fact and opinion
from pattern.web import Twitter, cache
from pattern.db import Datasheet, pprint, pd, SUM, AVG, STDEV, INTEGER, STRING
import re  # the regex library for processing sentences
import datetime  # library for system datetime
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
import time  #for sleep function
# brand new - patch 1.0b - matplot display
import numpy as np
import matplotlib.pyplot as plt

# Open a CSV (comma-separated values) file for identifying and storing unique tweets.
try:
    table = Datasheet.load(pd("analysis.csv"), headers=False)
    index = set(table.columns[0])
# If the file doesn't exist yet, start a new Datasheet with typed columns.
except:
    table = Datasheet(fields=[
        ("id", INTEGER),
        ("content", STRING),
        ("location", STRING),
        ("polarity", STRING),
        ("subjectivity", STRING),
        ("modality", STRING)])
    index = set()

# Purged consumer keys and GPG-related content
# Declare Twitter object, to search for precise stream-based information
twitter = Twitter(license=None, throttle=0.5, language='en')
# The script produces "good-evil.csv", a dataset of 18,000+ tweets
# of which we know that people are discussing a good or an evil character.
# We can use it as training material to create a classifier that predicts
# good or evil for tweets that mention unknown characters.

# First we prepare the training data:

import re

URL = re.compile(r"https?://[^\s]+")           # http://www.emrg.be
REF = re.compile(r"@[a-z0-9_./]+", flags=re.I) # @tom_de_smedt

from pattern.db import Datasheet, pd

train = []
for name, alignment, tweet in Datasheet.load(pd("good-evil.csv")):
    tweet = URL.sub("http://", tweet) # Anonymize URLs.
    tweet = REF.sub("@friend", tweet) # Anonymize usernames.
    train.append((ngram_vector(tweet, 5), alignment))
    
# ------------------------------------------------------------------------------------

# Let's look at the statistical accuracy of the classifier:
print kfoldcv(SVM, train, folds=3)
print

# This returns an (accuracy, precision, recall, F1-score, stdev)-tuple.
# The F1-score is the most important.
# An SVM trained on our data would be 94.6% accurate in knowing good from evil
# (this is a suspiciously high accuracy).
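
# A small sketch of unpacking that tuple (illustrative, not part of the original):
A, P, R, F, stdev = kfoldcv(SVM, train, folds=3)
print "F1-score: %.3f (+/- %.3f)" % (F, stdev)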
Example #13
# The script produces "good-evil.csv", a dataset of 18,000+ tweets
# of which we know that people are discussing a good or an evil character.
# We can use it as training material to create a classifier that predicts
# good or evil for tweets that mention unknown characters.

# First we prepare the training data:

import re

URL = re.compile(r"https?://[^\s]+")  # http://www.emrg.be
REF = re.compile(r"@[a-z0-9_./]+", flags=re.I)  # @tom_de_smedt

from pattern.db import Datasheet, pd

train = []
for name, alignment, tweet in Datasheet.load(pd("good-evil.csv")):
    tweet = URL.sub("http://", tweet)  # Anonymize URLs.
    tweet = REF.sub("@friend", tweet)  # Anonymize usernames.
    train.append((ngram_vector(tweet, 5), alignment))

# ------------------------------------------------------------------------------------

# Let's look at the statistical accuracy of the classifier:
print kfoldcv(SVM, train, folds=3)
print

# This returns an (accuracy, precision, recall, F1-score, stdev)-tuple.
# The F1-score is the most important.
# An SVM trained on our data would be 94.6% accurate in knowing good from evil
# (this is a suspiciously high accuracy).
Example #14
from pattern.db import Database, SQLITE
from pattern.db import field, pk, STRING, INTEGER, DATE, NOW
from pattern.db import assoc
from pattern.db import rel
from pattern.db import pd # pd() = parent directory of current script.

# In this example, we'll build a mini-store:
# with products, customers and orders.
# We can combine the data from the three tables in an invoice query.

# Create a new database. 
# Once it is created, you can use Database(name) to access it.
# SQLite will create the database file in the current folder.
# MySQL databases require a username and a password.
# MySQL also requires that you install MySQLdb, see the installation instructions at:
# http://www.clips.ua.ac.be/pages/pattern-db
db = Database(pd("store.db"), type=SQLITE)
#db._delete()

# PRODUCTS
# Create the products table if it doesn't exist yet.
# An error will be raised if the table already exists.
# Add sample data.
if not "products" in db:
    # Note: in SQLite, the STRING type is mapped to TEXT (unlimited length).
    # In MySQL, the length matters. Smaller fields have faster lookup.
    schema = (
        pk(), # Auto-incremental id.
        field("description", STRING(50)),
        field("price", INTEGER)    
    )
    db.create("products", schema)
import sys, termios, tty, os, time
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.db import Datasheet, pprint, pd
from pattern.en import sentiment, polarity, subjectivity, positive

try:
    # We'll store tweets in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each tweet.
    # We only want to add the latest tweets, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    # The pd() function returns the parent directory of this script + any given path.
    table = Datasheet.load(pd("tweets.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

prev = '1071765537749917696'

counter = 0

while counter < 1000:

    counter += 1
    time.sleep(60)
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.db import Datasheet, pprint, pd
import random
# This example retrieves tweets containing given keywords from Twitter.

try:
    # We'll store tweets in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each tweet.
    # We only want to add the latest tweets, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    # The pd() function returns the parent directory of this script + any given path.
    table = Datasheet.load(pd("eulogy.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.

search_term = 'beat'
prev = None
for i in range(2):
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.db import Datasheet, pprint, pd

# This example retrieves tweets containing given keywords from Twitter.

try:
    # We'll store tweets in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each tweet.
    # We only want to add the latest tweets, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    # The pd() function returns the parent directory of this script + any given path.
    table = Datasheet.load(pd("black.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("#blacklivesmatter",
Example #18
def parse_rows(path):
    rows = list(assoc(xlsx(pd(path)))) # 1 + 2
    #rows = xlsx(pd(path)) # 1 + 2
    return rows
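
# Illustrative usage (the file name is a placeholder, not from the original):
rows = parse_rows("data.xlsx")
print(rows[0])  # the first row as a {column: value} dict, courtesy of assoc()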
Example #19
from pattern.db import Database, SQLITE
from pattern.db import field, pk, STRING, INTEGER, DATE, NOW
from pattern.db import assoc
from pattern.db import rel
from pattern.db import pd  # pd() = parent directory of current script.

# In this example, we'll build a mini-store:
# with products, customers and orders.
# We can combine the data from the three tables in an invoice query.

# Create a new database.
# Once it is created, you can use Database(name) to access it.
# SQLite will create the database file in the current folder.
# MySQL databases require a username and a password.
# MySQL also requires that you install MySQLdb, see the installation instructions at:
# http://www.clips.ua.ac.be/pages/pattern-db
db = Database(pd("store.db"), type=SQLITE)
# db._delete()

# PRODUCTS
# Create the products table if it doesn't exist yet.
# An error will be raised if the table already exists.
# Add sample data.
if not "products" in db:
    # Note: in SQLite, the STRING type is mapped to TEXT (unlimited length).
    # In MySQL, the length matters. Smaller fields have faster lookup.
    schema = (
        pk(),  # Auto-incremental id.
        field("description", STRING(50)),
        field("price", INTEGER))
    db.create("products", schema)
    db.products.append(description="pizza", price=15)
Example #20
def parse_rows(path):
    rows = list(assoc(xlsx(pd(path))))  # 1 + 2
    #rows = xlsx(pd(path)) # 1 + 2
    return rows
Example #21
# Put the file "SentiWordNet*.txt" in pattern/en/wordnet/
# You can then use Synset.weight() and wordnet.sentiwordnet:

#from pattern.en import wordnet, ADJECTIVE
#print wordnet.synsets("horrible", pos=ADJECTIVE)[0].weight # Yields a (polarity, subjectivity)-tuple.
#print wordnet.sentiwordnet["horrible"]

# For fine-grained analysis, 
# the return value of sentiment() has a special "assessments" property.
# Each assessment is a (chunk, polarity, subjectivity, label)-tuple,
# where chunk is a list of words (e.g., "not very good").

# The label offers additional meta-information.
# For example, its value is MOOD for emoticons:
try:
    table = Datasheet.load(pd("../../singleLife.csv"))
    index = set(table.columns[0])
except Exception as e:
    print e
    sys.exit()

for i in range(len(table)):
    text = table[i][1]
    sent = sentiment(text)
    
    print sent[0], sent[1], text
    table[i].append(sent[0])
    table[i].append(sent[1])
    

table.save(pd("cool.csv"))
w = Wiktionary(language="en")
f = csv()  # csv() is a short alias for Datasheet().

# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.
# The pd() function returns the parent directory of the current script,
# so pd("given-names.csv") = pattern/examples/01-web/given-names.csv.

for gender in ("male", "female"):
    for ch in ("abcdefghijklmnopqrstuvwxyz"):
        p = w.search("Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize()), cached=True)
        for name in p.links:
            if not name.startswith("Appendix:"):
                f.append((name, gender[0]))
        f.save(pd("given-names.csv"))
        print(ch, gender)

# Create a classifier that predicts gender based on name.

from pattern.vector import SVM, chngrams, count, kfoldcv

class GenderByName(SVM):

    def train(self, name, gender=None):
        SVM.train(self, self.vector(name), gender)

    def classify(self, name):
        return SVM.classify(self, self.vector(name))

    def vector(self, name): 
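        # The body of vector() is cut off in this fragment. A plausible
        # completion (an assumption, not the original code), using the
        # chngrams/count imports above: character n-gram counts of the name.
        return count(chngrams(name, n=3))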
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.db import Datasheet, pprint, pd

# This example retrieves tweets containing given keywords from Twitter.

try:
    # We'll store tweets in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each tweet.
    # We only want to add the latest tweets, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    # The pd() function returns the parent directory of this script + any given path.
    table = Datasheet.load(pd("eulogy.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("eulogy", start=prev, count=25, cached=False):
Example #24
from pattern.web import Twitter

# The pattern.db module has tools to work with data:
# SQLite and MySQL databases, .csv files, date parsers, ...
# The easiest way to store structured data is as a CSV
# ("comma-separated values"), a plain text file where
# each new line is a new row of data, and where columns
# are separated by ",".
# http://www.clips.ua.ac.be/pages/pattern-db#datasheet
from pattern.db import Datasheet
from pattern.db import pd

# The pd() function means:
# "there is a file 'tweets.csv' in the same folder as this script".
PATH = pd("tweets.csv")
#print PATH

try:
    # If a .csv file already exists, open that one and append new data to it.
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[0])
except:
    # If a .csv file doesn't exist yet, create a new one.
    csv = Datasheet()
    seen = set()

# The "seen" variable is a set (= list of unique values)
# that contains the values in the first column in the CSV.
# In other words, it contains the id's of the tweets.
# We can use it to check if we have already seen a tweet,
# so that we only add new ones.
    prev = None

    print "processing word:",word

    for tweet in twitter.search(word, start=prev, cached=False, count=200):

        # print
        #
        # print tweet.text
        # print tweet.author
        # print tweet.date
        # print hashtags(tweet.text)
        #
        # print
        clean_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", tweet.text).split())

        if tweet.id not in index and clean_text not in texts:
            table.append([tweet.id, tweet.text, clean_text, hashtags(tweet.text)])
            index.add(tweet.id)
            texts.add(clean_text)

        prev = tweet.id
#
table.save(pd("tweets_threats.csv"))


# pprint(table,truncate=100)


Example #26
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.db import Datasheet, pprint, pd

# This example retrieves tweets containing given keywords from Twitter.

try:
    # We'll store tweets in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each tweet.
    # We only want to add the latest tweets, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    # The pd() function returns the parent directory of this script + any
    # given path.
    table = Datasheet.load(pd("cool.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("is cooler than", start=prev, count=25, cached=False):
Example #27
    #(1, 'Radio Centraal') : 'https://redactie.radiocentraal.be/Home/feed/',
    #(1, 'Trouw') : 'https://www.trouw.nl/home/rss.xml',
    #('links', 'Marxisme.be') : 'https://nl.marxisme.be/marxisme-vandaag/feed/',
    #(1, 'Uitpers') : 'http://www.uitpers.be/feed/',
    #(1, 'Krapuul') : 'http://www.krapuul.nl/feed/',
    (-1, 'sceptr.net'):
    'https://sceptr.net/feed/',
    (-1, 're-act.be'):
    'http://www.krapuul.nl/feed/',
    (-1, 'eunmask.wordpress.com'):
    'https://eunmask.wordpress.com/feed/',
    (-1, 'ejbron.wordpress.com'):
    'https://ejbron.wordpress.com/feed/'
}

PATH = pd('news.csv')

try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[-2])  # use url as id
except:
    csv = Datasheet()
    seen = set()

for (label, name), url in sources.items():
    try:
        f = Newsfeed()
        f = f.search(url, cached=False)
    except:
        continue
f = csv()  # csv() is a short alias for Datasheet().

# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.
# The pd() function returns the parent directory of the current script,
# so pd("given-names.csv") = pattern/examples/01-web/given-names.csv.

for gender in ("male", "female"):
    for ch in ("abcdefghijklmnopqrstuvwxyz"):
        p = w.search("Appendix:%s_given_names/%s" %
                     (gender.capitalize(), ch.capitalize()),
                     cached=True)
        for name in p.links:
            if not name.startswith("Appendix:"):
                f.append((name, gender[0]))
        f.save(pd("given-names.csv"))
        print(ch, gender)

# Create a classifier that predicts gender based on name.

from pattern.vector import SVM, chngrams, count, kfoldcv


class GenderByName(SVM):
    def train(self, name, gender=None):
        SVM.train(self, self.vector(name), gender)

    def classify(self, name):
        return SVM.classify(self, self.vector(name))

    def vector(self, name):
Example #29
# Put the file "SentiWordNet*.txt" in pattern/en/wordnet/
# You can then use Synset.weight() and wordnet.sentiwordnet:

#from pattern.en import wordnet, ADJECTIVE
#print wordnet.synsets("horrible", pos=ADJECTIVE)[0].weight # Yields a (polarity, subjectivity)-tuple.
#print wordnet.sentiwordnet["horrible"]

# For fine-grained analysis,
# the return value of sentiment() has a special "assessments" property.
# Each assessment is a (chunk, polarity, subjectivity, label)-tuple,
# where chunk is a list of words (e.g., "not very good").

# The label offers additional meta-information.
# For example, its value is MOOD for emoticons:
try:
    table = Datasheet.load(pd("../../singleLife.csv"))
    index = set(table.columns[0])
except Exception as e:
    print e
    sys.exit()

for i in range(len(table)):
    text = table[i][1]
    sent = sentiment(text)

    print sent[0], sent[1], text
    table[i].append(sent[0])
    table[i].append(sent[1])

table.save(pd("cool.csv"))
    a = set() # set ~= list of unique values
    t = parsetree(s)
    for sentence in t:
        for word in sentence.words:
            if word.tag and word.tag == "JJ":
                a.add(word.string.lower())
    return list(sorted(a))

#print adjectives("I'm melting! Meeelting! What a wicked and cruel world!")
    
# ------------------------------------------------------------------------------------
# See tweets.py

csv = Datasheet()

PATH = pd("properties.csv")

try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[0])
except:
    csv = Datasheet()
    seen = set()

twitter = Twitter(language="en", license=None)

for name in celebrities():
    id = None
    for tweet in twitter.search(name, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
Example #31
    t = parsetree(s)
    for sentence in t:
        for word in sentence.words:
            if word.tag and word.tag == "JJ":
                a.add(word.string.lower())
    return list(sorted(a))


#print adjectives("I'm melting! Meeelting! What a wicked and cruel world!")

# ------------------------------------------------------------------------------------
# See tweets.py

csv = Datasheet()

PATH = pd("properties.csv")

try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[0])
except:
    csv = Datasheet()
    seen = set()

twitter = Twitter(language="en", license=None)

for name in celebrities():
    id = None
    for tweet in twitter.search(name, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
Example #32
    'http://rssfeeds.usatoday.com/usatoday-NewsTopStories',
    (0, '', 'real', 'Financial Times'):
    'http://www.ft.com/rss/world',
    (0, '', 'real', 'Associated Press'):
    'http://hosted2.ap.org/atom/APDEFAULT/3d281c11a96b4ad082fe88aa0db04305',
    (0, '', 'real', 'The Diplomat'):
    'http://thediplomat.com/feed/',
    (0, '', 'real', 'United Press International'):
    'http://rss.upi.com/news/news.rss',
    (0, '', 'joke', 'The Onion'):
    'http://www.theonion.com/feeds/rss',
    (4, 'right', 'joke', 'National Report'):
    'http://feeds.feedburner.com/NationalReport',
}

PATH = pd('..', 'data', 'news1.csv')

try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[-2])  # use url as id
except:
    csv = Datasheet()
    seen = set()

for (level, bias, label, name), url in sources.items():
    try:
        f = Newsfeed()
        f = f.search(url, cached=False)
    except:
        continue
Example #33
# in which each item is a list of column values.
# For example:
# [["trope1", "movie1, movie2, ...", "description"],
#  ["trope2", "movie3, movie3, ...", "description"]
# ]

# The pd() function means "parent directory".
# It points to the folder that contains the script you are looking at.
# So, if you have a "data.csv" file in the same folder as this script,
# you can reference it from this script with pd("data.csv").

tropes = {}  # {trope1: [movie1, movie2, ...], ...}
movies = {}  # {movie1: [trope1, trope2, ...], ...}

# Read each row in the .csv file.
for trope, examples, description in Datasheet.load(pd("tropes.csv")):
    # The examples of movies that use this trope are separated by a newline (\n).
    # Split the string into a list:
    examples = examples.split("\n")
    # Add each new trope to the tropes dictionary.
    if not trope in tropes:
        tropes[trope] = set()  # set() is like a list, but never contains duplicates.
    # Add each new movie to the movies dictionary.
    for movie in examples:
        if not movie in movies:
            movies[movie] = set()
        movies[movie].add(trope)
        tropes[trope].add(movie)

print len(tropes), "tropes"
# This requires a personal license key.
# If you are logged in to Facebook, you can get a license key here:
# http://www.clips.ua.ac.be/pattern-facebook
# (We don't / can't store your information).

# 1) Searching for public status updates.
#    Search for all status updates that contain the word "horrible".

try:
    # We'll store the status updates in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each status update.
    # We only want to add new status updates, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already
    # exists.
    table = Datasheet.load(pd("opinions.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

fb = Facebook()

# With Facebook.search(cached=False), a "live" request is sent to Facebook:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
for status in fb.search("horrible", count=25, cached=False):
    print("=" * 100)
    print(status.id)
    print(status.text.encode("utf-8"))
# ------------------------------------------------------------------------------------

# This example demonstrates a semantic network of common sense.
# A semantic network is a graph where nodes represent concepts
# and edges (= connections between nodes) represent semantic
# relations (e.g., "is-a", "is-part-of", "is-property-of", ...)

# The data was collected manually and consists of about 10,000
# triples (concept1 -> relation -> concept2).
# The visual tool for adding new triples is online at:
# http://nodebox.net/perception

# The data is bundled in Pattern as a .csv file.

from pattern.graph import Graph
from pattern.graph import MODULE  # path to pattern/graph/commonsense.csv
from pattern.db import Datasheet, pd
data = pd(MODULE, "commonsense.csv")
data = Datasheet.load(data)

# Create the graph:

g = Graph()
for concept1, relation, concept2, context, weight in data:
    g.add_node(concept1)
    g.add_node(concept2)
    g.add_edge(concept1, concept2, type=relation, weight=min(int(weight) * 0.1, 1.0))

# ------------------------------------------------------------------------------------

# The halo of a node is a semantic representation of a concept.
# The halo is made up of other concepts directly or indirectly related to this concept,
# defining it.
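
# A sketch of retrieving a halo (an assumption: pattern.graph nodes expose
# flatten(), which returns the node plus its neighbors up to a given depth;
# the concept name is illustrative):
#
#   halo = g["rocket"].flatten(depth=2)
#   print [n.id for n in halo]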