Beispiel #1
0
from pattern.web   import Twitter, Google, plaintext
from pattern.table import Table

# Collect tweets mentioning French film titles in a Table:
# one row per tweet = [title, category, date, plain text].
t = Table()
for nomme, categorie in (("l'arnacoeur", "film"), ("le nom des gens", "film"), ("the ghost writer", "film"), ("tournée", "film"), ("des hommes et des dieux", "film"), ("gainsbourg, vie héroique", "film"), ("mammuth", "film")):
    for tweet in Twitter().search(nomme):
        s = plaintext(tweet.description)  # Strip HTML from the tweet text.
        # FIX: the original appended the undefined name "film" (NameError);
        # the loop variable holding the category is "categorie".
        t.append([nomme, categorie, tweet.date, s])
Beispiel #2
0
from pattern.table import Table
from pattern.table import uid, pprint

# The main purpose of the pattern module is to facilitate automated processes
# for (text) data acquisition and (linguistical) data mining.
# Often, this involves a tangle of messy text files and custom formats to store the data.
# The Table class offers a useful datasheet (cfr. MS Excel) in Python code.
# It can be saved as a CSV text file that is both human/machine readable.
# See also: examples/01-web/03-twitter.py
# Supported values that are imported and exported correctly:
# str, unicode, int, float, bool, None
# For other data types, custom encoder and decoder functions can be used.

t = Table(rows=[
    [uid(), "broccoli",  "vegetable"],
    [uid(), "turnip",    "vegetable"],
    [uid(), "asparagus", "vegetable"],
    [uid(), "banana",    "fruit"    ],
])

print t.rows[0]    # A list of rows.
print t.columns[1] # A list of columns, where each column is a list of values.
print

# Columns can be manipulated directly like any other Python list.
# This can be slow for large tables. If you need a fast way to do matrix math,
# use numpy (http://numpy.scipy.org/) instead. 
# The purpose of Table is data storage.
t.columns.append([
    "green",
    "purple",
    "white",
Beispiel #3
0
import os, sys

sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.table import Table
from pattern.table import uid, pprint, COUNT, FIRST

# This example demonstrates how table values can be grouped.

t = Table(rows=[
    #   0-ID    1-NAME       2-TYPE       3-COLOR
    [uid(), "broccoli", "vegetable", "green"],
    [uid(), "turnip", "vegetable", "purple"],
    [uid(), "asparagus", "vegetable", "white"],
    [uid(), "banana", "fruit", "yellow"],
    [uid(), "orange", "fruit", "orange"]
])

g = t.copy(columns=[2, 0])  # A copy with only the type and id columns.
g = g.group(0, COUNT)  # Group by type, count rows per type.
# Group functions: FIRST, LAST, COUNT, MAX, MIN, SUM, AVG, STDEV.
pprint(g)
print

# This will group by type and concatenate all names per type.
# FIX: the lambda parameter was named "list", shadowing the builtin.
g = t.copy(columns=[2, 1])
g = g.group(0, function=lambda names: "/".join(names))

pprint(g)
print
import os, sys

sys.path.append(os.path.join("..", "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.table import Table, pprint

# This example retrieves tweets containing given keywords from Twitter (http://twitter.com).

try:
    # We store tweets in a Table that can be saved as a text file.
    # In the first column, we'll store a unique ID for each tweet.
    # We only want to add the latest tweets, i.e. those we haven't previously encountered.
    # With an index() on the first column we can quickly check if an ID already exists.
    # The index becomes important once more and more rows are added to the table (speed).
    table = Table.load("cool.txt")
    index = table.index(table.columns[0])
except:
    table = Table()
    index = {}

engine = Twitter()

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description)  # Keywords in tweets start with a #.
    print
Beispiel #5
0
import os
import sys

sys.path.append(os.path.join("..", "..", ".."))

from pattern.table import Table
from pattern.table import uid, pprint, COUNT, FIRST

# Demonstrates how rows of a Table can be grouped by one column's values.

t = Table(rows=[
    # id      name         type         color
    [uid(), "broccoli",  "vegetable", "green" ],
    [uid(), "turnip",    "vegetable", "purple"],
    [uid(), "asparagus", "vegetable", "white" ],
    [uid(), "banana",    "fruit",     "yellow"],
    [uid(), "orange",    "fruit",     "orange"]
])

# Keep only the TYPE and ID columns, then count how many rows share each type.
# Available aggregates: FIRST, LAST, COUNT, MAX, MIN, SUM, AVG, STDEV.
g = t.copy(columns=[2, 0])
g = g.group(0, COUNT)
pprint(g)
print

# Keep only the TYPE and NAME columns, then join all names per type with "/".
g = t.copy(columns=[2, 1])
g = g.group(0, function=lambda names: "/".join(names))
pprint(g)
print

# A follow-up step could group by type, count the ids per type and concatenate
# all names per type, giving each column its own grouping function.
Beispiel #6
0
from pattern.web    import Yahoo, plaintext
from pattern.en     import Sentence, parse
from pattern.search import Pattern
from pattern.table  import Table, pprint

# "X IS MORE IMPORTANT THAN Y"
# Here is a rough example of how to build a web miner.
# It mines comparative statements from Yahoo! and stores the results in a table,
# which can be saved as a text file for further processing later on.

# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
#from MBSP import Sentence, parse

# FIX: Yahoo, plaintext, Sentence and parse were used below without being
# imported (NameError at runtime). NOTE(review): Sentence/parse are assumed
# to come from pattern.en -- confirm against the installed pattern version.

q = '"more important than"'          # Yahoo search query
p = "NP (VP) more important than NP" # Search pattern.
p = Pattern.fromstring(p)
t = Table()

engine = Yahoo(license=None)
for i in range(1): # max=10
    for result in engine.search(q, start=i+1, count=100, cached=True):
        s = result.description
        s = plaintext(s)       # Strip HTML.
        s = Sentence(parse(s)) # Parse into a tagged Sentence for matching.
        for m in p.search(s):
            a = m.constituents(constraint=0)[-1] # Left NP.
            b = m.constituents(constraint=5)[ 0] # Right NP.
            t.append((
                a.string.lower(), 
                b.string.lower()))

pprint(t)
Beispiel #7
0
from pattern.web import Yahoo, plaintext
from pattern.en import Sentence, parse
from pattern.search import Pattern
from pattern.table import Table, pprint

# "X IS MORE IMPORTANT THAN Y"
# Here is a rough example of how to build a web miner.
# It mines comparative statements from Yahoo! and stores the results in a table,
# which can be saved as a text file for further processing later on.

# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
#from MBSP import Sentence, parse

# FIX: Yahoo, plaintext, Sentence and parse were used below without being
# imported (NameError at runtime). NOTE(review): Sentence/parse are assumed
# to come from pattern.en -- confirm against the installed pattern version.

q = '"more important than"'  # Yahoo search query
p = "NP (VP) more important than NP"  # Search pattern.
p = Pattern.fromstring(p)
t = Table()

engine = Yahoo(license=None)
for i in range(1):  # max=10
    for result in engine.search(q, start=i + 1, count=100, cached=True):
        s = result.description
        s = plaintext(s)  # Strip HTML.
        s = Sentence(parse(s))  # Parse into a tagged Sentence for matching.
        for m in p.search(s):
            a = m.constituents(constraint=0)[-1]  # Left NP.
            b = m.constituents(constraint=5)[0]  # Right NP.
            t.append((a.string.lower(), b.string.lower()))

pprint(t)

print
Beispiel #8
0
from pattern.web import Twitter, Google, plaintext
from pattern.table import Table

# Collect tweets mentioning French film titles in a Table:
# one row per tweet = [title, category, date, plain text].
t = Table()
for nomme, categorie in (("l'arnacoeur", "film"), ("le nom des gens", "film"),
                         ("the ghost writer", "film"), ("tournée", "film"),
                         ("des hommes et des dieux",
                          "film"), ("gainsbourg, vie héroique",
                                    "film"), ("mammuth", "film")):
    for tweet in Twitter().search(nomme):
        s = plaintext(tweet.description)  # Strip HTML from the tweet text.
        # FIX: the original appended the undefined name "film" (NameError);
        # the loop variable holding the category is "categorie".
        t.append([nomme, categorie, tweet.date, s])
Beispiel #9
0
from pattern.web   import Twitter, Google, plaintext
from pattern.table import Table

# Gather Dutch/French tweets about politicians, translate them to English,
# and store [politician, party, date, text] rows in a Table.
t = Table()
for politician, party in (("nicolas sarkozy", "ump"), ("dsk", "ps")):
    for tweet in Twitter().search(politician):
        if tweet.language not in ("nl", "fr"):
            continue
        text = plaintext(tweet.description)                   # Strip HTML.
        text = Google().translate(text, tweet.language, "en")
#       w = sum([sentiment_score(word) for word in text.split(" ")])
        t.append([politician, party, tweet.date, text])
Beispiel #10
0
import os, sys
sys.path.append(os.path.join("..", "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.table import Table, pprint

# This example retrieves tweets containing given keywords from Twitter (http://twitter.com).

try:
    # We store tweets in a Table that can be saved as a text file.
    # In the first column, we'll store a unique ID for each tweet.
    # We only want to add the latest tweets, i.e. those we haven't previously encountered.
    # With an index() on the first column we can quickly check if an ID already exists.
    # The index becomes important once more and more rows are added to the table (speed).
    table = Table.load("cool.txt")
    index = table.index(table.columns[0])
except:
    table = Table()
    index = {}

engine = Twitter()

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description)  # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
Beispiel #11
0
from pattern.table import Table
from pattern.table import uid, pprint

# The main purpose of the pattern module is to facilitate automated processes
# for (text) data acquisition and (linguistical) data mining.
# Often, this involves a tangle of messy text files and custom formats to store the data.
# The Table class offers a useful datasheet (cfr. MS Excel) in Python code.
# It can be saved as a CSV text file that is both human/machine readable.
# See also: examples/01-web/03-twitter.py
# Supported values that are imported and exported correctly:
# str, unicode, int, float, bool, None
# For other data types, custom encoder and decoder functions can be used.

t = Table(rows=[
    [uid(), "broccoli", "vegetable"],
    [uid(), "turnip", "vegetable"],
    [uid(), "asparagus", "vegetable"],
    [uid(), "banana", "fruit"],
])

print t.rows[0]  # A list of rows.
print t.columns[1]  # A list of columns, where each column is a list of values.
print

# Columns can be manipulated directly like any other Python list.
# This can be slow for large tables. If you need a fast way to do matrix math,
# use numpy (http://numpy.scipy.org/) instead.
# The purpose of Table is data storage.
t.columns.append(["green", "purple", "white", "yellow"])

# Save as a comma-separated (unicode) text file.
t.save("food.txt")