def main():
    logging.basicConfig(level=logging.INFO)

    argparser = ArgumentParser(description=__doc__)
    argparser.add_argument("-t",
                           "--trainset",
                           action="store",
                           default=None,
                           help=("Path to training data "
                                 "[default: %(default)s]"))
    argparser.add_argument("-m",
                           "--model",
                           action="store",
                           help="Path to model")
    argparser.add_argument("-d",
                           "--dump",
                           action="store_true",
                           help="Pickle trained model? [default: False]")
    argparser.add_argument("-v",
                           "--verbose",
                           action="store_true",
                           default=False,
                           help="Verbose [default: quiet]")
    argparser.add_argument("-c",
                           "--classify",
                           action="store",
                           default=None,
                           help=("Path to data to classify "
                                 "[default: %(default)s]"))
    argparser.add_argument("-s",
                           "--save",
                           action="store",
                           default='output.csv',
                           help=("Path to output file"
                                 "[default = output.csv]"))
    args = argparser.parse_args()

    clf = SensationalismClassifier(train_data=args.trainset,
                                   model=args.model,
                                   dump=args.dump,
                                   debug=args.verbose)

    if args.classify:
        OUTPUT_PATH = args.save

        if clf.debug:
            tick = time()
        to_classify = Datasheet.load(args.classify)
        classified_data = clf.classify(to_classify)
        output = Datasheet(classified_data)
        output.save(pd(OUTPUT_PATH))

        if clf.debug:
            sys.stderr.write("\nProcessed %d items in %0.2fs" %
                             (len(classified_data), time() - tick))
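
A minimal sketch of using the classifier programmatically rather than via the command line; the file paths are placeholders, and only the constructor arguments and classify() call shown above are assumed:

# Hypothetical paths; assumes the same imports as the script above.
clf = SensationalismClassifier(train_data=None, model="model.pkl", dump=False, debug=True)
rows = Datasheet.load("articles.csv")
labelled = clf.classify(rows)
Datasheet(labelled).save(pd("results.csv"))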
Example #2
def scrape_news_text(news_url):

    global counter

    news_html = requests.get(news_url).content

    #    print(news_html)
    # Convert the HTML to a BeautifulSoup object.
    news_soup = BeautifulSoup(news_html, 'lxml')
    # soup.find("div", {"id": "articlebody"})
    #    paragraphs = [par.text for par in news_soup.find_all('p')]
    #    news_text = '\n'.join(paragraphs)

    #    print(news_soup.find("div", {"id": "articleText"}))

    date_object = news_soup.find(itemprop="datePublished")
    news_object = news_soup.find("div", {"id": "articleText"})

    # Return whitespace placeholders when the page has no date or no article body.
    if date_object is None:
        return "  "

    if news_object is None:
        return "   "

    news_date = date_object.get_text()
    news_text = news_object.text

    #    print(news_date)
    #    print(news_text)
    print(news_url)

    try:
        # We store the scraped articles in a Datasheet.
        # A Datasheet is a table of rows and columns that can be exported as a CSV file.
        # Each row holds a running counter, the article date, its URL and its sentiment.
        # The pd() function returns the parent directory of this script + any given path.
        table = Datasheet.load(pd("nasdaq2.csv"))
    except:
        table = Datasheet()

    news_sentiment = sentiment(news_text)

    print(news_sentiment)

    table.append([counter, news_date, news_url, news_sentiment])

    table.save(pd("nasdaq2.csv"))

    counter += 1

    return news_text
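
A minimal driver sketch for scrape_news_text(); the URL list is a placeholder, and the imports the function relies on (requests, BeautifulSoup, pattern.db, pattern.en) are assumed to be in place:

counter = 0  # scrape_news_text() increments this global once per article.

urls = [
    "https://www.nasdaq.com/article/example-1",  # placeholder URLs
    "https://www.nasdaq.com/article/example-2",
]

for url in urls:
    text = scrape_news_text(url)  # Also appends a row to nasdaq2.csv as a side effect.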
Example #3
def main():
    table = Datasheet()

    for cap in CAPS:
        url = URL("http://www.comuni-italiani.it/001/272/farmacie/cap%s.html" % cap)
        connection = url.open()
        doc = Document(connection.read())
        items = doc.by_tag("table")
        row = []
        for j, td in enumerate(items[5].by_tag('td')):
            strcap = "%s, Telefono:" % cap
            save = "%s" % (plaintext(td.content)
                           .replace('\n', ',', 3)
                           .replace("Telefono:", strcap)
                           .replace(";", "")
                           .replace("Partita Iva", ",Partita Iva")) + "\n"
            if save is not None:
                row.append(save)
        table.append(row)
        print "%s ----------------------------------------------------------------------------" % str(j)

    table.save("files/farmacie_torino.txt")
Example #4
def main():
    table = Datasheet()

    url = URL("http://www.comuniecitta.it/torino/elenco-ospedali-di-torino.html")
    connection = url.open()
    doc = Document(connection.read())
    items = doc.by_class('ulamm')[1:]
    row = []
    for ul in items:
        li = ul.by_tag('li')
        kind = plaintext(ul.previous.content)
        for el in li:
            if el is not None:
                save = "%s, %s \n" % (plaintext(el.content).replace('\n', ','), kind)
                row.append(save)
    table.append(row)

    table.save("files/h_torino.txt")
Example #5
    def dumpdata(self):
        ''' Utility method to dump data in a csv file for later upload to the
        final database. Final database fields is found below.
        ---------------------------------------------------------------------
            1.) text = models.CharField(max_length=200)
            2.) owner = models.CharField(max_length=20)
            3.) label = models.CharField(max_length=20)
            4.) usage = models.CharField(max_length=20)
            5.) disease_type = models.CharField(max_length=20, null=True)
            6.) urlentity = models.CharField(max_length=20)
            7.) hashtagentity = models.CharField(max_length=20)
            8.) tweet_time = models.DateTimeField(db_index=True, default=datetime.now)
            9.) location= models.ForeignKey(Location, null=True, blank=True)
            10.) location_string = models.CharField(max_length=20, null=True)
            11.) from_lang = models.CharField(max_length=20)
            12.) lat
            13.) lng
            14.) country
        ''' 
        try:
            # We extract information from the database and store it in a CSV file.
            data_dump = Datasheet.load("corpora/twitter/datadump2.csv")
            index = dict.fromkeys(data_dump.columns[0], True)

        except:
            data_dump = Datasheet()
            index = {}
        
        for tweet in epi.models.Tweet.objects.all(): 
            id = str(hash(tweet.owner + tweet.text))   
            
            if len(data_dump) == 0 or id not in index:
                data_dump.append([id, tweet.text, tweet.owner, tweet.label, \
                tweet.usage, '', tweet.urlentity, tweet.tweet_time,\
                '', tweet.location, ''])
                index[id] = True
                
            data_dump.save("corpora/twitter/datadump2.csv")
    def classify(self, document):
        ''' This method is used to classify new documents. Uses the saved model.
        '''
        
        #Loading csv predictions and corpora documents.
        try: 
            nb_predictions = Datasheet.load("predictions/NB/patterns_nb.csv")
            nb_corpus = Datasheet.load("corpora/NB/nb.csv")

            index_pred = dict.fromkeys(nb_predictions.columns[0], True)
            index_corp = dict.fromkeys(nb_corpus.columns[0], True)
        except:
            nb_predictions = Datasheet()
            nb_corpus = Datasheet()
            index_pred = {}
            index_corp = {}

        #Load model from file system
        classifier = Classifier.load('models/nb_model.ept')
        label = classifier.classify(Document(document))
        probability = classifier.classify(Document(document), discrete=False)[label]

        id = str(hash(label + document))

        if ("positive" in label):
            if len(nb_predictions) == 0 or id not in index_pred:
                nb_predictions.append([id, label, document, probability])
                index_pred[id] = True
                
        if len(nb_corpus) == 0 or id not in index_corp:
            nb_corpus.append([id, label, document, probability])
            index_corp[id] = True

        nb_predictions.save("predictions/NB/patterns_nb.csv")
        nb_corpus.save("corpora/NB/nb.csv")

        return label
    def classify(self, document):
        ''' This method is used to classify new documents. Uses the saved model.
        '''
        
        #Loading csv predictions and corpora documents.
        try: 
            svm_predictions = Datasheet.load("predictions/svm.csv")
            svm_corpus = Datasheet.load("corpora/svm/svm.csv")

            index_pred = dict.fromkeys(svm_predictions.columns[0], True)
            index_corp = dict.fromkeys(svm_corpus.columns[0], True)
        except:
            svm_predictions = Datasheet()
            svm_corpus = Datasheet()
            index_pred = {}
            index_corp = {}

        #Load model from file system
        classifier = Classifier.load('models/svm_model2.ept')
        label = classifier.classify(Document(document))

        id = str(hash(label + document))

        if ("positive" in label):
            if len(svm_predictions) == 0 or id not in index_pred:
                svm_predictions.append([id, label, document])
                index_pred[id] = True
                
        if len(svm_corpus) == 0 or id not in index_corp:
            svm_corpus.append([id, label, document])
            index_corp[id] = True

        svm_predictions.save("predictions/svm.csv")
        svm_corpus.save("corpora/svm/svm.csv")

        return label
Example #8
id = None
for i in range(3):
    # Look for tweets containing the search query.
    # We can get a maximum of 100 tweets per search.
    # Don't cache the results locally,
    # so we get the latest new tweets when the script runs.
    # Do this 3x.
    for tweet in twitter.search(q, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
            print tweet.text
            print
            seen.add(id)
            csv.append([
                tweet.id, q, tweet.author, tweet.text, tweet.retweets,
                tweet.date
            ])
    # Update the CSV.
    csv.save(PATH)

# Each time you run the script, new tweets will be appended to the CSV.
# For example, we have Twitter miners that automatically run 10x each day,
# and have been running for many days and weeks.

# We can then import the data in other scripts, e.g.:

#from pattern.db import Datasheet, pd
#csv = Datasheet.load(pd("tweets.csv"))
#for id, q, author, text, retweets, date in csv:
#    print text
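
A runnable version of that reading step, assuming the miner above saved its Datasheet as tweets.csv next to this script:

from pattern.db import Datasheet, pd

csv = Datasheet.load(pd("tweets.csv"))
for id, q, author, text, retweets, date in csv:
    print(text)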
Example #9
                    score = review.by_class("swSprite")[0]
                    score = score.attributes["title"]
                    score = score.split(" ")[0]
                    score = float(score)

                    # The review is contained as plain text in the <div>.
                    text = ""
                    for child in review.children:
                        if child.type == "text":
                            text += child.source + " "
                    text = text.strip()
                    text = plaintext(text)  # Remove HTML entities, tags, etc.

                    if text:
                        corpus.append((text, score))
                        print score
                        print text
                        print

                except Exception as e:
                    #print e
                    pass

        # Now and then, save the corpus of (review, score) items as a .csv file.
        corpus.save("books-fr.csv")

# Can you think of other test data to mine for?
# Can you see why it would be useful to have different test sets?
# - Instead of book reviews + star rating, how about tweets + #win or #fail hashtag?
# - How about hotel reviews + star rating from http://fr.hotels.com?
# - ...
Example #10
try:
    # In the first column, we'll store a unique ID for each status update.
    # We only want to add the latest Facebook statuses, i.e., those we haven't previously encountered.
    # With an index on the first column we can quickly check if an ID already exists.
    # The index becomes important once more and more rows are added to the table (speed).
    table = Datasheet.load("travel.txt")
    index = dict.fromkeys(table.columns[0], True)
except:
    table = Datasheet()
    index = {}

engine = Facebook()

# With cached=False, a live request is sent to Facebook,
# so we get the latest results for the query instead of those in the local cache.
for status in engine.search("Travelling to", count=25, cached=False):
    print status.description
    print status.author
    print status.date
    print
    id = status.url
    # Only add the status update to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, status.description])
        index[id] = True

table.save("travel.txt")

print "Total results:", len(table)
print

Example #11
    index2 = dict.fromkeys(table.columns[1], True)
except:
    table = Datasheet()
    index = {}
    index2 = {}


engine = Twitter(language="en")
comparray=[" "] #spam filter 
# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for i in range(1, 10000):
    for tweet in Twitter().search('volcano OR Sicily AND etna +exclude:retweets', start=i, count=100):
        comparray.append(tweet.text[0:15])
        print tweet.text
        print tweet.author
        print tweet.date
        print hashtags(tweet.text) # Keywords in tweets start with a #.
        print
        # Create a unique ID based on the tweet content and author.
        id = str(hash(tweet.author + tweet.text))
        # Only add the tweet to the table if it doesn't already contain this ID.     
        # Skip tweets that are already indexed, have neutral sentiment, or repeat the previous tweet
        # (these extra checks only apply once the table is non-empty).
        if len(table) == 0 or (id not in index and sentiment(tweet.text)[0] != 0 and comparray[-1] != comparray[-2]):
            table.append([id, tweet.author, tweet.date, tweet.text, sentiment(tweet.text)[0]])
            index[id] = True

table.save("tweets.csv")

print "Total results:", len(table)
print
for i in range(2):
    print(i)
    for tweet in engine.search("eulogy", start=prev, count=25, cached=False):
        print("")
        print(tweet.text)
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print("")
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("eulogy_july_21.csv"))

print("Total results: %s" % len(table))
print("")

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...

pprint(table, truncate=100)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
Example #13
      ("id", INTEGER), # Define the column headers.
    ("name", STRING),
    ("type", STRING)
])

print ds.rows[0]    # A list of rows.
print ds.columns[1] # A list of columns, where each column is a list of values.
print ds.name
print

# Columns can be manipulated directly like any other Python list.
# This can be slow for large tables. If you need a fast way to do matrix math,
# use numpy (http://numpy.scipy.org/) instead. 
# The purpose of Table is data storage.
ds.columns.append([
    "green",
    "purple",
    "white",
    "yellow"
], field=("color", STRING))

# Save as a comma-separated (unicode) text file.
ds.save("food.txt", headers=True)

# Load a table from file.
ds = Datasheet.load("food.txt", headers=True)

pprint(ds, truncate=50, padding=" ", fill=".")
print
print ds.fields
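
As the comments above suggest, heavier matrix math is better done with numpy; a minimal sketch, assuming numpy is installed and the first column holds numeric values:

import numpy as np

ids = np.array(ds.columns[0], dtype=float)  # The "id" column as a numeric array.
print(ids.mean())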
Example #14
engine = Twitter(language="en")

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description)  # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
    id = str(hash(tweet.author + tweet.description))
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.description])
        index[id] = True

table.save("cool.csv")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file it can grow comfortably each time the script runs.
# We can also open the table later on, in other scripts, for further analysis.
#pprint(table)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
Example #15
engine = Twitter(language="en")

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description) # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
    id = hash(tweet.author + tweet.description)
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.description])
        index[id] = True

table.save("cool.txt")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file it can grow comfortably each time the script runs.
# We can also open the table later on, in other scripts, for further analysis.
#pprint(table)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
Example #16
fb = Facebook()

# With cached=False, a live request is sent to Facebook,
# so we get the latest results for the query instead of those in the local cache.
for status in fb.search("horrible", count=25, cached=False):
    print status.text
    print status.author
    print status.date
    print
    id = status.id
    # Only add the status update to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, status.text])
        index[id] = True

table.save("negative.txt")

# 2) Status updates from profiles.

# You need a personal license key first:
# http://www.clips.ua.ac.be/media/pattern-fb.html
license = ""

if license != "":
    fb = Facebook(license)
    # Facebook.profile() returns an (id, name, date of birth, gender, locale)-tuple.
    # By default, this is your own profile.
    # You can also supply the id of another profile.
    me = fb.profile()[0]
    for status in fb.search(me, type=NEWS, count=10, cached=False):
        print status.id  # Status update unique ID.
__author__ = 'Nitin'
from collocations import get_knowledge_from_collocations
from associations import get_knowledge_from_associations
from common_sense import get_knowledge_from_pattern_common_sense
from wordnet_nyms import get_knowledge_from_wordnet
from pattern.db import Datasheet
import logging

logging.basicConfig(level=logging.INFO)
logging.getLogger(__name__)

g = []
get_knowledge_from_collocations(g)
get_knowledge_from_associations(g)
get_knowledge_from_pattern_common_sense(g)
#get_knowledge_from_wordnet(g)
knowledge = [(head.strip(), tail.strip(), relation)
             for head, tail, relation in g
             if '.' in head and '.' in tail
             and '/' not in head and '/' not in tail
             and '(' not in head and '(' not in tail
             and ')' not in head and ')' not in tail
             and not head.startswith('.') and not tail.startswith('.')]

logging.info('Memorising...')
ds = Datasheet()
for speck in knowledge:
    ds.append(speck)

ds.save('knowledge.csv')
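
A small sketch, not part of the original script, of reading the saved triples back; each row was appended as (head, tail, relation):

from pattern.db import Datasheet

for head, tail, relation in Datasheet.load('knowledge.csv'):
    print("%s --%s--> %s" % (head, relation, tail))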
Example #18
#print adjectives

# We now want to sort the dictionary by frequency.
# The items() method of a Python dictionary returns a list of (key, value)-tuples.
# In our case, (lemma, [frequency, [form1, form2, ...]]), for example:
# ("beau", [620.07, ["beau", "beaux", "belle", "belles"]])
# We'll make a new list with the frequency at the start of each tuple.
# We can then sort the list by frequency.
adjectives = adjectives.items()
adjectives = [(weight, lemma, forms) for lemma, (weight, forms) in adjectives]
adjectives = sorted(adjectives, reverse=True) # Highest-first.
#print adjectives

# We want to save our list of adjectives as a new corpus.
# Something more manageable than 24MB.
# I prefer a new .csv file with two fields: lemma, and forms (comma-separated).
# Adjectives higher up in the list are more frequent,
# we should deal with those first to get a good coverage.
corpus = Datasheet()
for frequency, lemma, forms in adjectives:
    field1 = lemma
    field2 = ",".join(forms) # Collapse list to comma-separated string.
    corpus.append( [field1, field2] )
corpus.save("adj-fr.csv")

# We end up with a 500KB list of words commonly used to express emotion or opinion,
# sorted by how often they occur in books,
# along with their inflected forms (gender/number, such as "belles").
# The top 10 most frequent adjectives are: 
# "tout", "petit", "grand", "seul", "autre", "meme", "bon", "premier", "beau", "jeune", ...
Example #19
        # 4) Remove HTML tags:
        try:
            s = plaintext(e)
            s = s.strip()
        except:
            continue

        #if not s:
        #    print r.url
        #    print
        #    continue

        # 5) Save to CSV:
        if r.url not in seen:
            seen.add(r.url)
            csv.append((
                name,
                r.title,
                s,
                label,
            ))
            print(name, r.title)

    csv.save(pd(PATH))

# To read the dataset:
#for name, label, article in Datasheet.load(PATH):
#print(name, label, article)
#datasheet = Datasheet.load(PATH)
#print(datasheet)
    tokens = []
    words = word_tokenize(tweet)  # Tokenize once instead of on every iteration.
    for i in range(0, len(words)):
        if words[i] == '@':
            tokens.append(str('@' + words[i + 1]))
        if words[i] == '#':
            tokens.append(str('#' + words[i + 1]))
    new_bigrams = nltk.bigrams(tokens)
    for bigram in new_bigrams: 
        bigram_table.append(bigram)
    all_tokens.extend(tokens)  # Collect this tweet's tokens once.

token_freq = nltk.FreqDist(all_tokens)

str_today = 'tweet_graph_' + str(date.today().day) + '-' + str(date.today().month) + '-' + str(date.today().year) + '.csv'
bigram_table.save(str_today)

new_twitter_subjects = list(set(all_tokens))
another_table = Datasheet()

# save the original list of twitter users, we'll use this in cytoscape 
spamWriter = csv.writer(open('original_twitter.csv', 'wb'), delimiter=' ', quotechar='|')
for i in list(set(all_tokens)): 
    spamWriter.writerow([i, 1])

for twitter_subject in new_twitter_subjects:
    # With cached=False, a live request is sent to Twitter,
    # so we get the latest results for the query instead of those in the local cache.
    for tweet in engine.search(twitter_subject, count=275, cached=False):
        # Create a unique ID based on the tweet content and author.
        id = hash(tweet.author + tweet.description)
    seen = set()

twitter = Twitter(language="en", license=None)

for name in celebrities():
    id = None
    for tweet in twitter.search(name, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
            print name, tweet.text
            print
            seen.add(id)
            for w in adjectives(tweet.text):
                if not w.startswith(("@", "~", "1", "2")): # filter out weirdness
                    csv.append([tweet.id, name, w])
    csv.save(PATH)

# ------------------------------------------------------------------------------------
# Dataset reader.

PATH = pd("properties.csv")

f = {} # {celebrity: {property: count}}
for id, name, p in Datasheet.load(PATH):
    if name not in f:
        f[name] = {}     # {"Justin Bieber": {}}
    if p not in f[name]:
        f[name][p] = 0   # {"Justin Bieber": {"gay": 0}}
    f[name][p] += 1      # {"Justin Bieber": {"gay": 1}}
        
#print f["Eminem"]
Example #22
    print(i)
    for tweet in engine.search("is cooler than", start=prev, count=25, cached=False):
        print()
        print(tweet.text.encode("utf-8"))
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print()
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("cool.csv"))

print("Total results:", len(table))
print()

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further
# analysis, ...

pprint(table, truncate=100)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
    prev = None

    print "processing word:",word

    for tweet in twitter.search(word,start=prev,cached=False,count=200):

        # print
        #
        # print tweet.text
        # print tweet.author
        # print tweet.date
        # print hashtags(tweet.text)
        #
        # print
        clean_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",tweet.text).split())

        if tweet.id not in index and clean_text not in texts:
            table.append([tweet.id, tweet.text, clean_text, hashtags(tweet.text)])
            index.add(tweet.id)
            texts.add(clean_text)

        prev = tweet.id
#
table.save(pd("tweets_threats.csv"))


# pprint(table,truncate=100)


Example #24
    words = word_tokenize(tweet)  # Tokenize once instead of on every iteration.
    for i in range(0, len(words)):
        if words[i] == '@':
            tokens.append(str('@' + words[i + 1]))
        if words[i] == '#':
            tokens.append(str('#' + words[i + 1]))
    new_bigrams = nltk.bigrams(tokens)
    for bigram in new_bigrams:
        bigram_table.append(bigram)
    all_tokens.extend(tokens)  # Collect this tweet's tokens once.

token_freq = nltk.FreqDist(all_tokens)

str_today = 'tweet_graph_' + str(date.today().day) + '-' + str(
    date.today().month) + '-' + str(date.today().year) + '.csv'
bigram_table.save(str_today)

new_twitter_subjects = list(set(all_tokens))
another_table = Datasheet()

# save the original list of twitter users, we'll use this in cytoscape
spamWriter = csv.writer(open('original_twitter.csv', 'wb'),
                        delimiter=' ',
                        quotechar='|')
for i in list(set(all_tokens)):
    spamWriter.writerow([i, 1])

for twitter_subject in new_twitter_subjects:
    # With cached=False, a live request is sent to Twitter,
    # so we get the latest results for the query instead of those in the local cache.
    for tweet in engine.search(twitter_subject, count=275, cached=False):
Example #25
# because a query is instant when it is executed the second time.
for status in fb.search("horrible", count=25, cached=False):
    print "=" * 100
    print status.id
    print status.text
    print status.author # Yields an (id, name)-tuple.
    print status.date
    print status.likes
    print status.comments
    print
    # Only add the status update to the table if it doesn't already exist.
    if len(table) == 0 or status.id not in index:
        table.append([status.id, status.text])
        index.add(status.id)

table.save("opinions.txt")

# 2) Status updates from specific profiles.
#    For this you need a personal license key:
#    http://www.clips.ua.ac.be/media/pattern-fb.html

license = ""

if license != "":
    fb = Facebook(license)
    # Facebook.profile() returns an (id, name, date of birth, gender, locale, likes)-tuple.
    # By default, this is your own profile. 
    # You can also supply the id of another profile, 
    # or the name of a product page.
    me = fb.profile()[0]
    for status in fb.search(me, type=NEWS, count=30, cached=False):
Example #26
engine = Twitter(language="en")

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description) # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
    id = str(hash(tweet.author + tweet.description))
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.description])
        index[id] = True

table.save("cool.csv")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file it can grow comfortably each time the script runs.
# We can also open the table later on, in other scripts, for further analysis.
#pprint(table)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

prev = '1071765537749917696'

counter = 0

while counter < 1000:

    counter += 1
    time.sleep(60)
    for tweet in engine.search("#Apple", start=prev, count=10, cached=False):
        print(tweet.id)
        #        print(tweet.text)
        #        print(tweet.date)
        tweet_sentiment = sentiment(tweet.text)
        print(tweet_sentiment)

        if len(table) == 0 or tweet.id not in index:

            table.append([tweet.id, tweet.date, tweet.text, tweet_sentiment])
            index.add(tweet.id)

        prev = tweet.id

table.save(pd("tweets2.csv"))
csv = Datasheet()

for word, pos in lexicon.items():

    if " " not in word:

        f = frequency.get(word, frequency.get(word.lower(), 0))

        # Map to Penn Treebank II tagset.

        penn = [PENN[tag] for tag in pos if tag in PENN]
        # Keep symbol and punctuation tags that have no Penn mapping as-is.
        penn += [tag for tag in pos if tag in ("SYM", ".", ",", ":", "\"", "(", ")", "#", "$")]
        penn = ", ".join(penn)

        # Collect tagged words in the .csv file.

        csv.append((f, word, penn))

        # Collect special words for post-processing.

        for tag in SPECIAL:

            if tag in pos:
                special.add(word)

csv.columns[0].sort(reverse=True)
csv.save("it-lexicon.csv")

print special
fb = Facebook()

# With cached=False, a live request is sent to Facebook,
# so we get the latest results for the query instead of those in the local cache.
for status in fb.search("horrible", count=25, cached=False):
    print status.text
    print status.author
    print status.date
    print
    id = status.id
    # Only add the status update to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, status.text])
        index[id] = True

table.save("negative.txt")

# 2) Status updates from profiles.

# You need a personal license key first:
# http://www.clips.ua.ac.be/media/pattern-fb.html
license = ""

if license != "":
    fb = Facebook(license)
    # Facebook.profile() returns an (id, name, date of birth, gender, locale)-tuple.
    # By default, this is your own profile. 
    # You can also supply the id of another profile.
    me = fb.profile()[0]
    for status in fb.search(me, type=NEWS, count=10, cached=False):
        print status.id   # Status update unique ID.
Example #30
engine = Twitter(language="en")

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description)  # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
    id = hash(tweet.author + tweet.description)
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.description])
        index[id] = True

table.save("cool.txt")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file it can grow comfortably each time the script runs.
# We can also open the table later on, in other scripts, for further analysis.
#pprint(table)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
                    score = review.by_class("swSprite")[0]
                    score = score.attributes["title"]
                    score = score.split(" ")[0]
                    score = float(score)
                
                    # The review is contained as plain text in the <div>.
                    text = ""
                    for child in review.children:
                        if child.type == "text":
                            text += child.source + " "
                    text = text.strip()
                    text = plaintext(text) # Remove HTML entities, tags, etc.
                    
                    if text:
                        corpus.append((text, score))
                        print score
                        print text
                        print
                
                except Exception as e:
                    #print e
                    pass

        # Now and then, save the corpus of (review, score) items as a .csv file.
        corpus.save("books-fr.csv")
        
# Can you think of other test data to mine for?
# Can you see why it would be useful to have different test sets?
# - Instead of book reviews + star rating, how about tweets + #win or #fail hashtag?
# - How about hotel reviews + star rating from http://fr.hotels.com?
# - ...
for status in fb.search("horrible", count=25, cached=False):
    print("=" * 100)
    print(status.id)
    print(status.text.encode("utf-8"))
    print(status.author)  # Yields an (id, name)-tuple.
    print(status.date)
    print(status.likes)
    print(status.comments)
    print()
    # Only add the status update to the table if it doesn't already exist.
    if len(table) == 0 or status.id not in index:
        table.append([status.id, status.text])
        index.add(status.id)

# Create a .csv in pattern/examples/01-web/
table.save(pd("opinions.csv"))

# 2) Status updates from specific profiles.
#    For this you need a personal license key:
#    http://www.clips.ua.ac.be/pattern-facebook

license = ""

if license != "":
    fb = Facebook(license)
    # Facebook.profile() returns a dictionary with author info.
    # By default, this is your own profile.
    # You can also supply the id of another profile,
    # or the name of a product page.
    me = fb.profile()["id"]
    for status in fb.search(me, type=NEWS, count=30, cached=False):
                               cached=False):
        print("")
        print(tweet.text)
        # print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print("")
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("black.csv"))

print("Total results: %s" % len(table))
print("")

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...

pprint(table, truncate=100)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")

# Import util libraries
import tweepy