def main():
    logging.basicConfig(level=logging.INFO)
    argparser = ArgumentParser(description=__doc__)
    argparser.add_argument("-t", "--trainset", action="store", default=None,
                           help="Path to training data [default: %(default)s]")
    argparser.add_argument("-m", "--model", action="store",
                           help="Path to model")
    argparser.add_argument("-d", "--dump", action="store_true",
                           help="Pickle trained model? [default: False]")
    argparser.add_argument("-v", "--verbose", action="store_true", default=False,
                           help="Verbose [default: quiet]")
    argparser.add_argument("-c", "--classify", action="store", default=None,
                           help="Path to data to classify [default: %(default)s]")
    argparser.add_argument("-s", "--save", action="store", default="output.csv",
                           help="Path to output file [default: %(default)s]")
    args = argparser.parse_args()

    clf = SensationalismClassifier(train_data=args.trainset,
                                   model=args.model,
                                   dump=args.dump,
                                   debug=args.verbose)

    if args.classify:
        OUTPUT_PATH = args.save
        if clf.debug:
            tick = time()
        to_classify = Datasheet.load(args.classify)
        classified_data = clf.classify(to_classify)
        output = Datasheet(classified_data)
        output.save(pd(OUTPUT_PATH))
        if clf.debug:
            sys.stderr.write("\nProcessed %d items in %0.2fs"
                             % (len(classified_data), time() - tick))
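# Assuming this script were saved as classify_text.py (the filename is ours,
# not the original project's), a typical invocation might look like:
#
#   python classify_text.py --model model.pkl --classify articles.csv \
#       --save labeled.csv --verbose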
def scrape_news_text(news_url):
    global counter
    news_html = requests.get(news_url).content
    # Convert the HTML to a BeautifulSoup object.
    news_soup = BeautifulSoup(news_html, 'lxml')
    date_object = news_soup.find(itemprop="datePublished")
    news_object = news_soup.find("div", {"id": "articleText"})
    if date_object is None:
        return " "
    if news_object is None:
        return " "
    news_date = date_object.get_text()
    news_text = news_object.text
    print(news_url)
    try:
        # Results are stored in a Datasheet: a table of rows and columns that
        # can be exported as a CSV file. The pd() function returns the parent
        # directory of this script + any given path.
        table = Datasheet.load(pd("nasdaq2.csv"))
    except:
        table = Datasheet()
    news_sentiment = sentiment(news_text)
    print(news_sentiment)
    table.append([counter, news_date, news_url, news_sentiment])
    table.save(pd("nasdaq2.csv"))
    counter += 1
    return news_text
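# The load-or-create / append-if-new / save idiom above recurs throughout
# these scripts. A minimal reusable sketch of it (the helper name and the
# key argument are ours, not the original code's):
from pattern.db import Datasheet, pd

def append_if_new(path, row, key=0):
    try:
        table = Datasheet.load(pd(path))
        index = set(table.columns[key])
    except:
        table = Datasheet()
        index = set()
    if row[key] not in index:
        table.append(row)
        table.save(pd(path))
    return table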
def main():
    table = Datasheet()
    for cap in CAPS:
        url = URL("http://www.comuni-italiani.it/001/272/farmacie/cap%s.html" % cap)
        connection = url.open()
        doc = Document(connection.read())
        items = doc.by_tag("table")
        row = []
        for j, td in enumerate(items[5].by_tag('td')):
            strcap = "%s, Telefono:" % cap
            save = plaintext(td.content).replace('\n', ',', 3) \
                                        .replace("Telefono:", strcap) \
                                        .replace(";", "") \
                                        .replace("Partita Iva", ",Partita Iva") + "\n"
            # plaintext() always returns a string, so the original "!= None"
            # check was always true; skip empty results instead.
            if save.strip():
                row.append(save)
        table.append(row)
        print "%s ----------------------------------------------------------------------------" % str(j)
    table.save("files/farmacie_torino.txt")
def main():
    table = Datasheet()
    url = URL("http://www.comuniecitta.it/torino/elenco-ospedali-di-torino.html")
    connection = url.open()
    doc = Document(connection.read())
    items = doc.by_class('ulamm')[1:]
    row = []
    for ul in items:
        li = ul.by_tag('li')
        kind = plaintext(ul.previous.content)
        for el in li:
            if el is not None:
                save = "%s, %s \n" % (plaintext(el.content).replace('\n', ','), kind)
                row.append(save)
    table.append(row)
    table.save("files/h_torino.txt")
def dumpdata(self):
    '''
    Utility method to dump data to a csv file for later upload to the final
    database. The final database fields are listed below.
    ---------------------------------------------------------------------
    1.)  text = models.CharField(max_length=200)
    2.)  owner = models.CharField(max_length=20)
    3.)  label = models.CharField(max_length=20)
    4.)  usage = models.CharField(max_length=20)
    5.)  disease_type = models.CharField(max_length=20, null=True)
    6.)  urlentity = models.CharField(max_length=20)
    7.)  hashtagentity = models.CharField(max_length=20)
    8.)  tweet_time = models.DateTimeField(db_index=True, default=datetime.now)
    9.)  location = models.ForeignKey(Location, null=True, blank=True)
    10.) location_string = models.CharField(max_length=20, null=True)
    11.) from_lang = models.CharField(max_length=20)
    12.) lat
    13.) lng
    14.) country
    '''
    try:
        # We extract information from the database and store it in a csv.
        # The index is built on the id column so existing rows are skipped.
        # (The original indexed the first *row*, data_dump[0], which never
        # matches the hashed ids; the id column is what we want.)
        data_dump = Datasheet.load("corpora/twitter/datadump2.csv")
        index = dict.fromkeys(data_dump.columns[0], True)
    except:
        data_dump = Datasheet()
        index = {}
    for tweet in epi.models.Tweet.objects.all():
        id = str(hash(tweet.owner + tweet.text))
        if len(data_dump) == 0 or id not in index:
            data_dump.append([id, tweet.text, tweet.owner, tweet.label,
                              tweet.usage, '', tweet.urlentity, tweet.tweet_time,
                              '', tweet.location, ''])
            index[id] = True
    data_dump.save("corpora/twitter/datadump2.csv")
def classify(self, document):
    '''
    Classify a new document, using the saved model.
    '''
    # Load csv predictions and corpora documents.
    try:
        nb_predictions = Datasheet.load("predictions/NB/patterns_nb.csv")
        nb_corpus = Datasheet.load("corpora/NB/nb.csv")
        index_pred = dict.fromkeys(nb_predictions.columns[0], True)
        index_corp = dict.fromkeys(nb_corpus.columns[0], True)
    except:
        nb_predictions = Datasheet()
        nb_corpus = Datasheet()
        index_pred = {}
        index_corp = {}
    # Load the model from the file system.
    classifier = Classifier.load('models/nb_model.ept')
    label = classifier.classify(Document(document))
    probability = classifier.classify(Document(document), discrete=False)[label]
    id = str(hash(label + document))
    if "positive" in label:
        if len(nb_predictions) == 0 or id not in index_pred:
            nb_predictions.append([id, label, document, probability])
            index_pred[id] = True
        if len(nb_corpus) == 0 or id not in index_corp:
            nb_corpus.append([id, label, document, probability])
            index_corp[id] = True
    nb_predictions.save("predictions/NB/patterns_nb.csv")
    nb_corpus.save("corpora/NB/nb.csv")
    return label
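# For context, a minimal sketch of how a model file like the one loaded above
# could be produced with pattern.vector (the two training sentences are
# placeholders, not the project's actual corpus):
from pattern.vector import NB, Document

nb = NB()
for text, label in (("great, works as advertised", "positive"),
                    ("broke after two days", "negative")):
    nb.train(Document(text, type=label))
nb.save('models/nb_model.ept')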
def classify(self, document):
    '''
    Classify a new document, using the saved model.
    '''
    # Load csv predictions and corpora documents.
    try:
        svm_predictions = Datasheet.load("predictions/svm.csv")
        svm_corpus = Datasheet.load("corpora/svm/svm.csv")
        index_pred = dict.fromkeys(svm_predictions.columns[0], True)
        index_corp = dict.fromkeys(svm_corpus.columns[0], True)
    except:
        svm_predictions = Datasheet()
        svm_corpus = Datasheet()
        index_pred = {}
        index_corp = {}
    # Load the model from the file system.
    classifier = Classifier.load('models/svm_model2.ept')
    label = classifier.classify(Document(document))
    id = str(hash(label + document))
    if "positive" in label:
        if len(svm_predictions) == 0 or id not in index_pred:
            svm_predictions.append([id, label, document])
            index_pred[id] = True
        if len(svm_corpus) == 0 or id not in index_corp:
            svm_corpus.append([id, label, document])
            index_corp[id] = True
    svm_predictions.save("predictions/svm.csv")
    svm_corpus.save("corpora/svm/svm.csv")
    return label
id = None
for i in range(3):
    # Look for tweets containing the search query.
    # We can get a maximum of 100 tweets per search.
    # Don't cache the results locally,
    # so we get the latest new tweets when the script runs.
    # Do this 3x.
    for tweet in twitter.search(q, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
            print tweet.text
            print
            seen.add(id)
            csv.append([tweet.id, q, tweet.author, tweet.text, tweet.retweets, tweet.date])

# Update the CSV.
csv.save(PATH)

# Each time you run the script, new tweets are appended to the CSV.
# For example, we have Twitter miners that automatically run 10x each day,
# and have been running for many days and weeks.
# We can then import the data in other scripts, e.g.:

#from pattern.db import Datasheet, pd
#csv = Datasheet.load(pd("tweets.csv"))
#for id, q, author, text, retweets, date in csv:
#    print text
        score = review.by_class("swSprite")[0]
        score = score.attributes["title"]
        score = score.split(" ")[0]
        score = float(score)
        # The review is contained as plain text in the <div>.
        text = ""
        for child in review.children:
            if child.type == "text":
                text += child.source + " "
        text = text.strip()
        text = plaintext(text)  # Remove HTML entities, tags, etc.
        if text:
            corpus.append((text, score))
            print score
            print text
            print
    except Exception, e:
        #print e
        pass

# Now and then, save the corpus of (review, score) items as a .csv file.
corpus.save("books-fr.csv")

# Can you think of other test data to mine for?
# Can you see why it would be useful to have different test sets?
# - Instead of book reviews + star rating, how about tweets + #win or #fail hashtags?
# - How about hotel reviews + star rating from http://fr.hotels.com?
# - ...
# In the first column, we'll store a unique ID for each status update.
# We only want to add the latest Facebook statuses, i.e., those we haven't previously encountered.
# With an index on the first column we can quickly check if an ID already exists.
# The index becomes important once more and more rows are added to the table (speed).
try:
    table = Datasheet.load("travel.txt")
    index = dict.fromkeys(table.columns[0], True)
except:
    table = Datasheet()
    index = {}

engine = Facebook()

# With cached=False, a live request is sent to Facebook,
# so we get the latest results for the query instead of those in the local cache.
for status in engine.search("Travelling to", count=25, cached=False):
    print status.description
    print status.author
    print status.date
    print
    id = status.url
    # Only add the status to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, status.description])
        index[id] = True

table.save("travel.txt")

print "Total results:", len(table)
print
try:
    # (The fragment began inside this try-block; the load and the first
    # index are reconstructed to mirror the save path below.)
    table = Datasheet.load("tweets.csv")
    index = dict.fromkeys(table.columns[0], True)
    index2 = dict.fromkeys(table.columns[1], True)
except:
    table = Datasheet()
    index = {}
    index2 = {}

engine = Twitter(language="en")
comparray = [" "]  # Spam filter: remembers the start of the previous tweet.

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for i in range(1, 10000):
    for tweet in Twitter().search('volcano OR Sicily AND etna +exclude:retweets', start=i, count=100):
        comparray.append(tweet.text[0:15])
        print tweet.text
        print tweet.author
        print tweet.date
        print hashtags(tweet.text)  # Keywords in tweets start with a #.
        print
        # Create a unique ID based on the tweet content and author.
        id = str(hash(tweet.author + tweet.text))
        # Only add the tweet if it is new, carries sentiment, and does not
        # repeat the start of the previous tweet. The parentheses matter:
        # the original "a or b and c and d" added every tweet while the
        # table was empty.
        if (len(table) == 0 or id not in index) and sentiment(tweet.text)[0] != 0 and comparray[-1] != comparray[-2]:
            table.append([id, tweet.author, tweet.date, tweet.text, sentiment(tweet.text)[0]])
            index[id] = True

table.save("tweets.csv")

print "Total results:", len(table)
print
for i in range(2):
    print(i)
    for tweet in engine.search("eulogy", start=prev, count=25, cached=False):
        print("")
        print(tweet.text)
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print("")
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in the next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("eulogy_july_21.csv"))

print("Total results: %s" % len(table))
print("")

# Print all the rows in the table.
# Since it is stored as a CSV file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...
pprint(table, truncate=100)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
("id", INTEGER), # Define the column headers. ("name", STRING), ("type", STRING) ]) print ds.rows[0] # A list of rows. print ds.columns[1] # A list of columns, where each column is a list of values. print ds.name print # Columns can be manipulated directly like any other Python list. # This can be slow for large tables. If you need a fast way to do matrix math, # use numpy (http://numpy.scipy.org/) instead. # The purpose of Table is data storage. ds.columns.append([ "green", "purple", "white", "yellow" ], field=("color", STRING)) # Save as a comma-separated (unicode) text file. ds.save("food.txt", headers=True) # Load a table from file. ds = Datasheet.load("food.txt", headers=True) pprint(ds, truncate=50, padding=" ", fill=".") print print ds.fields
engine = Twitter(language="en") # With cached=False, a live request is sent to Twitter, # so we get the latest results for the query instead of those in the local cache. for tweet in engine.search("is cooler than", count=25, cached=False): print tweet.description print tweet.author print tweet.date print hashtags(tweet.description) # Keywords in tweets start with a #. print # Create a unique ID based on the tweet content and author. id = str(hash(tweet.author + tweet.description)) # Only add the tweet to the table if it doesn't already contain this ID. if len(table) == 0 or id not in index: table.append([id, tweet.description]) index[id] = True table.save("cool.csv") print "Total results:", len(table) print # Print all the rows in the table. # Since it is stored as a file it can grow comfortably each time the script runs. # We can also open the table later on, in other scripts, for further analysis. #pprint(table) # Note: you can also search tweets by author: # Twitter().search("from:tom_de_smedt")
engine = Twitter(language="en") # With cached=False, a live request is sent to Twitter, # so we get the latest results for the query instead of those in the local cache. for tweet in engine.search("is cooler than", count=25, cached=False): print tweet.description print tweet.author print tweet.date print hashtags(tweet.description) # Keywords in tweets start with a #. print # Create a unique ID based on the tweet content and author. id = hash(tweet.author + tweet.description) # Only add the tweet to the table if it doesn't already contain this ID. if len(table) == 0 or id not in index: table.append([id, tweet.description]) index[id] = True table.save("cool.txt") print "Total results:", len(table) print # Print all the rows in the table. # Since it is stored as a file it can grow comfortably each time the script runs. # We can also open the table later on, in other scripts, for further analysis. #pprint(table) # Note: you can also search tweets by author: # Twitter().search("from:tom_de_smedt")
fb = Facebook()

# With cached=False, a live request is sent to Facebook,
# so we get the latest results for the query instead of those in the local cache.
for status in fb.search("horrible", count=25, cached=False):
    print status.text
    print status.author
    print status.date
    print
    id = status.id
    # Only add the status update to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, status.text])
        index[id] = True

table.save("negative.txt")

# 2) Status updates from profiles.
# You need a personal license key first:
# http://www.clips.ua.ac.be/media/pattern-fb.html
license = ""

if license != "":
    fb = Facebook(license)
    # Facebook.profile() returns an (id, name, date of birth, gender, locale)-tuple.
    # By default, this is your own profile.
    # You can also supply the id of another profile.
    me = fb.profile()[0]
    for status in fb.search(me, type=NEWS, count=10, cached=False):
        print status.id  # Status update unique ID.
__author__ = 'Nitin'

import logging

from collocations import get_knowledge_from_collocations
from associations import get_knowledge_from_associations
from common_sense import get_knowledge_from_pattern_common_sense
from wordnet_nyms import get_knowledge_from_wordnet
from pattern.db import Datasheet

logging.basicConfig(level=logging.INFO)
logging.getLogger(__name__)

g = []
get_knowledge_from_collocations(g)
get_knowledge_from_associations(g)
get_knowledge_from_pattern_common_sense(g)
#get_knowledge_from_wordnet(g)

# Keep only (head, tail, relation) triples whose head and tail look like
# sense-tagged tokens (they contain a ".") and are free of path-like
# characters.
knowledge = [(head.strip(), tail.strip(), relation)
             for head, tail, relation in g
             if '.' in head and '.' in tail
             and '/' not in head and '/' not in tail
             and '(' not in head and '(' not in tail
             and ')' not in head and ')' not in tail
             and not head.startswith('.') and not tail.startswith('.')]

logging.info('Memorising...')

ds = Datasheet()
for speck in knowledge:
    ds.append(speck)
ds.save('knowledge.csv')
#print adjectives

# We now want to sort the dictionary by frequency.
# The items() method of a Python dictionary returns a list of (key, value)-tuples.
# In our case, (lemma, [frequency, [form1, form2, ...]]), for example:
# ("beau", [620.07, ["beau", "beaux", "belle", "belles"]])
# We'll make a new list with the frequency at the start of each tuple.
# We can then sort the list by frequency.
adjectives = adjectives.items()
adjectives = [(weight, lemma, forms) for lemma, (weight, forms) in adjectives]
adjectives = sorted(adjectives, reverse=True)  # Highest-first.
#print adjectives

# We want to save our list of adjectives as a new corpus,
# something more manageable than 24MB.
# I prefer a new .csv file with two fields: lemma, and forms (comma-separated).
# Adjectives higher up in the list are more frequent;
# we should deal with those first to get good coverage.
corpus = Datasheet()
for frequency, lemma, forms in adjectives:
    field1 = lemma
    field2 = ",".join(forms)  # Collapse list to comma-separated string.
    corpus.append([field1, field2])
corpus.save("adj-fr.csv")

# We end up with a 500KB list of words commonly used to express emotion or opinion,
# sorted by how often they occur in books,
# along with their inflected forms (gender/number, such as "belles").
# The top 10 most frequent adjectives are:
# "tout", "petit", "grand", "seul", "autre", "meme", "bon", "premier", "beau", "jeune", ...
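# The three-step sort above could also be written in a single pass with a
# generator and sorted(). A sketch, assuming the original {lemma: [frequency,
# [form1, form2, ...]]} dictionary (adjectives_by_lemma is our placeholder
# name for it, before the reassignment above):
ranked = sorted(((weight, lemma, forms)
                 for lemma, (weight, forms) in adjectives_by_lemma.items()),
                reverse=True)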
# 4) Remove HTML tags:
try:
    s = plaintext(e)
    s = s.strip()
except:
    continue

#if not s:
#    print r.url
#    print
#    continue

# 5) Save to CSV:
if r.url not in seen:
    seen.add(r.url)
    csv.append((
        name,
        r.title,
        s,
        label,
    ))
    print(name, r.title)

csv.save(pd(PATH))

# To read the dataset (rows are saved as name, title, article, label):
#for name, title, article, label in Datasheet.load(PATH):
#    print(name, label, article)

#datasheet = Datasheet.load(PATH)
#print(datasheet)
tokens = []
words = word_tokenize(tweet)  # Tokenize once instead of on every iteration.
# Stop one short of the end so words[i + 1] can't go out of range.
for i in range(len(words) - 1):
    if words[i] == '@':
        tokens.append(str('@' + words[i + 1]))
    if words[i] == '#':
        tokens.append(str('#' + words[i + 1]))

new_bigrams = nltk.bigrams(tokens)
for bigram in new_bigrams:
    bigram_table.append(bigram)

# (The original extended all_tokens once per token, duplicating entries;
# extending once is what the frequency count below expects.)
all_tokens.extend(tokens)
token_freq = nltk.FreqDist(all_tokens)

str_today = 'tweet_graph_' + str(date.today().day) + '-' + str(date.today().month) + '-' + str(date.today().year) + '.csv'
bigram_table.save(str_today)

new_twitter_subjects = list(set(all_tokens))
another_table = Datasheet()

# Save the original list of Twitter users; we'll use this in Cytoscape.
spamWriter = csv.writer(open('original_twitter.csv', 'wb'), delimiter=' ', quotechar='|')
for i in list(set(all_tokens)):
    spamWriter.writerow([i, 1])

for twitter_subject in new_twitter_subjects:
    # With cached=False, a live request is sent to Twitter,
    # so we get the latest results for the query instead of those in the local cache.
    for tweet in engine.search(twitter_subject, count=275, cached=False):
        # Create a unique ID based on the tweet content and author.
        id = hash(tweet.author + tweet.description)
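# A simpler way to pull @mentions and #hashtags out of a tweet is a single
# regular expression, instead of pairing adjacent word_tokenize() tokens
# (a sketch; the original scripts do not use this):
import re

def mentions_and_hashtags(text):
    return re.findall(r"[@#]\w+", text)

print(mentions_and_hashtags("News from @nasdaq about #Apple and #Tesla"))
# ['@nasdaq', '#Apple', '#Tesla']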
seen = set()
twitter = Twitter(language="en", license=None)
for name in celebrities():
    id = None
    for tweet in twitter.search(name, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
            print name, tweet.text
            print
            seen.add(id)
            for w in adjectives(tweet.text):
                if not w.startswith(("@", "~", "1", "2")):  # Filter out weirdness.
                    csv.append([tweet.id, name, w])
    csv.save(PATH)

# ------------------------------------------------------------------------------------
# Dataset reader.

PATH = pd("properties.csv")

f = {}  # {celebrity: {property: count}}
for id, name, p in Datasheet.load(PATH):
    if name not in f:
        f[name] = {}      # {"Justin Bieber": {}}
    if p not in f[name]:
        f[name][p] = 0    # {"Justin Bieber": {"gay": 0}}
    f[name][p] += 1       # {"Justin Bieber": {"gay": 1}}

#print f["Eminem"]
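# An equivalent reader using collections.Counter from the standard library;
# the properties.csv path and the [id, name, property] column layout follow
# the snippet above:
from collections import Counter
from pattern.db import Datasheet, pd

counts = {}  # {celebrity: Counter({property: count})}
for id, name, p in Datasheet.load(pd("properties.csv")):
    counts.setdefault(name, Counter())[p] += 1

#print counts["Eminem"].most_common(10)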
for i in range(2):  # (Loop header reconstructed; the fragment began inside it, cf. the parallel examples above.)
    print(i)
    for tweet in engine.search("is cooler than", start=prev, count=25, cached=False):
        print()
        print(tweet.text.encode("utf-8"))
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print()
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in the next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("cool.csv"))

print("Total results:", len(table))
print()

# Print all the rows in the table.
# Since it is stored as a CSV file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...
pprint(table, truncate=100)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
prev = None
print "processing word:", word
for tweet in twitter.search(word, start=prev, cached=False, count=200):
    # print
    # print tweet.text
    # print tweet.author
    # print tweet.date
    # print hashtags(tweet.text)
    # print
    clean_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet.text).split())
    if tweet.id not in index and clean_text not in texts:
        # (tweet.txt in the original was a typo for tweet.text.)
        table.append([tweet.id, tweet.text, clean_text, hashtags(tweet.text)])
        index.add(tweet.id)
        texts.add(clean_text)
    prev = tweet.id

# table.save(pd("tweets_threats.csv"))
# pprint(table, truncate=100)
# because a query is instant when it is executed the second time.
for status in fb.search("horrible", count=25, cached=False):
    print "=" * 100
    print status.id
    print status.text
    print status.author  # Yields an (id, name)-tuple.
    print status.date
    print status.likes
    print status.comments
    print
    # Only add the status update to the table if it doesn't already exist.
    if len(table) == 0 or status.id not in index:
        table.append([status.id, status.text])
        index.add(status.id)

table.save("opinions.txt")

# 2) Status updates from specific profiles.
# For this you need a personal license key:
# http://www.clips.ua.ac.be/media/pattern-fb.html
license = ""

if license != "":
    fb = Facebook(license)
    # Facebook.profile() returns an (id, name, date of birth, gender, locale, likes)-tuple.
    # By default, this is your own profile.
    # You can also supply the id of another profile,
    # or the name of a product page.
    me = fb.profile()[0]
    for status in fb.search(me, type=NEWS, count=30, cached=False):
try:
    # (The fragment began inside this try-block; the load is reconstructed
    # to mirror the save path below.)
    table = Datasheet.load(pd("tweets2.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")
prev = '1071765537749917696'
counter = 0
while counter < 1000:
    counter += 1
    time.sleep(60)
    for tweet in engine.search("#Apple", start=prev, count=10, cached=False):
        print(tweet.id)
        # print(tweet.text)
        # print(tweet.date)
        tweet_sentiment = sentiment(tweet.text)
        print(tweet_sentiment)
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.date, tweet.text, tweet_sentiment])
            index.add(tweet.id)
        prev = tweet.id
    table.save(pd("tweets2.csv"))
csv = Datasheet()
for word, pos in lexicon.items():
    if " " not in word:
        f = frequency.get(word, frequency.get(word.lower(), 0))
        # Map to Penn Treebank II tagset.
        penn = [PENN[tag] for tag in pos if tag in PENN]
        # Punctuation and symbol tags are kept as-is. (The original relied on
        # "tag" leaking out of the list comprehension above, which only checks
        # the last tag; this checks each tag in pos.)
        penn += [tag for tag in pos if tag in ("SYM", ".", ",", ":", "\"", "(", ")", "#", "$")]
        penn = ", ".join(penn)
        # Collect tagged words in the .csv file.
        csv.append((f, word, penn))
        # Collect special words for post-processing.
        for tag in SPECIAL:
            if tag in pos:
                special.add(word)

csv.columns[0].sort(reverse=True)
csv.save("it-lexicon.csv")

print special
for status in fb.search("horrible", count=25, cached=False):
    print("=" * 100)
    print(status.id)
    print(status.text.encode("utf-8"))
    print(status.author)  # Yields an (id, name)-tuple.
    print(status.date)
    print(status.likes)
    print(status.comments)
    print()
    # Only add the status update to the table if it doesn't already exist.
    if len(table) == 0 or status.id not in index:
        table.append([status.id, status.text])
        index.add(status.id)

# Create a .csv in pattern/examples/01-web/
table.save(pd("opinions.csv"))

# 2) Status updates from specific profiles.
# For this you need a personal license key:
# http://www.clips.ua.ac.be/pattern-facebook
license = ""

if license != "":
    fb = Facebook(license)
    # Facebook.profile() returns a dictionary with author info.
    # By default, this is your own profile.
    # You can also supply the id of another profile,
    # or the name of a product page.
    me = fb.profile()["id"]
    for status in fb.search(me, type=NEWS, count=30, cached=False):
                     cached=False):
    print("")
    print(tweet.text)
    # print(tweet.author)
    print(tweet.date)
    print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
    print("")
    # Only add the tweet to the table if it doesn't already exist.
    if len(table) == 0 or tweet.id not in index:
        table.append([tweet.id, tweet.text])
        index.add(tweet.id)
    # Continue mining older tweets in the next iteration.
    prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("black.csv"))

print("Total results: %s" % len(table))
print("")

# Print all the rows in the table.
# Since it is stored as a CSV file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...
pprint(table, truncate=100)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")

# Import util libraries
import tweepy