import json
import os
import random
from time import time

# Filter is assumed to come from the project's tweet filtering module;
# adjust the import path to match where the class is actually defined.
from tweet_filter import Filter

for document in cursor:
    # Normalize runs of whitespace in the tweet text to single spaces
    text = ' '.join(document["text"].split())
    corpus.append(text)
    ids.append(document["_id"])

# Filter repeated tweets
t0 = time()
i = 0
status = -1
unique_tweets = ["Dummy Tweet"]
length = len(corpus)
print("Filtering tweets may take a few minutes...")
for document in corpus:
    for tweet in unique_tweets:
        status = tweet_filter.check_duplicates(document, tweet)
        if status:
            # Duplicate found; stop comparing and skip this tweet
            break
    if not status:
        unique_tweets.append(document)
        i += 1
    if i > 3000:
        break
print("done in %0.3fs." % (time() - t0))

# Drop the dummy seed entry before replacing the corpus
unique_tweets.pop(0)
corpus = unique_tweets

# Draw a random sample of document indices (random.sample selects
# without replacement)
random_indices = random.sample(range(0, len(corpus)), q.num_of_docs)
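# A minimal sketch of how the sampled indices might be turned into the
# actual sample; the name `sample` and this step are assumptions, since
# the selection itself is not shown in this section.
sample = [corpus[idx] for idx in random_indices]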
# Open file I/O streams
directory = os.path.dirname(os.getcwd())
fn = "sample_" + str(months[month]) + "_" + str(day) + ".json"
f = open(directory + "/data/" + fn, "w+")

# Load tweets with ids, keeping only the unique ones
corpus = [{"text": "dummy"}]
tweetFilter = Filter(45)
i = 0
print("Filtering Results...")
for document in cursor:
    document["_id"] = str(document["_id"])
    document["text"] = document["text"].replace('"', "'")
    for tweet in corpus:
        status = tweetFilter.check_duplicates(document["text"], tweet["text"])
        if status:
            # Duplicate found; stop comparing and skip this document
            break
    if not status:
        # No match against any stored tweet, so keep the whole document
        # (appending the dict keeps tweet["text"] lookups valid on later passes)
        corpus.append(document)
        i += 1
    if i >= 100:
        break
print(i)

# Remove the dummy seed entry
corpus.pop(0)
json.dump(corpus, f, indent=1)
f.close()
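# The Filter class used above is not defined in this section. The sketch
# below is one plausible implementation, assuming Filter(threshold) takes a
# similarity percentage and check_duplicates() flags two tweets as
# duplicates when their texts are similar enough; the project's real
# implementation may differ.
from difflib import SequenceMatcher


class Filter:
    def __init__(self, threshold):
        # threshold is a percentage, e.g. Filter(45) treats tweets that
        # are at least 45% similar as duplicates
        self.threshold = threshold / 100.0

    def check_duplicates(self, text_a, text_b):
        # SequenceMatcher.ratio() returns a similarity score in [0, 1]
        ratio = SequenceMatcher(None, text_a, text_b).ratio()
        return ratio >= self.threshold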