Example no. 1
from time import time
import random

# cursor, corpus and ids are assumed to be defined earlier in the script
for document in cursor:
    # collapse runs of whitespace into single spaces
    text = ' '.join(document["text"].split())
    corpus.append(text)
    ids.append(document["_id"])

# filter out repeated tweets
t0 = time()
i = 0
status = False
unique_tweets = ["Dummy Tweet"]  # sentinel entry so the first comparison has a partner

print("Filtering tweets may take a few minutes...")
for document in corpus:
    # compare the tweet against every unique tweet kept so far
    for tweet in unique_tweets:
        status = tweet_filter.check_duplicates(document, tweet)
        if status:  # duplicate found, stop comparing
            break
    if not status:  # no duplicate, keep this tweet
        unique_tweets.append(document)
    i += 1
    if i > 3000:  # cap the number of tweets inspected
        break

print("done in %0.3fs." % (time() - t0))
unique_tweets.pop(0)
corpus = unique_tweets
# create sample by bootstrap sampling
random_indices = random.sample(range(0, len(corpus)), q.num_of_docs)
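
Example no. 1 stops after computing random_indices. A minimal, hypothetical continuation (sampled_docs is an assumed name, not part of the original script) would turn those indices into the actual sample:

# Hypothetical continuation (assumption): materialize the sampled tweets
sampled_docs = [corpus[idx] for idx in random_indices]
print("sampled %d of %d unique tweets" % (len(sampled_docs), len(corpus)))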

Example no. 2
import json
import os

# Open the output file for the sampled tweets
# (months, month, day and the database cursor are assumed to be defined earlier)
directory = os.path.dirname(os.getcwd())
fn = "sample_" + str(months[month]) + "_" + str(day) + ".json"
f = open(os.path.join(directory, "data", fn), "w+")

# collect unique tweets from the cursor
corpus = [{"text": "dummy"}]  # placeholder entry so the first comparison has a partner
tweetFilter = Filter(45)
i = 0
print("Filtering Results...")
for document in cursor:
    document["_id"] = str(document["_id"])  # convert the id to a string so it can be serialized to JSON
    document["text"] = document["text"].replace('"', "'")
    for tweet in corpus:
        # If the tweet matches one already kept, stop comparing
        status = tweetFilter.check_duplicates(document["text"], tweet["text"])
        if status:
            break
    if not status:
        # No match found, so keep the whole document
        corpus.append(document)
        i += 1
    if i >= 100:  # stop once 100 unique tweets have been collected
        break
    print(i)


# Remove the dummy placeholder entry and write the sample to disk
corpus.pop(0)
json.dump(corpus, f, indent=1)
f.close()
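
Both examples depend on a Filter class (example no. 1 uses an instance named tweet_filter) whose check_duplicates method is not shown here. The code below is only a minimal stand-in, under the assumption that the constructor takes a similarity threshold in percent and that check_duplicates returns a truthy value when two texts are at least that similar; the real implementation may differ.

from difflib import SequenceMatcher

class Filter:
    """Minimal stand-in for the duplicate filter used above (assumption)."""

    def __init__(self, threshold):
        # the constructor argument is assumed to be a percentage, e.g. Filter(45)
        self.threshold = threshold / 100.0

    def check_duplicates(self, text_a, text_b):
        # return True when the two texts are at least `threshold` similar
        ratio = SequenceMatcher(None, text_a.lower(), text_b.lower()).ratio()
        return ratio >= self.threshold

With this stand-in, example no. 1's filter could be created as tweet_filter = Filter(45), mirroring the tweetFilter = Filter(45) call in example no. 2.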