コード例 #1
0
ファイル: createSample.py プロジェクト: kearnsw/Twitt.IR
from Request import Request

# Create request object to handle user input.
# Request is a project class; only its `date` attribute and its
# connect()/search() methods are used in this excerpt.
q = Request()
# Three-letter English month abbreviation -> month number (1-12).
# NOTE(review): `months`, `month` and `day` are not read anywhere in this
# excerpt; confirm they are used further down before removing them.
months = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5,
          "Jun": 6, "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10,
          "Nov": 11, "Dec": 12}
# Split the requested date on whitespace: the first token is taken as the
# month and the second as the day (presumably something like "Jan 12" --
# confirm against Request). Fewer than two tokens raises IndexError.
date = q.date.split()
month = date[0]
day = date[1]

# Connect to Mongo and search based on criteria.
q.connect()
# Query: language is "en", `created_at` matches q.date (used verbatim as a
# regular expression), and `text` does NOT match the pattern "RT".
# NOTE(review): the "RT" pattern is unanchored and case-sensitive, so any
# text containing the letters "RT" anywhere is excluded, not only tweets
# that start with "RT" -- confirm this is intended.
# NOTE(review): q.date is not escaped before being used as a regex.
criteria = {"lang": "en", "created_at": {'$regex': q.date},
            "text": {"$not": re.compile("RT")}}
# The second argument is presumably a projection limiting the returned
# fields to `text` -- verify against Request.search.
cursor = q.search(criteria, {"text": 1})

# Load each tweet's text together with its id.
# `corpus` and `ids` are parallel lists: corpus[k] is the normalised text of
# the document whose "_id" is ids[k].
corpus = []
ids = []
# Filter is defined outside this excerpt; the meaning of 25 is not visible
# here.
tweet_filter = Filter(25)
for document in cursor:
    # Encode to UTF-8, then split on whitespace and re-join with single
    # spaces: this collapses whitespace runs and trims both ends.
    # NOTE(review): under Python 3, encode() returns bytes and joining bytes
    # with a str separator raises TypeError; this line appears to assume
    # Python 2 string semantics -- confirm the target interpreter.
    text = ' '.join(document["text"].encode("utf-8").split())
    corpus.append(text)
    ids.append(document["_id"])

# Filter repeated tweets.
# The names below are only initialised here; they are presumably consumed by
# the de-duplication code that follows this excerpt.
t0 = time()  # value returned by time(); presumably a start timestamp
i = 0
status = -1
# Seeded with a placeholder entry rather than starting empty.
unique_tweets = ["Dummy Tweet"]
コード例 #2
0
ファイル: dump.py プロジェクト: kearnsw/Twitt.IR
        return json.JSONEncoder.default(self, o)

# Create request object to handle user input.
# Request is a project class; only its `date` attribute and its
# connect()/search() methods are used in this excerpt.
q = Request()
# Three-letter English month abbreviation -> month number (1-12); used below
# to build the output file name.
months = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5,
          "Jun": 6, "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10,
          "Nov": 11, "Dec": 12}
# Split the requested date on whitespace: the first token is taken as the
# month and the second as the day (presumably something like "Jan 12" --
# confirm against Request). Fewer than two tokens raises IndexError.
date = q.date.split()
month = date[0]
day = date[1]

# Connect to Mongo and search based on criteria.
q.connect()
# Query: language is "en", `created_at` matches q.date (used verbatim as a
# regular expression), and `text` does NOT match the pattern "RT".
# NOTE(review): the "RT" pattern is unanchored and case-sensitive, so any
# text containing the letters "RT" anywhere is excluded, not only tweets
# that start with "RT" -- confirm this is intended.
# NOTE(review): q.date is not escaped before being used as a regex.
criteria = {"lang": "en", "created_at": {'$regex': q.date},
            "text": {"$not": re.compile("RT")}}
# limit(1000) caps the result set at 1000 documents, assuming search()
# returns a pymongo-style cursor -- verify against Request.search.
cursor = q.search(criteria).limit(1000)

# Open file I/O streams.
# Output goes to "<parent of the current working directory>/data/", in a
# file named "sample_<month number>_<day>.json".
directory = os.path.dirname(os.getcwd())
# months[month] raises KeyError when `month` is not one of the abbreviations
# defined above.
fn = "sample_" + str(months[month]) + "_" + str(day) + ".json"
# "w+" opens for reading and writing and truncates an existing file. open()
# does not create missing directories, so the data/ directory must already
# exist.
# NOTE(review): no close() or `with` block is visible in this excerpt;
# confirm the handle is closed further down.
f = open(directory + "/data/" + fn, "w+")

# Load tweets with their ids.
# `corpus` is seeded with a placeholder document; the loop that follows
# iterates over `corpus` for every incoming document, so the seed presumably
# guarantees at least one comparison on the first pass -- confirm.
corpus = [{"text": "dummy"}]
# Filter is defined outside this excerpt; the meaning of 45 is not visible
# here.
tweetFilter = Filter(45)
i = 0
print("Filtering Results...")
for document in cursor:
    document["_id"] = str(document["_id"])
    document["text"] = document["text"].replace('"', "'")
    for tweet in corpus: