class NiceTestCase(unittest.TestCase):
    """Smoke test that prints cleansed document text from the test database."""

    def setUp(self):
        """Bind short assertion aliases and create the Source used by the tests."""
        self.t = self.assertTrue
        self.inst = self.assertIsInstance
        self.source = Source(
            host="localhost",
            port=27017,
            database='reddit_stream_test',
            collection='combined',
        )

    def tearDown(self):
        """No per-test cleanup is performed."""
        pass

    def test_no_bad_characters(self):
        """Print each document's ``cleansed_text`` reduced to printable characters.

        Iterates ``html.doc_iter`` over ``self.source.find().limit(1000)`` and
        prints, for every document, only those characters of
        ``doc["cleansed_text"]`` that occur in ``string.printable``.

        NOTE(review): this test makes no assertions; it can only fail if
        iterating or printing raises. Confirm whether a real check on the
        text was intended.
        """
        print("\n")
        for doc in html.doc_iter(self.source.find().limit(1000)):
            # Join lazily; there is no need to materialise a list first.
            printable_only = "".join(
                ch for ch in doc["cleansed_text"] if ch in string.printable
            )
            print(printable_only)
def setUp(self):
    """Prepare per-test fixtures: assertion shortcuts and the test Source."""
    # Short aliases for the two assertion helpers.
    self.t = self.assertTrue
    self.inst = self.assertIsInstance

    # Connection settings for the collection the tests read from.
    connection = dict(
        host="localhost",
        port=27017,
        database="reddit_stream_test",
        collection="combined",
    )
    self.source = Source(**connection)
from rdt.data.mongo.source import Source
from rdt.data.mongo.bulkinserter import BulkInserter

if __name__ == "__main__":
    # Script: copy documents from reddit_stream.combined into
    # reddit_stream_test.load through a BulkInserter.
    source = Source(
        host="localhost",
        port=27017,
        database="reddit_stream",
        collection="combined",
    )
    target = Source(
        host="localhost",
        port=27017,
        database="reddit_stream_test",
        collection="load",
    )

    with BulkInserter(source=target) as bulk:
        documents = source.find_clean(batch_size=1000, limit=2000)
        for doc in documents:
            # Strip the original identifier before inserting
            # (presumably so the target assigns its own _id -- TODO confirm).
            del doc["_id"]
            bulk.insert(doc)
import apple_label as label
from rdt.data.mongo.source import Source

if __name__ == "__main__":
    # Interactive labelling script: asks for a starting unix timestamp and
    # hands the sources to ``label.apple_finder``.
    is_good = Source(host="localhost", port=27017,
                     database='reddit_stream_test', collection='is_good')
    is_bad = Source(host="localhost", port=27017,
                    database='reddit_stream_test', collection='is_bad')

    print("")
    print("START!\n")
    try:
        utc = int(input("Enter a unix time stamp\nx > 0 to skip to time\nx < 1 to start at beginning\nx = "))
        if utc < 1:
            # NOTE(review): the messages say "dropping", but nothing is
            # dropped here; only the current counts are printed. Confirm
            # whether the collections were meant to be emptied.
            print("dropping " + repr(is_good))
            print("is_apple.find().count(): " + str(is_good.count()))
            print("dropping " + repr(is_bad))
            print("is_not_apple.find().count(): " + str(is_bad.count()))
        print("")
        print("Starting at utc " + str(utc))
        print("Hello, determine if they comments relate to Apple Inc!")
        print("---------------------")
        # FIXME(review): ``maybe_apple`` is not defined anywhere in this
        # script, so this call raises NameError until the candidate source
        # is created. Its collection cannot be determined from this file.
        label.apple_finder(utc, maybe_apple, is_good, is_bad)
    except ValueError:
        # Previously a bare string expression, which printed nothing.
        print("Give me an int")
    except Exception as exc:
        # Report the failure instead of discarding it silently, and let
        # KeyboardInterrupt / SystemExit propagate.
        print("you pooped up: " + repr(exc))
def subreddit(subreddit=None, batch_size=100):
    """Query the combined collection for documents of one subreddit.

    Args:
        subreddit: Value matched against the ``"subreddit"`` field. When it
            is ``None`` no query is made.
        batch_size: Forwarded unchanged to ``Source.find_clean``.

    Returns:
        ``None`` when ``subreddit`` is ``None``; otherwise whatever
        ``Source.find_clean`` returns for the query.
    """
    if subreddit is not None:
        combined = Source(
            host="localhost",
            port=27017,
            database="reddit_stream",
            collection="combined",
        )
        query = {"subreddit": subreddit}
        return combined.find_clean(query, batch_size=batch_size)
    return None