class Consumer(multiprocessing.Process):
    def __init__(self, task_queue, host, port, db, collection):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        # Note: the MongoClient is created in the parent process, before the
        # child is started.
        self.coll = MongoClient(host, port)[db][collection]
        self.mongo_bulk = self.coll.initialize_unordered_bulk_op()

    def run(self):
        count = 0

        def execute_mongo_bulk():
            # Flush the pending batch and start a fresh bulk operation.
            try:
                logger.info("executing mongo bulk")
                self.mongo_bulk.execute()
                self.mongo_bulk = self.coll.initialize_unordered_bulk_op()
            except Exception as e:
                logger.info(e)

        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                # A None task is the poison pill telling this consumer to exit.
                self.task_queue.task_done()
                break
            next_task(self.mongo_bulk)
            count += 1
            # Flush every 1000 tasks instead of once per document.
            if count % 1000 == 0:
                execute_mongo_bulk()
            self.task_queue.task_done()
        # Flush whatever is left in the final, partially filled batch.
        execute_mongo_bulk()
        return
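The snippet above does not show how the consumers are driven. A minimal sketch of the wiring, assuming tasks are small picklable callables that receive the bulk op; the InsertTask class, the source_documents() generator, and the 'yelp'/'tags' database names are illustrative assumptions, not part of the original code:

import multiprocessing

class InsertTask(object):
    # Hypothetical task: queues one document; the consumer decides when to flush.
    def __init__(self, doc):
        self.doc = doc

    def __call__(self, mongo_bulk):
        mongo_bulk.insert(self.doc)

if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    num_consumers = 4
    consumers = [Consumer(tasks, 'localhost', 27017, 'yelp', 'tags')
                 for _ in range(num_consumers)]
    for c in consumers:
        c.start()

    for doc in source_documents():  # hypothetical generator of documents
        tasks.put(InsertTask(doc))

    # One poison pill (None) per consumer lets every run() loop exit cleanly.
    for _ in range(num_consumers):
        tasks.put(None)
    tasks.join()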
def worker(identifier, skip, count):
    stopwords = load_stopwords()
    reviews_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE][
        Settings.REVIEWS_COLLECTION]
    tags_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TAGS_DATABASE][
        Settings.REVIEWS_COLLECTION]

    # Tag creation and storage is an insert-heavy operation, so documents are added in bulk.
    bulk = tags_collection.initialize_unordered_bulk_op()
    bulk_counter = 0
    counter = 0
    batch_size = 1000
    for batch in range(0, count, batch_size):
        # Each worker walks its own slice of the reviews collection.
        reviews_cursor = reviews_collection.find().skip(skip + batch).limit(batch_size)
        for review in reviews_cursor:
            words = []
            sentences = nltk.sent_tokenize(review["text"].lower())
            for sentence in sentences:
                tokens = nltk.word_tokenize(sentence)
                text = [word for word in tokens if word not in stopwords]
                tagged_text = nltk.pos_tag(text)
                for word, tag in tagged_text:
                    words.append({"word": word, "pos": tag})
            bulk.insert({
                "rating": review["rating"],
                "userId": review["userId"],
                "reviewId": review["reviewId"],
                "business": review["business"],
                "text": review["text"],
                "words": words
            })
            bulk_counter += 1
            counter += 1
            # Flush every 10000 inserts and start a fresh bulk operation.
            if bulk_counter % 10000 == 0:
                bulk.execute()
                print str(counter) + ' entries inserted by process ' + str(multiprocessing.current_process())
                bulk_counter = 0
                bulk = tags_collection.initialize_unordered_bulk_op()
    # Flush the final, partially filled batch.
    if bulk_counter > 0:
        bulk.execute()
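worker() calls a load_stopwords() helper that is not defined in this snippet. A minimal sketch of what it presumably does, assuming the same one-word-per-line stopwords.txt file that the single-process script later in this section reads:

def load_stopwords():
    # Read one stop word per line into a dict so membership tests are O(1).
    stopwords = {}
    with open('stopwords.txt', 'rU') as f:
        for line in f:
            stopwords[line.strip()] = 1
    return stopwords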
from pymongo import MongoClient
from nltk.stem.wordnet import WordNetLemmatizer

from settings import Settings

tags_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TAGS_DATABASE][Settings.REVIEWS_COLLECTION]
corpus_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TAGS_DATABASE][Settings.CORPUS_COLLECTION]

reviews_cursor = tags_collection.find()
reviews_count = reviews_cursor.count()
reviews_cursor.batch_size(5000)

lem = WordNetLemmatizer()

# Build the corpus with bulk insertion.
bulk = corpus_collection.initialize_unordered_bulk_op()
bulk_counter = 0
counter = 0
for review in reviews_cursor:
    # Keep only singular and plural nouns (NN/NNS) and reduce each to its lemma.
    nouns = []
    words = [word for word in review["words"] if word["pos"] in ["NN", "NNS"]]
    for word in words:
        nouns.append(lem.lemmatize(word["word"]))

    bulk.insert({
        "userId": review["userId"],
        "rating": review["rating"],
        "reviewId": review["reviewId"],
        "business": review["business"],
        "text": review["text"],
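For reference, WordNetLemmatizer collapses plural nouns onto their singular lemma (its default part of speech is noun), so the corpus stores one canonical token per noun. A quick interactive check:

>>> from nltk.stem.wordnet import WordNetLemmatizer
>>> lem = WordNetLemmatizer()
>>> lem.lemmatize("restaurants")
'restaurant'
>>> lem.lemmatize("dishes")
'dish'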
import json
import sys

sys.path.insert(0, '/home/ish/DataScience/yelp_topic')
from pymongo import MongoClient
from settings import Settings

dataset_file = Settings.BUSINESS_DATASET_FILE
reviews_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE][
    Settings.BUSINESS_COLLECTION]

# Bulk insertion is used to make the load faster.
bulk = reviews_collection.initialize_unordered_bulk_op()
bulk_counter = 0
counter = 0

# Read the dataset line by line and insert each JSON document into MongoDB.
with open(dataset_file) as dataset:
    next(dataset)
    for line in dataset:
        try:
            data = json.loads(line)
        except ValueError:
            print 'Skipping malformed JSON line'
            continue
        if data["type"] == "business":
            bulk.insert({
                "business_id": data["business_id"],
                "name": data["name"],
                "neighborhoods": data["neighborhoods"],
                "full_address": data["full_address"],
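A side note on the driver API: initialize_unordered_bulk_op() belongs to PyMongo's old Bulk API, which was deprecated in PyMongo 3.5 and removed in 4.0. On a current driver the same unordered batching pattern is written with bulk_write. A minimal sketch, where documents stands in for the parsed JSON lines above:

from pymongo import InsertOne

requests = []
for doc in documents:  # 'documents' is a placeholder for the parsed JSON lines
    requests.append(InsertOne(doc))
    if len(requests) == 10000:
        # ordered=False matches the unordered bulk op used above.
        reviews_collection.bulk_write(requests, ordered=False)
        requests = []
if requests:
    reviews_collection.bulk_write(requests, ordered=False)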
reviews_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE][
    Settings.REVIEWS_COLLECTION]
tags_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TAGS_DATABASE][Settings.REVIEWS_COLLECTION]

reviews_cursor = reviews_collection.find()
reviews_count = reviews_cursor.count()
reviews_cursor.batch_size(1000)

# Load the stop-word list into a dict for O(1) membership tests.
stopwords = {}
with open('stopwords.txt', 'rU') as f:
    for line in f:
        stopwords[line.strip()] = 1

bulk = tags_collection.initialize_unordered_bulk_op()
bulk_counter = 0
for review in reviews_cursor:
    words = []
    sentences = nltk.sent_tokenize(review["text"].lower())
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        text = [word for word in tokens if word not in stopwords]
        tagged_text = nltk.pos_tag(text)
        for word, tag in tagged_text:
            words.append({"word": word, "pos": tag})
    bulk.insert({
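For reference, nltk.pos_tag returns (token, Penn Treebank tag) pairs; the NN/NNS codes filtered in the corpus-building step come from this tag set. Roughly:

>>> import nltk
>>> nltk.pos_tag(nltk.word_tokenize("the pizza was great"))
[('the', 'DT'), ('pizza', 'NN'), ('was', 'VBD'), ('great', 'JJ')]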