Example #1
import logging
import multiprocessing

from pymongo import MongoClient

logger = logging.getLogger(__name__)


class Consumer(multiprocessing.Process):
    def __init__(self, task_queue, host, port, db, collection):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        # One connection per consumer; the legacy bulk op accumulates inserts.
        self.coll = MongoClient(host, port)[db][collection]
        self.mongo_bulk = self.coll.initialize_unordered_bulk_op()

    def run(self):
        count = 0

        def execute_mongo_bulk():
            try:
                self.mongo_bulk.execute()
                self.mongo_bulk = self.coll.initialize_unordered_bulk_op()
                logger.info("executing mongo bulk")
            except Exception as e:
                logger.info(e)

        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                # None is the poison pill: acknowledge it and exit the loop.
                self.task_queue.task_done()
                break

            next_task(self.mongo_bulk)
            count += 1
            if count % 1000 == 0:
                execute_mongo_bulk()
            self.task_queue.task_done()

        execute_mongo_bulk()
        return
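A minimal usage sketch for the consumer above. The task queue must be a multiprocessing.JoinableQueue (the consumer calls task_done()), and each task a picklable callable that takes the bulk op; the host/port, database names, and InsertTask helper here are placeholders, not part of the original code.

# Usage sketch: hypothetical InsertTask payloads, placeholder connection details.
import multiprocessing


class InsertTask(object):
    # Tasks must be picklable to cross the process queue, hence a class, not a closure.
    def __init__(self, doc):
        self.doc = doc

    def __call__(self, mongo_bulk):
        mongo_bulk.insert(self.doc)  # queued locally; Consumer executes in batches of 1000


if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    consumers = [Consumer(tasks, 'localhost', 27017, 'reviews_db', 'reviews')
                 for _ in range(4)]
    for c in consumers:
        c.start()

    for i in range(10000):
        tasks.put(InsertTask({'n': i}))

    for _ in consumers:
        tasks.put(None)  # one poison pill per consumer
    tasks.join()         # blocks until every task_done() has been called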
Example #2

import multiprocessing

import nltk
from pymongo import MongoClient

from settings import Settings


def worker(identifier, skip, count):
    # load_stopwords() is a project helper defined elsewhere in the repo.
    stopwords = load_stopwords()
    reviews_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE][
        Settings.REVIEWS_COLLECTION]
    tags_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TAGS_DATABASE][
        Settings.REVIEWS_COLLECTION]

    # Tag creation and storage is an insert-heavy operation, so data is added in bulk.
    print(tags_collection)
    bulk = tags_collection.initialize_unordered_bulk_op()
    bulkCounter = 0
    counter = 0
    batch_size = 1000
    for batch in range(0, count, batch_size):
        reviews_cursor = reviews_collection.find().skip(skip + batch).limit(batch_size)
        for review in reviews_cursor:
            words = []
            sentences = nltk.sent_tokenize(review["text"].lower())
            for sentence in sentences:
                tokens = nltk.word_tokenize(sentence)
                text = [word for word in tokens if word not in stopwords]
                tagged_text = nltk.pos_tag(text)
                
                for word, tag in tagged_text:
                    words.append({"word": word, "pos": tag})
                    
            bulk.insert({
                "rating": review["rating"],
                "userId": review["userId"],
                "reviewId": review["reviewId"],
                "business": review["business"],
                "text": review["text"],
                "words": words
            })
            bulkCounter += 1
            counter += 1
            if bulkCounter % 10000 == 0:
                bulk.execute()
                print(str(counter) + ' entries inserted in ' + str(multiprocessing.current_process()))
                bulkCounter = 0
                bulk = tags_collection.initialize_unordered_bulk_op()

    # Flush the final partial batch.
    if bulkCounter > 0:
        bulk.execute()
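A note on dependencies: sent_tokenize, word_tokenize, and pos_tag rely on NLTK data packages that are installed separately from the library itself. A one-time setup sketch, with package names as in current NLTK releases ('wordnet' is needed by the lemmatizer in Example #3):

# One-time download of the NLTK resources the tokenizing/tagging code relies on.
import nltk

nltk.download('punkt')                       # sentence/word tokenizer models
nltk.download('averaged_perceptron_tagger')  # default tagger behind nltk.pos_tag
nltk.download('wordnet')                     # used by WordNetLemmatizer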
Example #3
from pymongo import MongoClient
from nltk.stem.wordnet import WordNetLemmatizer
from settings import Settings

tags_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TAGS_DATABASE][Settings.REVIEWS_COLLECTION]
corpus_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TAGS_DATABASE][Settings.CORPUS_COLLECTION]

reviews_cursor = tags_collection.find()
reviewsCount = reviews_cursor.count()
reviews_cursor.batch_size(5000)

lem = WordNetLemmatizer()

# Bulk insertion via the legacy bulk API.
bulk = corpus_collection.initialize_unordered_bulk_op()
bulkCounter = 0
counter = 0

for review in reviews_cursor:
    nouns = []
    words = [word for word in review["words"] if word["pos"] in ["NN", "NNS"]]

    for word in words:
        nouns.append(lem.lemmatize(word["word"]))

    bulk.insert({
        "userId": review["userId"],
        "rating": review["rating"],
        "reviewId": review["reviewId"],
        "business": review["business"],
        "text": review["text"],
        "words": nouns  # completion: mirrors the insert/flush pattern of the other examples
    })
    bulkCounter += 1
    counter += 1
    if bulkCounter % 1000 == 0:
        bulk.execute()
        bulk = corpus_collection.initialize_unordered_bulk_op()
        bulkCounter = 0

if bulkCounter > 0:
    bulk.execute()
Example #4
import json
import sys
sys.path.insert(0, '/home/ish/DataScience/yelp_topic')


from pymongo import MongoClient
from settings import Settings


dataset_file = Settings.BUSINESS_DATASET_FILE
reviews_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE][
    Settings.BUSINESS_COLLECTION]
# Bulk insertion is used to make insertion faster.
bulk = reviews_collection.initialize_unordered_bulk_op()
bulkCounter = 0
counter = 0

# Read each line and insert the data into MongoDB as JSON.
with open(dataset_file) as dataset:
    next(dataset)
    for line in dataset:
        try:
            data = json.loads(line)
        except ValueError:
            print('Skipping malformed JSON line')
            continue
        if data["type"] == "business":
            bulk.insert({
                "business_id": data["business_id"],
                "name": data["name"],
                "neighborhoods": data["neighborhoods"],
                "full_address": data["full_address"],
                # remaining dataset fields are truncated in the original listing
            })
            bulkCounter += 1
            counter += 1
            if bulkCounter % 1000 == 0:
                bulk.execute()
                bulk = reviews_collection.initialize_unordered_bulk_op()
                bulkCounter = 0

if bulkCounter > 0:
    bulk.execute()
Example #5

import nltk
from pymongo import MongoClient

from settings import Settings

reviews_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE][
    Settings.REVIEWS_COLLECTION]
tags_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TAGS_DATABASE][Settings.REVIEWS_COLLECTION]

reviews_cursor = reviews_collection.find()
reviewsCount = reviews_cursor.count()
reviews_cursor.batch_size(1000)

stopwords = {}
with open('stopwords.txt', 'r') as f:
    for line in f:
        stopwords[line.strip()] = 1

bulk = tags_collection.initialize_unordered_bulk_op()
bulkCounter = 0

for review in reviews_cursor:
    words = []
    sentences = nltk.sent_tokenize(review["text"].lower())

    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        text = [word for word in tokens if word not in stopwords]
        tagged_text = nltk.pos_tag(text)

        for word, tag in tagged_text:
            words.append({"word": word, "pos": tag})

    bulk.insert({
        "rating": review["rating"],
        "userId": review["userId"],
        "reviewId": review["reviewId"],
        "business": review["business"],
        "text": review["text"],
        "words": words  # completion: same document shape as Example #2
    })
    bulkCounter += 1
    if bulkCounter % 1000 == 0:
        bulk.execute()
        bulk = tags_collection.initialize_unordered_bulk_op()
        bulkCounter = 0

if bulkCounter > 0:
    bulk.execute()
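The stopwords.txt file is project-specific; NLTK bundles an English stopword list that could stand in for it. A sketch, assuming the 'stopwords' corpus has been downloaded:

# Alternative to the file-based list: NLTK's bundled English stopwords
# (one-time setup: nltk.download('stopwords')).
from nltk.corpus import stopwords as nltk_stopwords

stopwords = set(nltk_stopwords.words('english'))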