Ejemplo n.º 1
0
def sample_collection(**kwargs):
    """Copy a reproducible random sample of documents into another collection.

    Counts the source documents matching a hard-coded filter, draws ``size``
    distinct positions from that range using a PRNG seeded with ``seed``, and
    then walks one cursor over the same filter, inserting the document found
    at each drawn position into the destination collection.

    Keyword Args:
        host: first argument handed to ``Repository``.
        database: second argument handed to ``Repository``.
        source: name of the collection to sample from.
        destination: name of the collection that receives the sample.
        seed: value passed to ``random.seed``.
        size: sample size; converted with ``parse_number``.

    Raises:
        KeyError: if one of the keyword arguments above is missing.
        ValueError: raised by ``random.sample`` when ``size`` exceeds the
            number of matching documents (or is negative).

    Failure modes listed by the original author that are not handled here:
        fewer records than sample size
        sample size not an integer
        source does not exist
        destination already exists
        database connection exceptions
    """
    db = Repository(kwargs['host'], kwargs['database'])
    source = db.get_collection(kwargs['source'])
    destination = db.get_collection(kwargs['destination'])
    # TODO pass filter in from json file
    doc_filter = {
        'attributes': {
            '$gt': {}
        },
        'attributes.course': {
            '$nin': ['Desserts', 'Cocktails', 'Beverages']
        },
        'rating': {
            '$gte': 4,
            '$lte': 5
        }
    }

    # The unfiltered branches stay so an empty filter keeps working once the
    # filter is supplied from outside (see TODO above).
    record_count = source.count(doc_filter) if doc_filter else source.count()
    seed = kwargs['seed']
    sample_size = parse_number(kwargs['size'])

    random.seed(seed)
    # Sorted positions let a single forward pass over the cursor reach all
    # of them without rewinding.
    to_sample = sorted(random.sample(range(record_count), sample_size))

    progress = ProgressBar(sample_size)
    progress.start()

    cursor = source.find(doc_filter) if doc_filter else source.find()
    try:
        sample_count = 0
        position = 0
        for index in to_sample:
            # Advance until `record` is the document at position `index`.
            # NOTE(review): assumes the cursor yields at least `record_count`
            # documents, i.e. the collection did not shrink since the count.
            while position <= index:
                record = cursor.next()
                position += 1
            # TODO batch insert?
            destination.insert_one(record)
            sample_count += 1
            progress.update(sample_count)
    finally:
        # Release the cursor even when an insert or the iteration fails.
        cursor.close()

    progress.end()
Ejemplo n.º 2
0
class IndexService:
    """Creates the indexes used on the recipes and combinations collections."""

    def __init__(self, **kwargs):
        """Open the repository and resolve both collections.

        Keyword Args:
            host: first argument handed to ``Repository``.
            database: second argument handed to ``Repository``.
            recipes: name of the recipes collection.
            combinations: name of the combinations collection.
        """
        host, database = kwargs.get('host'), kwargs.get('database')
        self.host = host
        self.database = database

        self.db = Repository(host, database)
        self.recipes = self.db.get_collection(kwargs.get('recipes'))
        self.combinations = self.db.get_collection(kwargs.get('combinations'))

    def index(self):
        """Index recipes on ``ingredients`` and combinations on ``r``."""
        targets = (
            (self.recipes, 'ingredients'),
            (self.combinations, 'r'),
        )
        for collection, key in targets:
            collection.create_index(key)
class YummlyIngredientsService:
    """Downloads the Yummly ingredient metadata and stores it in a collection."""

    URL = "http://api.yummly.com/v1/api/metadata/ingredient"
    # NOTE(review): API credentials are committed in source. They should be
    # loaded from configuration/environment and rotated; they are left here
    # unchanged so code that reads these attributes keeps working.
    APP_ID = "2a6406ac"
    APP_KEY = "c0aac363d1a1c8b1925e5f8898c69a48"
    # Seconds to wait for the API; without a timeout a stalled connection
    # would block the process indefinitely.
    TIMEOUT = 30

    def __init__(self, **kwargs):
        """Open the repository and resolve the target collection.

        Keyword Args:
            host: first argument handed to ``Repository``.
            database: second argument handed to ``Repository``.
            collection: name of the collection that receives the ingredients.
            skip: stored on the instance; not read by any method of this class.
        """
        self.host = kwargs.get('host')
        self.database = kwargs.get('database')

        self.db = Repository(self.host, self.database)
        self.collection = self.db.get_collection(kwargs.get('collection'))

        self.skip = kwargs.get("skip")

    def get_ingredients(self):
        """Fetch the ingredient list from the API and insert it in bulk.

        Raises:
            requests.HTTPError: if the API answers with an error status.
            requests.Timeout: if the API does not answer within ``TIMEOUT``.
            ValueError: if the trimmed response body is not valid JSON.
        """
        headers = {
            'X-Yummly-App-ID': YummlyIngredientsService.APP_ID,
            'X-Yummly-App-Key': YummlyIngredientsService.APP_KEY
        }

        r = requests.get(
            YummlyIngredientsService.URL,
            headers=headers,
            timeout=YummlyIngredientsService.TIMEOUT,
        )
        # Fail on 4xx/5xx here instead of feeding an error page to the
        # JSONP trimming and JSON parsing below.
        r.raise_for_status()

        # Trim the JSONP callback: keep what follows the first comma and drop
        # the last two characters (presumably the closing ");" of a
        # callback('ingredient', [...]); wrapper -- confirm against the API).
        response_body = r.text.partition(',')[2][:-2]
        response_json = json.loads(response_body)

        self.collection.insert_many(response_json)
class PopulateGraphService:
    """Rebuilds the recipe/ingredient graph from the recipes collection."""

    def __init__(self, **kwargs):
        """Open the repository and resolve the recipes collection.

        Keyword Args:
            host: first argument handed to ``Repository``.
            neoHost: stored on the instance; not read by this class.
            database: second argument handed to ``Repository``.
            resume: stored on the instance; not read by this class.
            recipes: name of the recipes collection.
        """
        self.host = kwargs.get('host')
        self.neoHost = kwargs.get('neoHost')
        self.database = kwargs.get('database')
        self.resume = kwargs.get('resume')

        self.db = Repository(self.host, self.database)
        self.recipes = self.db.get_collection(kwargs.get('recipes'))

    def populate(self):
        """Add one ``Recipe`` per record and attach its ``Ingredient`` list.

        ``reset_graph()`` is called first (defined elsewhere; presumably it
        clears the existing graph -- confirm before running against live
        data). Progress is reported every 100 records and once at the end.

        Raises:
            KeyError: if a record lacks ``id`` or ``ingredients``.
        """
        #TODO set uniqueness constraints if not exists
        reset_graph()

        records_count = self.recipes.count()
        output.push('Populating graph...')
        progress = ProgressBar(records_count)
        processed = 0
        progress.start()

        cursor = self.recipes.find()
        try:
            for record in cursor:
                recipe = Recipe(id=record['id'])
                ingredients = [
                    Ingredient(name=ingredient_name)
                    for ingredient_name in record['ingredients']
                ]

                recipe.add()
                recipe.require_ingredients(ingredients)

                processed += 1
                if processed % 100 == 0:
                    progress.update(processed)
        finally:
            # Release the cursor even if a record fails part-way through.
            cursor.close()

        progress.update(processed)
        progress.end()
Ejemplo n.º 5
0
class LinkService:
    """Links each combination to the combinations one ingredient smaller."""

    def __init__(self, **kwargs):
        """Open the repository and resolve the combinations collection.

        Keyword Args:
            host: first argument handed to ``Repository``.
            database: second argument handed to ``Repository``.
            resume: stored on the instance; not read by this class.
            r_min: smallest combination size to process (required, int-like).
            r_max: largest combination size to process (required, int-like).
            combinations: name of the combinations collection.

        Raises:
            KeyError: if ``r_min`` or ``r_max`` is missing.
            ValueError/TypeError: if they cannot be converted with ``int``.
        """
        self.host = kwargs.get('host')
        self.database = kwargs.get('database')
        self.resume = kwargs.get('resume')
        self.r_min = int(kwargs["r_min"])
        self.r_max = int(kwargs["r_max"])

        self.db = Repository(self.host, self.database)
        self.combinations = self.db.get_collection(kwargs.get('combinations'))

    def link(self):
        """Add every combination as a pairing of its one-smaller subsets.

        For each combination whose ``r`` lies in ``[r_min, r_max]`` and for
        each of its ingredients, the remaining ingredients are sorted and
        joined with ``::`` to form the ``_id`` of the target document; a
        ``{name, score, ref_id}`` entry is added to that document's
        ``pairings`` set. Updates are sent in unordered bulks, flushed every
        ``BULK_LIMIT`` combinations.

        Raises:
            KeyError: if a combination lacks ``ingredients`` or ``score``.
        """

        # TODO take r from options if present

        # TODO get max/min r in combinations
        # r_max = int(self.combinations.find({}).sort([("r", -1)]).limit(1).next()['r'])
        # r_min = int(self.combinations.find({}).sort([("r", 1)]).limit(1).next()['r'])

        if self.r_min == 1:
            # A one-ingredient combination leaves an empty "givens" list, so
            # there is nothing to link it to; start at size 2 instead.
            self.r_min += 1
            if self.r_max < self.r_min:
                output.push("No combinations linkable...")
                return
        record_filter = {"r": {"$gte": self.r_min, "$lte": self.r_max}}

        records_count = self.combinations.count(record_filter)

        progress = ProgressBar(records_count)
        processed = 0

        BULK_LIMIT = 100
        bulk = self.combinations.initialize_unordered_bulk_op()
        # Number of updates queued on the current bulk. Executing a bulk
        # with nothing queued raises InvalidOperation in pymongo, so an
        # empty bulk is never executed.
        pending_ops = 0

        output.push("Linking combinations...")
        progress.start()

        cursor = self.combinations.find(record_filter, no_cursor_timeout=True)
        try:
            for combo in cursor:
                ingredients = list(combo['ingredients'])
                combo_id = combo['_id']
                score = combo['score']
                for i, candidate in enumerate(ingredients):
                    # Everything except the candidate, in canonical order.
                    givens = sorted(ingredients[:i] + ingredients[i + 1:])
                    givens_combo_id = "::".join(givens)
                    bulk.find({
                        "_id": givens_combo_id
                    }).update({
                        "$addToSet": {
                            "pairings": {
                                "name": candidate,
                                "score": score,
                                "ref_id": combo_id
                            }
                        }
                    })
                    pending_ops += 1

                processed += 1
                if processed % BULK_LIMIT == 0:
                    progress.update(processed)
                    # TODO handle bulk execute errors
                    if pending_ops:
                        bulk.execute()
                        bulk = self.combinations.initialize_unordered_bulk_op()
                        pending_ops = 0

            progress.update(processed)
            if pending_ops:
                bulk.execute()
        finally:
            # The cursor was opened with no_cursor_timeout, so it must be
            # closed explicitly, including on failure.
            cursor.close()
        progress.end()
Ejemplo n.º 6
0
class SortService:
    """Sorts the ``pairings`` array of each combination by descending score."""

    def __init__(self, **kwargs):
        """Open the repository and resolve the combinations collection.

        Keyword Args:
            host: first argument handed to ``Repository``.
            database: second argument handed to ``Repository``.
            r_min: smallest combination size to process (required, int-like).
            r_max: largest combination size to process (required, int-like).
            combinations: name of the combinations collection.

        Raises:
            KeyError: if ``r_min`` or ``r_max`` is missing.
            ValueError/TypeError: if they cannot be converted with ``int``.
        """
        self.host = kwargs.get('host')
        self.database = kwargs.get('database')
        self.r_min = int(kwargs["r_min"])
        self.r_max = int(kwargs["r_max"])

        self.db = Repository(self.host, self.database)
        self.combinations = self.db.get_collection(kwargs.get('combinations'))

    def sort_pairings(self):
        """Re-order ``pairings`` in place for every matching combination.

        Selects combinations whose ``r`` lies in ``[r_min, r_max]`` and whose
        ``pairings`` compares greater than an empty array (presumably: is
        non-empty -- confirm), and issues one update per document. Updates
        are sent in unordered bulks of ``BULK_LIMIT``.
        """

        combo_filter = {
            "pairings": {
                "$gt": []
            },
            "r": {
                "$gte": self.r_min,
                "$lte": self.r_max
            }
        }

        records_count = self.combinations.count(combo_filter)

        progress = ProgressBar(records_count)
        processed = 0

        BULK_LIMIT = 1000
        bulk = self.combinations.initialize_unordered_bulk_op()
        # Number of updates queued on the current bulk. Executing a bulk
        # with nothing queued raises InvalidOperation in pymongo, so the
        # final flush is skipped when nothing is pending.
        pending_ops = 0

        output.push("Sorting pairings...")
        progress.start()

        cursor = self.combinations.find(combo_filter)
        try:
            for combo in cursor:
                # Pushing no elements ($each: []) with a $sort modifier
                # re-sorts the existing array without adding anything.
                bulk.find({
                    "_id": combo['_id']
                }).update({
                    "$push": {
                        "pairings": {
                            "$each": [],
                            "$sort": {
                                "score": -1
                            }
                        }
                    }
                })
                pending_ops += 1

                processed += 1
                if processed % BULK_LIMIT == 0:
                    progress.update(processed)
                    # TODO handle bulk execute errors
                    bulk.execute()
                    bulk = self.combinations.initialize_unordered_bulk_op()
                    pending_ops = 0

            progress.update(processed)
            if pending_ops:
                bulk.execute()
        finally:
            # Release the cursor even when a bulk execute fails.
            cursor.close()
        progress.end()
Ejemplo n.º 7
0
class AndCountService:
    """Counts, per ingredient combination, the recipes containing all of it."""

    def __init__(self, **kwargs):
        """Open the repository and resolve both collections.

        Keyword Args:
            host: first argument handed to ``Repository``.
            database: second argument handed to ``Repository``.
            recipes: name of the recipes collection.
            combinations: name of the combinations collection.
            skip: number of recipes to skip before counting (optional).
            r_min: smallest combination size to generate (int-like).
            r_max: largest combination size to generate; ``None`` or ``0``
                means "up to the number of ingredients in the recipe".
        """
        self.db = Repository(kwargs.get('host'), kwargs.get('database'))
        self.recipes = self.db.get_collection(kwargs.get('recipes'))
        self.combinations = self.db.get_collection(kwargs.get('combinations'))
        self.skip = kwargs.get('skip')
        self.r_min = kwargs.get('r_min')
        self.r_max = kwargs.get('r_max')

    def count_and(self):
        """Upsert one document per combination and increment its ``and_count``.

        For every recipe, its ingredients are sorted and every combination of
        size ``r_min`` .. ``r_max`` (capped at the recipe's ingredient count)
        is upserted under the ``_id`` formed by joining the ingredients with
        ``::``. One unordered bulk is executed per recipe.

        Raises:
            TypeError/ValueError: if ``r_min`` is not int-like, or ``r_max``
                is neither ``None`` nor int-like. Raised before any recipe is
                read.

        Failure modes listed by the original author that are not handled here:
            not a recipe data store
            source does not exist
            destination already exists
            r_max/r_min is not an integer
        """

        # TODO register exit handler to print recipes processed on unexpected exit
        # TODO https://docs.python.org/3/library/atexit.html

        # Convert the bounds once, up front: they do not change per recipe,
        # and a bad value should fail before a server cursor is opened.
        r_min = int(self.r_min)
        r_max_limit = None if self.r_max is None else int(self.r_max)

        recipe_count = self.recipes.count()
        cursor = self.recipes.find(no_cursor_timeout=True)
        try:
            # TODO timeout=False is bad practice
            if self.skip:
                cursor.skip(self.skip)
                processed = self.skip
            else:
                processed = 0

            output.push("Counting ands...")
            progress = ProgressBar(recipe_count)
            progress.start()

            for recipe in cursor:
                # TODO try collecting counts into a dictionary and then updating less frequently
                # TODO Also play with batch size
                ingredients = recipe['ingredients']
                ingredients.sort()

                # A missing or zero limit, or one at/above the recipe's size,
                # means "use every size up to the number of ingredients".
                if r_max_limit and len(ingredients) > r_max_limit:
                    r_max = r_max_limit
                else:
                    r_max = len(ingredients)

                if r_min <= r_max:
                    bulk = self.combinations.initialize_unordered_bulk_op()
                    # for each possible length of combinations between r_min and r_max
                    for r in range(r_min, r_max + 1):
                        # for each combination of that length
                        for c in itertools.combinations(ingredients, r):
                            # `ingredients` is sorted and combinations()
                            # preserves input order, so each combination is
                            # already alphabetically ordered.
                            c = list(c)
                            combo_id = '::'.join(c)
                            bulk.find({"_id": combo_id}).upsert().update({
                                "$set": {
                                    "_id": combo_id,
                                    "r": r,
                                    "ingredients": c
                                },
                                "$inc": {
                                    "and_count": 1
                                }
                            })
                    # TODO handle writeErrors
                    bulk.execute()

                processed += 1
                progress.update(processed)
        finally:
            # The cursor was opened with no_cursor_timeout, so it must be
            # closed explicitly, including on failure.
            cursor.close()
        progress.end()
Ejemplo n.º 8
0
class OrCountService:
    """Computes ``or_count`` and ``score`` for stored ingredient combinations."""

    def __init__(self, **kwargs):
        """Open the repository and resolve the combinations collection.

        Keyword Args:
            host: first argument handed to ``Repository``.
            database: second argument handed to ``Repository``.
            combinations: name of the combinations collection.
            r_min: smallest combination size to process (required, int-like).
            r_max: largest combination size to process (required, int-like).
            skip: number of matching combinations to skip (optional).

        Raises:
            KeyError: if ``r_min`` or ``r_max`` is missing.
            ValueError/TypeError: if they cannot be converted with ``int``.
        """
        self.host = kwargs.get('host')
        self.database = kwargs.get('database')

        self.db = Repository(self.host, self.database)
        self.combinations = self.db.get_collection(kwargs.get('combinations'))

        self.r_min = int(kwargs["r_min"])
        self.r_max = int(kwargs["r_max"])
        self.skip = kwargs.get("skip")

    def count_or(self):
        """Set ``or_count`` and ``score`` on combinations that lack ``or_count``.

        For every combination whose ``r`` lies in ``[r_min, r_max]`` and that
        has no ``or_count`` yet, ``or_count`` is computed by
        inclusion-exclusion over the ``and_count`` of each non-empty subset
        of its ingredients, and ``score`` is ``and_count / or_count``.
        Updates are sent in unordered bulks of ``BULK_LIMIT``.

        Raises:
            ZeroDivisionError: if a computed ``or_count`` is 0.
            TypeError: presumably, if a subset has no stored document (the
                lookup result is subscripted without a None check).

        Failure modes listed by the original author that are not handled here:
            not a collections data store
            collection does not exist
            r_max/r_min is not an integer
            r_min/r_max are currently required (should be optional)
        """

        # TODO register exit handler to print recipes processed on unexpected exit
        # TODO https://docs.python.org/3/library/atexit.html

        combo_filter = {
            "r": {
                "$gte": self.r_min,
                "$lte": self.r_max
            },
            "or_count": {
                "$exists": False
            }
        }

        combination_count = self.combinations.count(combo_filter)
        cursor = self.combinations.find(combo_filter, no_cursor_timeout=True)
        try:
            # TODO timeout=False is bad practice
            if self.skip:
                cursor.skip(self.skip)
                processed = self.skip
            else:
                processed = 0

            progress = ProgressBar(combination_count)
            output.push("Counting ors...")
            progress.start()

            BULK_LIMIT = 1000
            bulk = self.combinations.initialize_unordered_bulk_op()
            # Number of updates queued on the current bulk. Executing a bulk
            # with nothing queued raises InvalidOperation in pymongo, so the
            # final flush is skipped when nothing is pending.
            pending_ops = 0

            for combination in cursor:
                combo_id = combination['_id']
                or_count = self._or_count(combination['ingredients'])

                bulk.find({
                    "_id": combo_id
                }).update({
                    "$set": {
                        "or_count": or_count,
                        "score": float(combination['and_count']) / or_count
                    }
                })
                pending_ops += 1

                processed += 1

                if processed % BULK_LIMIT == 0:
                    # TODO handle bulk execute errors
                    progress.update(processed)
                    bulk.execute()
                    bulk = self.combinations.initialize_unordered_bulk_op()
                    pending_ops = 0

            progress.update(processed)
            if pending_ops:
                bulk.execute()
        finally:
            # The cursor was opened with no_cursor_timeout, so it must be
            # closed explicitly, including on failure.
            cursor.close()
        progress.end()

    def _or_count(self, ingredients):
        """Return the inclusion-exclusion sum of and_counts over all subsets."""
        # see https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle
        total = 0
        sign = 1  # +1 for odd subset sizes, -1 for even ones
        for r in range(1, len(ingredients) + 1):
            for subset in itertools.combinations(ingredients, r):
                # Ids are the alphabetically sorted ingredients joined by "::".
                subset_id = '::'.join(sorted(subset))
                total += sign * self.get_and_count_by_id(subset_id)
            sign = -sign
        return total

    # 1 billion entries is too big for current settings, so the LRU cache is
    # capped at 5 * 10**6 (five million) entries.
    # NOTE(review): lru_cache on a method keys on (self, combo_id) and keeps
    # the instance alive for the lifetime of the cache.
    # TODO look into changing memory allowance for user running this process. Also limit mongo's max memory
    @functools.lru_cache(5 * 10**6)
    def get_and_count_by_id(self, combo_id):
        """Return the stored ``and_count`` of the combination ``combo_id``."""
        return self.combinations.find_one({"_id": combo_id})['and_count']