# Esempio n. 1 (Example no. 1)
# 0
def dao_aggregate_urls(workspace_id):
    """Group a workspace's non-deleted seed URLs by (crawlEntityType, relevant).

    :param workspace_id: id of the workspace whose seed URLs are counted
    :return: the aggregation "result" list -- one document per
        (crawlEntityType, relevant) pair with a "count" field
    """
    collection = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()

    # Restrict to this workspace and skip soft-deleted documents
    # (deletion is marked by the mere presence of a 'deleted' field).
    source_search_object = {
        '$and': [
            {'workspaceId': workspace_id},
            {'deleted': {'$exists': False}},
        ]
    }

    try:
        res = collection.aggregate([
            {
                '$match': source_search_object
            },
            {
                '$group': {
                    '_id': {
                        'crawlEntityType': '$crawlEntityType',
                        'relevant': '$relevant'
                    },
                    "count": {
                        "$sum": 1
                    }
                }
            }
        ])
    except Exception as e:
        # Previously the exception was swallowed and the function then
        # crashed with a NameError on the unbound 'res'; log and
        # propagate the real error instead.  print(e) is valid on both
        # Python 2 and 3 (the original 'print e' is Python-2-only).
        print(e)
        raise

    return res["result"]
# Esempio n. 2 (Example no. 2)
# 0
def unlabel(url_id, user_defined_category):
    """Remove *user_defined_category* from a seed URL's label list."""
    seed_urls = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()
    # $pull drops every matching entry from the array field.
    seed_urls.update(
        {"_id": ObjectId(url_id)},
        {'$pull': {"userDefinedCategories": user_defined_category}})
# Esempio n. 3 (Example no. 3)
# 0
def dao_get_workspace_by_id(id):
    """Fetch a single workspace document by its ObjectId string, or None."""
    workspaces = Singleton.getInstance().mongo_instance.workspace_collection
    return workspaces.find_one({'_id': ObjectId(id)})
def get_seeds_urls_to_label_dao(workspace_id, page_size, sources, relevances,
                                categories, keyword_source_type, last_id,
                                last_source):
    """Fetch one page of seed URLs for labeling, ordered by crawl source.

    Builds an $and filter from the requested sources, relevance states and
    categories, projects a synthetic 'order' rank per crawlEntityType
    (DD=1, TOR=2, GOOGLE=3, BING=4, anything else=5), then pages through
    the results sorted by (order, _id).

    NOTE(review): keyword_source_type is accepted but never used in this
    function -- confirm whether it should take part in the filter.

    :param workspace_id: workspace whose URLs are listed
    :param page_size: maximum number of documents returned
    :param sources: UI source names ("searchengine", "tor", "imported",
        "deepdeep"); an empty list means "no source filter"
    :param relevances: relevance values, or "unset" for documents that
        have no 'relevant' field; an empty list means "no relevance filter"
    :param categories: values matched against the 'categories' field
    :param last_id: _id of the last document of the previous page, or None
    :param last_source: crawlEntityType of that last document, or None
    :return: list of matching documents (at most page_size)
    """
    and_condition_list = []
    # Sources: each UI name maps to one or more crawlEntityType values,
    # OR-ed together ("searchengine" covers both BING and GOOGLE).
    if len(sources) > 0:
        source_search_conditions = []
        for source in sources:
            if source == "searchengine":
                source_search_conditions.append({'crawlEntityType': "BING"})
                source_search_conditions.append({'crawlEntityType': "GOOGLE"})
            elif source == "tor":
                source_search_conditions.append({'crawlEntityType': "TOR"})
            elif source == "imported":
                source_search_conditions.append({'crawlEntityType': "MANUAL"})
            elif source == "deepdeep":
                source_search_conditions.append({'crawlEntityType': "DD"})
            else:
                print("no valid source was provided:" + source)

        source_search_object = {'$or': source_search_conditions}
        and_condition_list.append(source_search_object)

    # Relevances: "unset" selects documents never labeled (no 'relevant'
    # field); any other value must both match and exist.
    if len(relevances) > 0:
        relevances_search_conditions = []
        for relevance in relevances:
            if relevance == "unset":
                relevances_search_conditions.append(
                    {'relevant': {
                        "$exists": False
                    }})
            else:
                relevances_search_conditions.append({
                    "$and": [{
                        'relevant': relevance
                    }, {
                        'relevant': {
                            "$exists": True
                        }
                    }]
                })

        relevances_search_object = {'$or': relevances_search_conditions}
        and_condition_list.append(relevances_search_object)

    # Categories (page types): any of the requested category values.
    if len(categories) > 0:
        categories_search_conditions = []
        for category in categories:
            categories_search_conditions.append({'categories': category})

        categories_search_object = {'$or': categories_search_conditions}
        and_condition_list.append(categories_search_object)

    page_search_object = {}
    if last_id is not None and last_source is not None:
        # Keyset pagination across the source-ordered listing: either a
        # larger _id from the same source, or anything from another source.
        page_search_object = {
            '$or': [{
                "$and": [{
                    "_id": {
                        "$gt": ObjectId(last_id)
                    }
                }, {
                    "crawlEntityType": last_source
                }]
            }, {
                "crawlEntityType": {
                    "$ne": last_source
                }
            }]
        }
        and_condition_list.append(page_search_object)

    # Soft-deleted documents carry a 'deleted' field; exclude them.
    deleted_search_object = {'deleted': {"$exists": False}}
    and_condition_list.append(deleted_search_object)

    workspace_search_object = {'workspaceId': workspace_id}
    and_condition_list.append(workspace_search_object)

    # OrderedDict keeps the sort-key order (order first, then _id), which
    # matters because MongoDB sorts by the keys in document order.
    sort_dict = OrderedDict()
    sort_dict['order'] = 1
    sort_dict['_id'] = 1

    collection = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()

    res = collection.aggregate([{
        # Stage 1: project the fields we need plus a computed 'order'
        # rank derived from crawlEntityType via nested $cond:
        # DD=1, TOR=2, GOOGLE=3, BING=4, everything else=5.
        "$project": {
            '_id': 1,
            'host': 1,
            'crawlEntityType': 1,
            'url': 1,
            'title': 1,
            'relevant': 1,
            'workspaceId': 1,
            'deleted': 1,
            "order": {
                "$cond": {
                    "if": {
                        "$eq": ["$crawlEntityType", "DD"]
                    },
                    "then": 1,
                    "else": {
                        "$cond": {
                            "if": {
                                "$eq": ["$crawlEntityType", "TOR"]
                            },
                            "then": 2,
                            "else": {
                                "$cond": {
                                    "if": {
                                        "$eq": ["$crawlEntityType", "GOOGLE"]
                                    },
                                    "then": 3,
                                    "else": {
                                        "$cond": {
                                            "if": {
                                                "$eq":
                                                ["$crawlEntityType", "BING"]
                                            },
                                            "then": 4,
                                            "else": 5
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }, {
        # Stage 2: apply the combined filter built above.
        "$match": {
            '$and': and_condition_list
        }
    }, {
        # Stage 3: sort by source rank, then _id (stable paging key).
        "$sort": sort_dict
    }, {
        # Stage 4: one page worth of documents.
        "$limit": page_size
    }, {
        # Stage 5: strip the helper fields before returning.
        "$project": {
            '_id': 1,
            'host': 1,
            'crawlEntityType': 1,
            'url': 1,
            'title': 1,
            'relevant': 1,
            'order': 1
        }
    }])

    # pymongo 2.x aggregate returns a dict with the documents in "result".
    docs = list(res["result"])
    return docs
# Esempio n. 5 (Example no. 5)
# 0
def dao_delete(id):
    """Permanently remove the user document with the given id."""
    users = Singleton.getInstance().mongo_instance.get_user_collection()
    users.remove({"_id": ObjectId(id)})
# Esempio n. 6 (Example no. 6)
# 0
def get_tasks_by_job(job_id):
    """Return the first crawl task whose 'jobId' matches, or None.

    NOTE(review): the plural name suggests multiple tasks, but find_one
    fetches a single document -- confirm against callers.
    """
    tasks = Singleton.getInstance().mongo_instance.get_crawl_task_collection()
    return tasks.find_one({'jobId': job_id})
# Esempio n. 7 (Example no. 7)
# 0
def get_last_job_by_workspace_dao(workspace_id, crawl_type):
    """Return the most recent crawl job of *crawl_type* for a workspace.

    :return: list with at most one document (newest _id first)
    """
    jobs = Singleton.getInstance().mongo_instance.get_crawl_job_collection()
    cursor = jobs.find(
        {'workspaceId': workspace_id, 'crawlType': crawl_type}
    ).limit(1).sort('_id', pymongo.DESCENDING)
    return list(cursor)
# Esempio n. 8 (Example no. 8)
# 0
def cancel_job(job_id):
    """Mark the crawl job with the given id as cancelled."""
    jobs = Singleton.getInstance().mongo_instance.get_crawl_job_collection()
    jobs.update({"_id": ObjectId(job_id)},
                {'$set': {"status": "CANCELLED"}})
# Esempio n. 9 (Example no. 9)
# 0
def dao_list_workspace():
    """Return every workspace document, oldest ('created') first."""
    workspaces = Singleton.getInstance().mongo_instance.workspace_collection
    cursor = workspaces.find({}).sort('created', pymongo.ASCENDING)
    return list(cursor)
def __get_seeds_url_by_selection(workspace_id, selection):
    """Resolve a per-source URL selection into a flat list of seed URLs.

    *selection* maps an arbitrary key to an entry describing one source:
      - "source": value matched against the 'keywordSourceType' field
      - "allSelected": when True, take every URL of that source except the
        ids listed under "unselected"; otherwise take only the ids listed
        under "selected"

    :param workspace_id: workspace the selection is scoped to
    :param selection: dict of selection entries as described above
    :return: list of the matching documents' 'url' values
    """
    collection = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()

    or_sources_conditions = []

    # .items() instead of the Python-2-only .iteritems() keeps this
    # working on both interpreter lines (identical iteration behavior).
    for key, value in selection.items():
        and_source_conditions = []
        and_source_conditions.append({'workspaceId': workspace_id})

        # Filter on the entry's source type; kept inside an $or to match
        # the original query shape.
        keywordSourceType = value["source"]
        and_source_conditions.append(
            {'$or': [{'keywordSourceType': keywordSourceType}]})

        if value["allSelected"]:
            # Everything from this source except the explicitly unselected.
            excluded = [ObjectId(id) for id in value["unselected"]]
            and_source_conditions.append({'_id': {'$nin': excluded}})
        else:
            # Only the explicitly selected ids.
            included = [ObjectId(id) for id in value["selected"]]
            and_source_conditions.append({'_id': {'$in': included}})

        or_sources_conditions.append({'$and': and_source_conditions})

    # Project only the url field; _id suppressed.
    cursor = collection.find({'$or': or_sources_conditions}, {
        '_id': 0,
        'url': 1
    })
    return [item["url"] for item in cursor]
# Esempio n. 11 (Example no. 11)
# 0
def get_seeds_urls_url(workspace_id):
    """Return the 'url' projection of every seed URL in a workspace."""
    seed_urls = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()
    cursor = seed_urls.find({'workspaceId': workspace_id},
                            {'_id': 0, 'url': 1})
    return list(cursor)
# Esempio n. 12 (Example no. 12)
# 0
def get_seeds_urls_by_workspace_dao(workspace_id):
    """Return all seed URL documents belonging to a workspace."""
    seed_urls = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()
    cursor = seed_urls.find({'workspaceId': workspace_id})
    return list(cursor)
# Esempio n. 13 (Example no. 13)
# 0
def dao_delete_seed_url(workspace_id, id):
    """Soft-delete a seed URL by setting its 'deleted' flag (upsert=True).

    NOTE(review): workspace_id is not part of the query -- confirm intent.
    """
    seed_urls = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()
    seed_urls.update({"_id": ObjectId(id)},
                     {'$set': {'deleted': True}},
                     True)
# Esempio n. 14 (Example no. 14)
# 0
def dao_update_relevanceByid(workspace_id, id, relevance):
    """Set the 'relevant' field on a seed URL (upsert=True).

    NOTE(review): workspace_id is not part of the query -- confirm intent.
    """
    seed_urls = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()
    seed_urls.update({"_id": ObjectId(id)},
                     {'$set': {'relevant': relevance}},
                     True)
# Esempio n. 15 (Example no. 15)
# 0
def upsert(workspace_id, user_defined_category):
    """Add *user_defined_category* to the workspace's category set.

    $addToSet keeps userDefinedCategories free of duplicates.
    """
    mongo = Singleton.getInstance().mongo_instance
    ws = mongo.get_workspace_by_id(workspace_id)
    mongo.workspace_collection.update(
        {"_id": ObjectId(ws["_id"])},
        {'$addToSet': {"userDefinedCategories": user_defined_category}})
def dao_count_workspace():
    """Return the total number of workspace documents."""
    workspaces = Singleton.getInstance().mongo_instance.workspace_collection
    return workspaces.count()
# Esempio n. 17 (Example no. 17)
# 0
def delete(workspace_id, user_defined_category):
    """Remove *user_defined_category* from the workspace's category list."""
    mongo = Singleton.getInstance().mongo_instance
    ws = mongo.get_workspace_by_id(workspace_id)
    mongo.workspace_collection.update(
        {"_id": ObjectId(ws["_id"])},
        {'$pull': {"userDefinedCategories": user_defined_category}})
# Esempio n. 18 (Example no. 18)
# 0
if __name__ == "__main__":

    # usage: $ python runserver.py --logging-level=debug --logging-file=debug.log
    # Parse the logging options off the command line.
    parser = optparse.OptionParser()
    parser.add_option('-l', '--logging-level', help='Logging level')
    parser.add_option('-f', '--logging-file', help='Logging file name')
    (options, args) = parser.parse_args()
    # Unknown/absent level names fall back to INFO.
    logging_level = LOGGING_LEVELS.get(options.logging_level, logging.INFO)
    logging.basicConfig(level=logging_level,
                        filename=options.logging_file,
                        format='%(asctime)s %(levelname)s: %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    # Unique id for this server process; passed to BrokerService below.
    app_instance = str(uuid.uuid1())
    # Wire shared services onto the process-wide singleton:
    # Mongo, the Kafka-backed broker, and the Elasticsearch client.
    instance = Singleton.getInstance()
    instance.app_instance = app_instance
    instance.mongo_instance = MongoInstance(app.config['MONGO_HOST_NAME'],
                                            app.config['MONGO_HOST_PORT'])
    instance.broker_service = BrokerService(app_instance,
                                            app.config['KAFKA_HOST_NAME'],
                                            app.config['KAFKA_HOST_PORT'])
    instance.es_client = ElasticsearchClient(app)
    instance.broker_service.init_subscribers()

    # Create database connection object (MongoEngine reads the
    # MONGODB_* keys from the Flask-style app config).
    app.config['MONGODB_HOST'] = app.config['MONGO_HOST_NAME']
    app.config['MONGODB_PORT'] = app.config['MONGO_HOST_PORT']
    app.config['MONGODB_DB'] = 'MemexHack'
    db = MongoEngine(app)
# Esempio n. 19 (Example no. 19)
# 0
def get_jobs_by_workspace_dao(workspace_id):
    """Return every crawl job of a workspace, newest first."""
    jobs = Singleton.getInstance().mongo_instance.get_crawl_job_collection()
    cursor = jobs.find({'workspaceId': workspace_id}).sort(
        '_id', pymongo.DESCENDING)
    return list(cursor)
# Esempio n. 20 (Example no. 20)
# 0
def get_job_dao(job_id):
    """Fetch a crawl job by its document _id, or None.

    NOTE(review): a second get_job_dao in this file queries by 'jobId'
    instead; the later definition wins at import time -- confirm which
    one is intended.
    """
    jobs = Singleton.getInstance().mongo_instance.get_crawl_job_collection()
    return jobs.find_one({"_id": ObjectId(job_id)})
# Esempio n. 21 (Example no. 21)
# 0
def get_job_dao(job_id):
    """Fetch a crawl job by its 'jobId' field, or None.

    NOTE(review): shadows an earlier get_job_dao that queries by _id --
    confirm which definition is intended.
    """
    jobs = Singleton.getInstance().mongo_instance.get_crawl_job_collection()
    return jobs.find_one({'jobId': job_id})
# Esempio n. 22 (Example no. 22)
# 0
def dao_count_jobs(input_search_query):
    """Count crawl jobs in the workspace named by *input_search_query*.

    Only the "workspace_id" key of the search query is consulted.
    """
    jobs = Singleton.getInstance().mongo_instance.get_crawl_job_collection()
    return jobs.find(
        {"workspaceId": input_search_query["workspace_id"]}).count()
# Esempio n. 23 (Example no. 23)
# 0
def dao_get_all():
    """Return every user document, sorted by email ascending."""
    users = Singleton.getInstance().mongo_instance.get_user_collection()
    return list(users.find().sort('email', pymongo.ASCENDING))
# Esempio n. 24 (Example no. 24)
# 0
def dao_get_blur_level(self):
    """Return the current workspace's blur level, defaulting to 0.

    Falls back to 0 when there is no current workspace, the field is
    missing, or it is explicitly None.
    """
    ws = Singleton.getInstance().mongo_instance.get_current_workspace()
    if ws is None:
        return 0
    level = ws.get("blur_level")
    return 0 if level is None else level
# Esempio n. 25 (Example no. 25)
# 0
def dao_get_roles_all():
    """Return every role document, sorted by name ascending."""
    roles = Singleton.getInstance().mongo_instance.get_role_collection()
    return list(roles.find().sort('name', pymongo.ASCENDING))
def get_seeds_urls_all_labeled_dao(workspace_id, page_size, sources,
                                   relevances, last_id):
    """Fetch one page of a workspace's seed URLs, filtered for labeling.

    Filters by crawl source and relevance state, excludes soft-deleted
    documents, and pages forward by ascending _id.

    :param sources: UI source names; "searchengine" expands to BING+GOOGLE
    :param relevances: relevance values, "unset" meaning "no 'relevant' field"
    :param last_id: last _id of the previous page, or falsy for page one
    :return: list of at most page_size documents
    """
    conditions = []

    # Each UI source name maps to its crawlEntityType value(s).
    source_types = {
        "searchengine": ["BING", "GOOGLE"],
        "tor": ["TOR"],
        "imported": ["MANUAL"],
        "deepdeep": ["DD"],
    }
    if len(sources) > 0:
        type_clauses = []
        for source in sources:
            if source in source_types:
                for entity_type in source_types[source]:
                    type_clauses.append({'crawlEntityType': entity_type})
            else:
                print("no valid source was provided:" + source)

        conditions.append({'$or': type_clauses})

    if len(relevances) > 0:
        relevance_clauses = []
        for relevance in relevances:
            if relevance == "unset":
                # Documents that were never labeled at all.
                relevance_clauses.append({'relevant': {"$exists": False}})
            else:
                # The value must both match and exist.
                relevance_clauses.append({
                    "$and": [{'relevant': relevance},
                             {'relevant': {"$exists": True}}]
                })

        conditions.append({'$or': relevance_clauses})

    if last_id:
        # Keyset pagination: only documents after the previous page.
        conditions.append({"_id": {"$gt": ObjectId(last_id)}})

    # Soft-deleted documents carry a 'deleted' field; exclude them.
    conditions.append({'deleted': {"$exists": False}})
    conditions.append({'workspaceId': workspace_id})

    collection = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()
    cursor = collection.find({'$and': conditions})\
        .sort('_id', pymongo.ASCENDING)\
        .limit(page_size)

    return list(cursor)