Example #1
0
def schedule_spider_searchengine(workspace_id, num_to_fetch, crawler_provider,
                                 crawler_sources, keyword_source_type):
    """Persist a KEYWORDS crawl job and enqueue it for the search-engine crawler.

    Returns the id of the saved job.
    """
    keywords = dao_get_keywords_by_relevance(workspace_id)
    categorized_urls = get_seeds_urls_categorized(workspace_id)

    job_id = save_job(workspace_id,
                      num_to_fetch=int(num_to_fetch),
                      crawler_provider=crawler_provider,
                      crawler_sources=crawler_sources,
                      crawl_type="KEYWORDS",
                      keyword_source_type=keyword_source_type)

    crawl_request = {}
    crawl_request['workspace'] = workspace_id
    crawl_request['jobId'] = job_id
    crawl_request['crawlProvider'] = crawler_provider
    crawl_request['crawlSources'] = crawler_sources
    crawl_request['strTimestamp'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    crawl_request['keywordSourceType'] = keyword_source_type
    crawl_request['included'] = keywords['included']
    crawl_request['excluded'] = keywords['excluded']
    crawl_request['relevantUrl'] = categorized_urls['relevant']
    crawl_request['irrelevantUrl'] = categorized_urls['irrelevant']
    crawl_request['nResults'] = int(num_to_fetch)
    crawl_request['existentUrl'] = get_seeds_urls_url(workspace_id)

    broker = Singleton.getInstance().broker_service
    broker.add_message_to_googlecrawler(crawl_request)
    return job_id
Example #2
0
def get_modeler_progress(workspace_id):
    """Report page-model build progress for one workspace.

    Returns {'model': bool, 'quality': list, 'percentageDone': number};
    quality metrics are loaded from Elasticsearch once the model is
    finished (percentage above 99).
    """
    workspaces = Singleton.getInstance().mongo_instance.get_workspace_collection()
    projection = {
        "page_model.model": 1,
        "page_model.quality": 1,
        "page_model.percentage_done": 1
    }
    matched = list(workspaces.find({'_id': ObjectId(workspace_id)}, projection))

    page_model_progress = {}
    for doc in matched:
        model = False
        quality = []
        percentage_done = 0
        if "page_model" in doc and "percentage_done" in doc["page_model"]:
            percentage_done = doc["page_model"]["percentage_done"]
            if percentage_done > 99:
                model = True
                es_client = Singleton.getInstance().es_client
                result = es_client.get_modeler_model_results(workspace_id)
                quality = json.loads(result["quality"])

        page_model_progress["model"] = model
        page_model_progress["quality"] = quality
        page_model_progress["percentageDone"] = percentage_done
    return page_model_progress
Example #3
0
def publish_to_import_url_queue(workspace_id, url):
    """Enqueue a manually supplied URL (marked relevant) for import."""
    metadata = build_metadata(workspace_id)
    metadata["keywordSourceType"] = "MANUAL"

    payload = {
        'url': url,
        'isRelevant': True,
        'metadata': metadata,
    }
    broker = Singleton.getInstance().broker_service
    broker.add_message_to_import_url(payload)
def schedule_spider_searchengine(workspace_id, num_to_fetch,
                                 broad_crawler_provider,
                                 broad_crawler_sources):
    """Persist a KEYWORDS crawl job for the broad crawler and enqueue it.

    Returns the id of the saved job.
    """
    keywords = dao_get_keywords_by_relevance(workspace_id)
    categorized_urls = get_seeds_urls_categorized(workspace_id)
    job_id = save_job(workspace_id,
                      num_to_fetch=int(num_to_fetch),
                      broad_crawler_provider=broad_crawler_provider,
                      broad_crawler_sources=broad_crawler_sources,
                      crawl_type="KEYWORDS")

    crawl_request = {}
    crawl_request['included'] = keywords['included']
    crawl_request['excluded'] = keywords['excluded']
    crawl_request['relevantUrl'] = categorized_urls['relevant']
    crawl_request['irrelevantUrl'] = categorized_urls['irrelevant']
    crawl_request['nResults'] = int(num_to_fetch)
    crawl_request['existentUrl'] = get_seeds_urls_url(workspace_id)
    crawl_request['workspace'] = workspace_id
    crawl_request['jobId'] = job_id
    crawl_request['crawlProvider'] = broad_crawler_provider
    crawl_request['crawlSources'] = broad_crawler_sources

    broker = Singleton.getInstance().broker_service
    broker.add_message_to_googlecrawler(crawl_request)
    return job_id
Example #5
0
def dao_delete_keywords(workspace_id, hash):
    """Remove the keyword stored under key `hash` from a workspace's word map.

    Falls back to the "_default" workspace document when the workspace id
    cannot be resolved. NOTE: the `hash` parameter shadows the builtin, but
    the name is kept for backward compatibility with keyword callers.
    """
    operation = {'$unset': {"words." + hash: ""}}
    mongo = Singleton.getInstance().mongo_instance
    ws = mongo.get_workspace_by_id(workspace_id)
    # identity check instead of '== None'
    if ws is None:
        mongo.workspace_collection.update({"_id": "_default"}, operation)
    else:
        mongo.workspace_collection.update({"_id": ObjectId(ws["_id"])}, operation)
def dao_update_user_account_status(id, is_active):
    """Set the 'active' flag on a user account; no-op when is_active is None."""
    if is_active is None:
        return
    users = Singleton.getInstance().mongo_instance.get_user_collection()
    users.update({'_id': ObjectId(id)}, {'$set': {'active': is_active}})
def publish_to_login_output_queue(workspace_id, job_id, url, key_values):
    """Post a login-result payload onto the dd-login-output queue."""
    payload = {}
    payload['workspaceId'] = workspace_id
    payload['job_id'] = job_id
    payload['url'] = url
    payload['key_values'] = key_values
    broker = Singleton.getInstance().broker_service
    broker.post_to_queue_no_extra_headers(payload, "dd-login-output")
def publish_to_events_queue(workspace_id, event_type, action, arguments):
    """Publish a timestamped workspace event; arguments are JSON-encoded."""
    payload = {}
    payload['workspaceId'] = workspace_id
    payload['timestamp'] = time.time()
    payload['event'] = event_type
    payload['action'] = action
    payload['arguments'] = json.dumps(arguments)
    Singleton.getInstance().broker_service.add_message_to_events(payload)
def publish_to_import_url_queue(workspace_id, url, is_relevant=True):
    """Queue a URL for import, tagging it with the workspace metadata."""
    broker = Singleton.getInstance().broker_service
    payload = {
        'url': url,
        'isRelevant': is_relevant,
        'metadata': broker.get_metadata(workspace_id),
    }
    broker.add_message_to_import_url(payload)
Example #10
0
def publish_to_events_queue(workspace_id, event_type, action):
    """Publish an event, with workspace metadata attached, to the events queue."""
    broker = Singleton.getInstance().broker_service
    payload = {
        'event': event_type,
        'action': action,
        'metadata': broker.get_metadata(workspace_id),
    }
    broker.add_message_to_events(payload)
Example #11
0
def dao_save_blur_level(self, level):
    """Store the blur level on the current workspace, or on the '_default'
    document when no workspace is selected.

    NOTE(review): `self` is unused; the parameter is kept for interface
    compatibility with existing callers.
    """
    mongo = Singleton.getInstance().mongo_instance
    workspace = mongo.get_current_workspace()
    operation = {'$set': {'blur_level': level}}
    if workspace is None:
        mongo.workspace_collection.upsert({'_id': '_default'}, operation)
    else:
        mongo.workspace_collection.update(
            {'_id': ObjectId(workspace['_id'])}, operation)
Example #12
0
def dao_add_workspace(name, ts):
    """Create a workspace document; raise AddingWorkspaceError when the
    name is already taken."""
    collection = Singleton.getInstance().mongo_instance.workspace_collection
    if collection.find_one({'name': name}) is not None:
        raise AddingWorkspaceError('The name already exists')
    collection.save({'name': name, 'created': ts})
Example #13
0
def get_seeds_urls(workspace_id,
                   categories,
                   last_id,
                   limit,
                   _source_exclude=None):
    """Page through a workspace's non-deleted seed URLs, enriched from
    Elasticsearch.

    Results are sorted by _id ascending, limited to `limit`, resuming after
    `last_id` when given and filtered by `categories` when not None. Each
    document's 'desc' (and 'html' when present) is filled in from the open
    crawled index; per-item lookup failures are logged and skipped.

    `_source_exclude` previously used a mutable list default (shared across
    calls); None now stands in for the same default field list.
    """
    if _source_exclude is None:
        _source_exclude = [
            "result.crawlResultDto.image",
            "result.crawlResultDto.html"
        ]

    categories_search_condition = {}
    if categories is not None:
        categories_search_condition = {
            'userDefinedCategory': {'$in': categories}
        }

    page_search_object = {}
    if last_id is not None:
        page_search_object = {"_id": {"$gt": ObjectId(last_id)}}

    deleted_search_object = {'deleted': None}
    workspace_search_object = {'workspaceId': workspace_id}
    field_names_to_include = [
        '_id', 'host', 'desc', 'crawlEntityType', 'url', 'words', 'title',
        'categories', 'language', 'relevant', 'userDefinedCategories'
    ]

    collection = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()
    res = collection \
        .find({'$and': [categories_search_condition, page_search_object,
                        deleted_search_object, workspace_search_object]},
              field_names_to_include) \
        .sort('_id', pymongo.ASCENDING) \
        .limit(limit)

    docs = list(res)

    for item in docs:
        try:
            es_result = Singleton.getInstance(
            ).es_client.get_open_crawled_index_results(item['url'],
                                                       _source_exclude)
            item['desc'] = es_result["text"]
            if "crawlResultDto" in es_result and "html" in es_result[
                    "crawlResultDto"]:
                item['html'] = es_result["crawlResultDto"]["html"]
        except Exception:
            # was a bare `except:` — Exception keeps best-effort behavior
            # without swallowing KeyboardInterrupt/SystemExit
            logging.info("item failed")
    return docs
def queue_labels(workspace_id):
    """Send all labelled pages of a workspace to the dd-modeler input queue."""
    pages = dao_get_labels(workspace_id)
    for page in pages:
        page["html_location"] = build_html_location(page["url"])

    Singleton.getInstance().broker_service.add_message_to_dd_modeler_input({
        'workspace_id': workspace_id,
        'pages': pages,
    })
Example #15
0
def dao_update_keywords(workspace_id, word, score):
    """Store or refresh a scored keyword on a workspace and return its key.

    The key is the Adler-32 checksum of the UTF-8 encoded word (kept for
    compatibility with existing documents). Writes go to the "_default"
    document when the workspace cannot be resolved.
    """
    # masking with 0xffffffff keeps the checksum positive on all versions
    word_hash = str(adler32(word.encode('utf_8')) & 0xffffffff)
    scored_word = {"word": word, "score": score}
    operation = {'$set': {"words." + word_hash: scored_word}}

    mongo = Singleton.getInstance().mongo_instance
    ws = mongo.get_workspace_by_id(workspace_id)
    # identity check instead of '== None'; also renamed the local that
    # shadowed the builtin `hash`
    if ws is None:
        mongo.workspace_collection.upsert({"_id": "_default"}, operation)
    else:
        mongo.workspace_collection.update(
            {"_id": ObjectId(ws["_id"])}, operation)

    return word_hash
def dao_update_user(id, active, roleIds):
    """Update a user's 'active' flag and/or role list.

    Either argument may be None to leave that field untouched; role ids are
    converted to ObjectId before storage.
    """
    update_object = {}

    # 'is not None' instead of '!= None'
    if active is not None:
        update_object['active'] = active

    if roleIds is not None:
        update_object['roles'] = [ObjectId(role) for role in roleIds]

    Singleton.getInstance().mongo_instance.get_user_collection().update(
        {'_id': ObjectId(id)}, {'$set': update_object})
Example #17
0
def dao_aggregated_labels_urls(workspace_id):
    """Count user-defined categories across a workspace's non-deleted seeds.

    Returns the aggregation's "result" list: one entry per category with a
    'tags' occurrence count, sorted by category descending.

    Aggregation failures are now logged and re-raised; previously `res`
    stayed unbound and the final lookup raised a misleading NameError.
    """
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    source_search_object = {'$and': [
        {'workspaceId': workspace_id},
        {'deleted': {'$exists': False}},
    ]}

    try:
        res = collection.aggregate([
            {'$match': source_search_object},
            {'$project': {'_id': 0, 'userDefinedCategories': 1}},
            {'$unwind': "$userDefinedCategories"},
            {'$group': {'_id': "$userDefinedCategories", 'tags': {'$sum': 1}}},
            {'$project': {'_id': 0, 'userDefinedCategories': "$_id", 'tags': 1}},
            {'$sort': {'userDefinedCategories': -1}}
        ])
    except Exception as e:
        logging.error(e)
        raise

    return res["result"]
Example #18
0
def dao_get_keywords_by_relevance(workspace_id):
    """Split a workspace's keywords into included/excluded/related buckets.

    Scores above 3 are included, below 3 excluded, exactly 3 related.
    Raises NameError when no included keyword exists (callers rely on the
    exception to abort scheduling).
    """
    included = []
    excluded = []
    related = []

    ws = Singleton.getInstance().mongo_instance.get_workspace_by_id(workspace_id)
    if ws is None or ws.get("words") is None:
        logging.info("no keywords defined")
    else:
        # .items() (not Python 2-only .iteritems()) works on both 2 and 3
        for key, value in ws["words"].items():
            if value['score'] > 3:
                included.append(value['word'])
            elif value['score'] < 3:
                excluded.append(value['word'])
            else:
                related.append(value['word'])

    if not included:
        # typo fixed in the error message: 'where' -> 'were'
        raise NameError('No keywords were defined.')

    return {
        'included': included,
        'excluded': excluded,
        'related': related,
    }
def get_seeds_urls_to_deep_crawl_dao(workspace_id, page_size,
                                     keyword_source_type, last_id):
    """Page through a workspace's non-deleted seed URLs, optionally filtered
    by keyword source type, resuming after `last_id` when given."""
    filters = []
    if keyword_source_type:
        filters.append({"keywordSourceType": keyword_source_type})
    if last_id:
        filters.append({"_id": {"$gt": ObjectId(last_id)}})
    filters.append({'deleted': {"$exists": False}})
    filters.append({'workspaceId': workspace_id})

    collection = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()

    cursor = (collection.find({'$and': filters})
              .sort('_id', pymongo.ASCENDING)
              .limit(page_size))
    return list(cursor)
def get_seeds_udcs_by_workspace_dao(workspace_id):
    """Return the sorted distinct user-defined categories ('udc') of a
    workspace's non-deleted seed URLs."""
    query = {'$and': [
        {'deleted': None},
        {'workspaceId': workspace_id},
    ]}

    collection = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()
    values = collection.find(query).distinct("udc")
    return sorted(list(values))
Example #21
0
def get_trainer_progress(workspace_id):
    """Report DD trainer progress for a workspace.

    Returns {'progress': str, 'percentageDone': number, 'model': value,
    'jobs': list}.

    Fixes: the loop locals were referenced after the loop, so an unknown
    workspace id raised NameError; defaults are now initialised up front,
    and the loop-invariant jobs lookup runs once instead of per document.
    """
    collection = Singleton.getInstance(
    ).mongo_instance.get_workspace_collection()
    docs = list(collection.find({'_id': ObjectId(workspace_id)}))

    progress = ""
    percentage_done = 0
    model = False

    # find-by-_id yields at most one document, so the loop runs 0 or 1 times
    for doc in docs:
        trainer = doc.get("dd_trainer") or {}
        if "trainer_progress" in trainer:
            progress = trainer["trainer_progress"]
        if "percentage_done" in trainer:
            percentage_done = trainer["percentage_done"]
        if "trainer_model" in trainer:
            model = trainer["trainer_model"]

    jobs = get_last_job_by_workspace_dao(workspace_id, 'DD-TRAINER')

    return {
        "progress": progress,
        "percentageDone": percentage_done,
        "model": model,
        "jobs": jobs,
    }
def get_user_input_forms(workspace_id, last_id):
    """Fetch up to five pending (not completed) login-input forms for a
    workspace, resuming after `last_id` when given."""
    conditions = []
    if last_id:
        conditions.append({"_id": {"$gt": ObjectId(last_id)}})
    conditions.append({
        '$or': [
            {'completed': {'$exists': False}},
            {'completed': {'$ne': True}},
        ]
    })
    conditions.append({'workspaceId': workspace_id})

    collection = Singleton.getInstance(
    ).mongo_instance.get_login_input_collection()
    cursor = (collection.find({'$and': conditions})
              .sort('_id', pymongo.ASCENDING)
              .limit(5))
    return list(cursor)
def get_all_progress(workspace_id):
    """Collect model/trainer/crawler progress for a workspace in one call."""
    collection = Singleton.getInstance(
    ).mongo_instance.get_workspace_collection()
    fields = {
        "page_model.quality": 1,
        "dd_trainer.trainer_progress": 1,
        "dd_crawler.crawler_progress": 1,
        "dd_broadcrawler.broadcrawler_progress": 1
    }
    progress = {"model": [], "trainer": [], "crawler": "", "broadcrawler": ""}

    for doc in collection.find({'_id': ObjectId(workspace_id)}, fields):
        if "page_model" in doc and "quality" in doc["page_model"]:
            progress["model"] = json.loads(doc["page_model"]["quality"])

        if "dd_trainer" in doc and "trainer_progress" in doc["dd_trainer"]:
            progress["trainer"] = doc["dd_trainer"]["trainer_progress"]

        if "dd_crawler" in doc and "crawler_progress" in doc["dd_crawler"]:
            progress["crawler"] = doc["dd_crawler"]["crawler_progress"]

        # FIXME: 'broadcrawler' is never populated — the broad crawler
        # reports under "dd_crawler" ("broadcrawler is called just crawler").

    return progress
Example #24
0
def dao_reset_results(workspace_id, source):
    """Delete a workspace's seed-url results that came from `source`.

    Unknown sources are reported and ignored.
    """
    entity_types = {
        "searchengine": ["BING", "GOOGLE"],
        "twitter": ["TWITTER"],
        "tor": ["TOR"],
        "imported": ["MANUAL"],
        "deepdeep": ["DD"],
    }
    if source not in entity_types:
        print("no valid source was provided:" + source)
        return

    source_filters = [{'crawlEntityType': t} for t in entity_types[source]]

    collection = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()
    collection.remove({'$and': [{'workspaceId': workspace_id},
                                {'$or': source_filters}]})
def get_user_input_forms_stats(workspace_id):
    """Count a workspace's login-input forms grouped as completed/pending.

    Returns the aggregation's "result" list (legacy dict-style pymongo
    aggregate output).
    """
    search_object = {
        '$and': [
            {
                'workspaceId': workspace_id
            },
        ]
    }
    collection = Singleton.getInstance(
    ).mongo_instance.get_login_input_collection()
    res_hosts = collection.aggregate([{
        '$match': search_object
    }, {
        '$project': {
            "_id": 0,
            # label each form by its boolean 'completed' flag
            "completed_label": {
                "$cond": ["$completed", "completed", "pending"]
            },
            # NOTE(review): both $cond branches yield 1, so the summed field
            # is a constant per-document weight (a plain count per label).
            # If a 1/0 split was intended the second branch should be 0 —
            # confirm before changing.
            "completed": {
                "$cond": ["$completed", 1, 1]
            }
        }
    }, {
        '$group': {
            "_id": "$completed_label",
            "count": {
                '$sum': "$completed"
            }
        }
    }])
    return list(res_hosts["result"])
def dao_aggregate_urls_to_deep_crawl(workspace_id):
    """Count a workspace's non-deleted seed URLs grouped by keywordSourceType.

    Returns the aggregation's "result" list.

    Fixes: the Python 2-only `print e` statement (a syntax error under
    Python 3) is replaced by logging, and failures are re-raised instead of
    leaving `res` unbound and raising NameError at the return.
    """
    search_object = {'$and': [
        {'workspaceId': workspace_id},
        {'deleted': {'$exists': False}},
    ]}

    collection = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()

    try:
        res = collection.aggregate([{
            '$match': search_object
        }, {
            '$group': {
                '_id': {
                    'keywordSourceType': '$keywordSourceType'
                },
                "count": {
                    "$sum": 1
                }
            }
        }])
    except Exception as e:
        logging.error(e)
        raise

    return res["result"]
Example #27
0
def dao_delete_workspace(id):
    """Remove a workspace and every collection entry that references it."""
    mongo = Singleton.getInstance().mongo_instance
    mongo.workspace_collection.remove({"_id": ObjectId(id)})

    by_workspace = {"workspaceId": id}
    mongo.get_crawl_job_collection().remove(by_workspace)
    mongo.get_seed_urls_collection().remove(by_workspace)
    mongo.get_broad_crawler_collection().remove(by_workspace)
Example #28
0
def dao_update_relevance(url, obj):
    """Upsert the 'relevant' flag of a seed URL (matched by url).

    Fixes: the Python 2-only `print "..."` statement (a syntax error under
    Python 3) is replaced by logging, which this module already uses.
    """
    update_object = {'relevant': obj['relevant']}
    collection = Singleton.getInstance(
    ).mongo_instance.get_current_seed_urls_collection()
    logging.info("setting url %s to %s in collection %s",
                 url, obj['relevant'], collection)
    collection.update({"url": url}, {'$set': update_object}, True)
def dao_update_relevanceByid(workspace_id, id, relevance, categories, udc):
    """Upsert relevance, categories and udc on a seed URL by its ObjectId."""
    changes = {
        'relevant': relevance,
        'categories': categories,
        'udc': udc,
    }
    collection = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()
    collection.update({"_id": ObjectId(id)}, {'$set': changes}, True)
Example #30
0
def get(workspace_id):
    """Return the workspace's user-defined categories, or [] when unset."""
    ws = Singleton.getInstance().mongo_instance.get_workspace_by_id(workspace_id)
    if ws is not None and ws.get("userDefinedCategories") is not None:
        return ws["userDefinedCategories"]
    return []