Beispiel #1
0
def get(domain, trail):
    org = helper.get_org()

    trail_report = {}

    # get all stared urls for the trail
    for (url, rank) in db.getRankedUrls(org, trail, domain):
        trail_report[url] = {
            'rank': rank,
        }

    # get the list of invalid entities for the domain
    markedEntities = set([])
    for (type, value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)

    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(
        trail_report.keys())
    for url, featureDict in all_entities.iteritems():
        for type, values in featureDict.iteritems():
            filtered_values = []
            for value in values:
                if value not in markedEntities:
                    filtered_values.append(value)
            if len(filtered_values) > 0:
                try:
                    if 'auto_features' not in trail_report[url]:
                        trail_report[url]['auto_features'] = {}
                    trail_report[url]['auto_features'][type] = filtered_values
                except:
                    tangelo.log("report generation error. skipping url.")
                    continue

    # for each url get any manually extracted entities
    for url in trail_report.keys():
        for featureObj in db.get_feedback_entities(org, domain, url):
            if 'manual_features' not in trail_report[url]:
                trail_report[url]['manual_features'] = {}
            if featureObj['type'] not in trail_report[url]['manual_features']:
                trail_report[url]['manual_features'][featureObj['type']] = []
            trail_report[url]['manual_features'][featureObj['type']].append(
                featureObj['value'])

    # for each url get any highlighted text
    for url in trail_report.keys():
        selections = db.getSelections(domain, trail, url, org)
        if len(selections) > 0:
            trail_report[url]['selections'] = selections

    result = {'trail': trail, 'urls': trail_report}
    return json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))
Beispiel #2
0
def get(domain,trail,stars,newdomain):
    org = helper.get_org().upper()

    if not db.domain_exists(newdomain):
        db.add_new_domain(newdomain,'auto generated domain from trail: '+trail)

    features = set([])
    url_set = set([])
    stars = int(stars)
    # get all stared urls for the trail


    for (url,rank) in db.getRankedUrls(org,trail,domain):
        url_set.add(url)

    if stars < 1:
        urls = db.getBrowsePathUrls(org,trail)
        for url in urls:
           url_set.add(url)


    # get the list of invalid entities for the domain
    markedEntities = set([])
    for (type,value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)


    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(url_set)
    for url,featureDict in all_entities.iteritems():
        for type,values in featureDict.iteritems():
            type = type.replace(',',' ')
            filtered_values = []
            for value in values:
                if value not in markedEntities:
                    value = value.replace(',',' ')
                    features.add(type+"\0"+value)



    # for each url get any manually extracted entities
    for url in url_set:
        for featureObj in db.get_feedback_entities(org, domain, url):
            type = featureObj['type'].replace(',',' ')
            value = featureObj['value'].replace(',',' ')
            features.add(type+"\0"+value)





    entity_data_connector.add_new_domain_items( map(lambda x: newdomain+'\0'+x,features))
Beispiel #3
0
def get(domain, trail, stars, newdomain):
    org = helper.get_org().upper()

    if not db.domain_exists(newdomain):
        db.add_new_domain(newdomain,
                          'auto generated domain from trail: ' + trail)

    features = set([])
    url_set = set([])
    stars = int(stars)
    # get all stared urls for the trail

    for (url, rank) in db.getRankedUrls(org, trail, domain):
        url_set.add(url)

    if stars < 1:
        urls = db.getBrowsePathUrls(org, trail)
        for url in urls:
            url_set.add(url)

    # get the list of invalid entities for the domain
    markedEntities = set([])
    for (type, value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)

    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(
        url_set)
    for url, featureDict in all_entities.iteritems():
        for type, values in featureDict.iteritems():
            type = type.replace(',', ' ')
            filtered_values = []
            for value in values:
                if value not in markedEntities:
                    value = value.replace(',', ' ')
                    features.add(type + "\0" + value)

    # for each url get any manually extracted entities
    for url in url_set:
        for featureObj in db.get_feedback_entities(org, domain, url):
            type = featureObj['type'].replace(',', ' ')
            value = featureObj['value'].replace(',', ' ')
            features.add(type + "\0" + value)

    entity_data_connector.add_new_domain_items(
        map(lambda x: newdomain + '\0' + x, features))
def fetch_entities(domain, url):
    org = session_helper.get_org()
    entities = db.get_feedback_entities(org, domain, url)
    return json.dumps(dict(entities=entities))
Beispiel #5
0
def fetch_entities(domain, url):
    org = session_helper.get_org()
    entities = db.get_feedback_entities(org, domain, url)
    return json.dumps(dict(entities=entities))