Ejemplo n.º 1
0
def scrape_page(team_id, domain_id, trail_id, url, content, userEmail):
    """Extract entities from a visited page, store them, and log the visit.

    ``content`` is URL-unquoted and UTF-8 encoded, then handed to
    ``extractors.extractAll``. Each extracted feature type is written
    through the entity data connector; when the connector returns a
    non-empty result from ``get_domain_entity_matches`` those values are
    also inserted as domain entities.

    Returns a JSON string with two keys: ``id`` (the value returned by
    ``db.addBrowsePathData``) and ``count`` (the value returned by
    ``db.getUrlCount``).
    """
    page_content = urllib.unquote(content).encode('utf-8')
    page_url = url.encode('utf-8')

    entity_store = factory.get_entity_data_connector()
    extracted, extraction_errors = extractors.extractAll(page_content)
    for message in extraction_errors:
        tangelo.log("FEATURE EXTRACTION ERROR: " + message)

    for entity_type, entity_values in extracted.iteritems():
        # Every feature type is stored, even when its value list is empty.
        entity_store.insert_entities(page_url, entity_type, entity_values)
        if len(entity_values) == 0:
            continue

        domain_matches = entity_store.get_domain_entity_matches(
            domain_id, entity_type, entity_values)
        if len(domain_matches) == 0:
            continue

        tangelo.log("INSERTING DOMAIN ENTITIES")
        tangelo.log(entity_type)
        tangelo.log(domain_matches)
        entity_store.insert_domain_entities(
            str(domain_id), page_url, entity_type, domain_matches)

    row_id = db.addBrowsePathData(
        team_id, domain_id, trail_id, page_url, userEmail)
    url_count = db.getUrlCount(team_id, domain_id, trail_id, page_url)
    return json.dumps(dict(id=row_id, count=url_count))
Ejemplo n.º 2
0
def scrape_page(team_id, domain_id, trail_id, url, content, userEmail):
    """Run feature extraction on a page and record the browse-path entry.

    The page ``content`` arrives URL-quoted; it is unquoted and encoded as
    UTF-8 before being passed to ``extractors.extractAll``. Extraction
    errors are logged through ``tangelo.log``. All extracted features are
    inserted via the entity data connector, and any values returned by
    ``get_domain_entity_matches`` are inserted as domain entities too.

    Returns a JSON string of the form ``{"id": ..., "count": ...}`` built
    from ``db.addBrowsePathData`` and ``db.getUrlCount``.
    """
    decoded_content = urllib.unquote(content).encode('utf-8')
    encoded_url = url.encode('utf-8')

    data_connector = factory.get_entity_data_connector()
    feature_map, error_list = extractors.extractAll(decoded_content)
    for err in error_list:
        tangelo.log("FEATURE EXTRACTION ERROR: " + err)

    for feature_type, feature_values in feature_map.iteritems():
        data_connector.insert_entities(
            encoded_url, feature_type, feature_values)
        # Nothing to match against the domain when no values were found.
        if len(feature_values) == 0:
            continue

        in_domain = data_connector.get_domain_entity_matches(
            domain_id, feature_type, feature_values)
        if len(in_domain) == 0:
            continue

        tangelo.log("INSERTING DOMAIN ENTITIES")
        tangelo.log(feature_type)
        tangelo.log(in_domain)
        data_connector.insert_domain_entities(
            str(domain_id), encoded_url, feature_type, in_domain)

    browse_id = db.addBrowsePathData(
        team_id, domain_id, trail_id, encoded_url, userEmail)
    visit_count = db.getUrlCount(team_id, domain_id, trail_id, encoded_url)
    payload = dict(id=browse_id, count=visit_count)
    return json.dumps(payload)
Ejemplo n.º 3
0
def scrape_page(team_id, domain_id, trail_id, url, content, user_email):
    """Extract and export page features unless the host is blacklisted.

    ``content`` is URL-unquoted and UTF-8 encoded; ``url`` is UTF-8
    encoded. When the URL's network location is not in the list returned
    by ``config.get_extraction_blacklist()``, features are extracted,
    stored through the entity data connector, and the page is passed to
    ``export_to_services``. Blacklisted pages are neither extracted nor
    exported. The browse-path row is added in both cases.

    Returns a JSON string with keys ``id`` (from ``db.addBrowsePathData``)
    and ``count`` (from ``db.getUrlCount``).
    """
    page_content = urllib.unquote(content).encode('utf-8')
    page_url = url.encode('utf-8')

    entity_store = factory.get_entity_data_connector()

    # Hosts whose pages must not be run through extraction or export.
    blocked_hosts = config.get_extraction_blacklist()
    host = urlparse(page_url).netloc
    if host in blocked_hosts:
        tangelo.log("Url: %s IN blacklist"%page_url)
    else:
        extracted, extraction_errors = extractors.extractAll(page_content)
        for message in extraction_errors:
            tangelo.log("FEATURE EXTRACTION ERROR: " + message)

        for entity_type, entity_values in extracted.iteritems():
            entity_store.insert_entities(page_url, entity_type, entity_values)
            if len(entity_values) == 0:
                continue

            domain_matches = entity_store.get_domain_entity_matches(
                domain_id, entity_type, entity_values)
            if len(domain_matches) == 0:
                continue

            tangelo.log("INSERTING DOMAIN ENTITIES")
            tangelo.log(entity_type)
            entity_store.insert_domain_entities(
                str(domain_id), page_url, entity_type, domain_matches)

        # Export is skipped for blacklisted pages, so it lives in this branch.
        tangelo.log("Calling export")
        export_to_services(domain_id, team_id, trail_id, page_url,
                           page_content, user_email, extracted)

    row_id = db.addBrowsePathData(
        team_id, domain_id, trail_id, page_url, user_email)
    url_count = db.getUrlCount(team_id, domain_id, trail_id, page_url)
    return json.dumps(dict(id=row_id, count=url_count))
Ejemplo n.º 4
0
def scrape_page(html, url, userId, userName, trail, domain, org):
    """Extract entities from a page's HTML and record the visit.

    ``domain``, ``org`` and ``url`` are UTF-8 encoded; ``html`` is
    URL-unquoted first and then UTF-8 encoded. The extracted feature map
    is logged, each feature type is inserted through the entity data
    connector, and values returned by ``get_domain_entity_matches`` are
    inserted as domain entities. Extraction errors are logged after the
    features have been stored.

    Returns a JSON string with keys ``id`` (from ``db.addBrowsePathData``)
    and ``count`` (from ``db.getUrlCount``, the number of times this url
    appears in the database).
    """
    domain_name = domain.encode('utf-8')
    org_name = org.encode('utf-8')
    page_html = urllib.unquote(html).encode('utf-8')
    page_url = url.encode('utf-8')

    entity_store = factory.get_entity_data_connector()
    extracted, extraction_errors = extractors.extractAll(page_html)
    tangelo.log(extracted)

    for entity_type, entity_values in extracted.iteritems():
        entity_store.insert_entities(page_url, entity_type, entity_values)
        if len(entity_values) == 0:
            continue

        domain_matches = entity_store.get_domain_entity_matches(
            domain_name, entity_type, entity_values)
        if len(domain_matches) == 0:
            continue

        entity_store.insert_domain_entities(
            domain_name, page_url, entity_type, domain_matches)

    for message in extraction_errors:
        tangelo.log("FEATURE EXTRACTION ERROR: " + message)

    row_id = db.addBrowsePathData(
        org_name, page_url, userId, userName, trail, domain=domain_name)
    # Number of times this url appears in the database.
    url_count = db.getUrlCount(org_name, page_url, domain=domain_name)
    return json.dumps(dict(id=row_id, count=url_count))
Ejemplo n.º 5
0
def scrape_page(html, url, userId, userName, trail, domain, org):
    """Publish a visited page to Kafka and record it in the database.

    ``domain``, ``org`` and ``url`` are UTF-8 encoded; ``html`` is
    URL-unquoted and then UTF-8 encoded. The encoded values are sent with
    ``kafka_producer.sendVisitingMessage`` before the browse-path row is
    added.

    Returns a JSON string with keys ``id`` (from ``db.addBrowsePathData``)
    and ``count`` (from ``db.getUrlCount``, the number of times this url
    appears in the database).
    """
    domain_name = domain.encode('utf-8')
    org_name = org.encode('utf-8')
    page_html = urllib.unquote(html).encode('utf-8')
    page_url = url.encode('utf-8')

    kafka_producer.sendVisitingMessage(
        org_name, domain_name, str(userId), page_url, page_html)

    # Add the row to the database, then look up how often the url occurs.
    row_id = db.addBrowsePathData(
        org_name, page_url, userId, userName, trail, domain=domain_name)
    url_count = db.getUrlCount(org_name, page_url, domain=domain_name)
    return json.dumps(dict(id=row_id, count=url_count))
Ejemplo n.º 6
0
def scrape_page(html, url, userId, userName, trail, domain, org):
    """Send the page contents to Kafka and store the browse-path entry.

    All string arguments used downstream are UTF-8 encoded first
    (``html`` is URL-unquoted before encoding). The page is then posted
    through ``kafka_producer.sendVisitingMessage`` and a row is added with
    ``db.addBrowsePathData``.

    Returns a JSON string ``{"id": ..., "count": ...}`` where ``count`` is
    the value returned by ``db.getUrlCount`` (number of times this url
    appears in the database).
    """
    encoded_domain = domain.encode('utf-8')
    encoded_org = org.encode('utf-8')
    encoded_html = urllib.unquote(html).encode('utf-8')
    encoded_url = url.encode('utf-8')

    # Post the url contents to Kafka before touching the database.
    kafka_producer.sendVisitingMessage(
        encoded_org, encoded_domain, str(userId), encoded_url, encoded_html)

    browse_id = db.addBrowsePathData(
        encoded_org, encoded_url, userId, userName, trail,
        domain=encoded_domain)
    visit_count = db.getUrlCount(
        encoded_org, encoded_url, domain=encoded_domain)

    payload = dict(id=browse_id, count=visit_count)
    return json.dumps(payload)