def scrape_page(team_id, domain_id, trail_id, url, content, userEmail):
    """Extract entities from a scraped page, persist them, and record the visit.

    Args:
        team_id, domain_id, trail_id: identifiers scoping the browse path.
        url: page url (percent-decoded upstream; re-encoded to utf-8 here).
        content: percent-encoded page content to run extraction over.
        userEmail: user attributed to this browse-path entry.

    Returns:
        JSON string with the new browse-path row ``id`` and the ``count`` of
        times this url has been recorded for the team/domain/trail.
    """
    content = urllib.unquote(content).encode('utf-8')
    url = url.encode('utf-8')
    connector = factory.get_entity_data_connector()
    (features, errors) = extractors.extractAll(content)
    for error in errors:
        tangelo.log("FEATURE EXTRACTION ERROR: " + error)
    # 'entity_type' (not 'type') so we don't shadow the builtin.
    for entity_type, values in features.iteritems():
        connector.insert_entities(url, entity_type, values)
        if len(values) > 0:
            features_in_domain = connector.get_domain_entity_matches(
                domain_id, entity_type, values)
            if len(features_in_domain) > 0:
                tangelo.log("INSERTING DOMAIN ENTITIES")
                tangelo.log(entity_type)
                tangelo.log(features_in_domain)
                connector.insert_domain_entities(
                    str(domain_id), url, entity_type, features_in_domain)
    # 'path_id' (not 'id') so we don't shadow the builtin.
    path_id = db.addBrowsePathData(team_id, domain_id, trail_id, url, userEmail)
    count = db.getUrlCount(team_id, domain_id, trail_id, url)
    result = dict(id=path_id, count=count)
    return json.dumps(result)
def scrape_page(team_id, domain_id, trail_id, url, content, userEmail):
    """Extract entities from a scraped page, persist them, and record the visit.

    Args:
        team_id, domain_id, trail_id: identifiers scoping the browse path.
        url: page url (re-encoded to utf-8 here).
        content: percent-encoded page content to run extraction over.
        userEmail: user attributed to this browse-path entry.

    Returns:
        JSON string with the new browse-path row ``id`` and the ``count`` of
        times this url has been recorded for the team/domain/trail.
    """
    content = urllib.unquote(content).encode('utf-8')
    url = url.encode('utf-8')
    connector = factory.get_entity_data_connector()
    (features, errors) = extractors.extractAll(content)
    for error in errors:
        tangelo.log("FEATURE EXTRACTION ERROR: " + error)
    # 'entity_type' (not 'type') so we don't shadow the builtin.
    for entity_type, values in features.iteritems():
        connector.insert_entities(url, entity_type, values)
        if len(values) > 0:
            features_in_domain = connector.get_domain_entity_matches(
                domain_id, entity_type, values)
            if len(features_in_domain) > 0:
                tangelo.log("INSERTING DOMAIN ENTITIES")
                tangelo.log(entity_type)
                tangelo.log(features_in_domain)
                connector.insert_domain_entities(
                    str(domain_id), url, entity_type, features_in_domain)
    # 'path_id' (not 'id') so we don't shadow the builtin.
    path_id = db.addBrowsePathData(team_id, domain_id, trail_id, url, userEmail)
    count = db.getUrlCount(team_id, domain_id, trail_id, url)
    result = dict(id=path_id, count=count)
    return json.dumps(result)
def scrape_page(team_id, domain_id, trail_id, url, content, user_email):
    """Extract and export entities from a scraped page, then record the visit.

    Pages whose netloc appears in the extraction blacklist are neither
    extracted from nor exported; the browse-path row is recorded either way.

    Args:
        team_id, domain_id, trail_id: identifiers scoping the browse path.
        url: page url (re-encoded to utf-8 here).
        content: percent-encoded page content to run extraction over.
        user_email: user attributed to this browse-path entry.

    Returns:
        JSON string with the new browse-path row ``id`` and the ``count`` of
        times this url has been recorded for the team/domain/trail.
    """
    content = urllib.unquote(content).encode('utf-8')
    url = url.encode('utf-8')
    connector = factory.get_entity_data_connector()
    # blacklist of pages to not extract data from
    blacklist = config.get_extraction_blacklist()
    if urlparse(url).netloc not in blacklist:
        (features, errors) = extractors.extractAll(content)
        for error in errors:
            tangelo.log("FEATURE EXTRACTION ERROR: " + error)
        # 'entity_type' (not 'type') so we don't shadow the builtin.
        for entity_type, values in features.iteritems():
            connector.insert_entities(url, entity_type, values)
            if len(values) > 0:
                features_in_domain = connector.get_domain_entity_matches(
                    domain_id, entity_type, values)
                if len(features_in_domain) > 0:
                    tangelo.log("INSERTING DOMAIN ENTITIES")
                    tangelo.log(entity_type)
                    connector.insert_domain_entities(
                        str(domain_id), url, entity_type, features_in_domain)
        # we also don't want to export blacklisted pages.
        tangelo.log("Calling export")
        export_to_services(domain_id, team_id, trail_id, url, content,
                           user_email, features)
    else:
        tangelo.log("Url: %s IN blacklist" % url)
    # 'path_id' (not 'id') so we don't shadow the builtin.
    path_id = db.addBrowsePathData(team_id, domain_id, trail_id, url, user_email)
    count = db.getUrlCount(team_id, domain_id, trail_id, url)
    result = dict(id=path_id, count=count)
    return json.dumps(result)
def scrape_page(html, url, userId, userName, trail, domain, org):
    """Extract entities from a visited page, persist them, and record the visit.

    Args:
        html: percent-encoded page markup to run extraction over.
        url: page url (re-encoded to utf-8 here).
        userId, userName: user attributed to this browse-path entry.
        trail: trail the visit belongs to.
        domain, org: scoping identifiers (re-encoded to utf-8 here).

    Returns:
        JSON string with the new browse-path row ``id`` and the ``count`` of
        times this url appears in the database for the org/domain.
    """
    domain = domain.encode('utf-8')
    org = org.encode('utf-8')
    html = urllib.unquote(html).encode('utf-8')
    url = url.encode('utf-8')
    connector = factory.get_entity_data_connector()
    (features, errors) = extractors.extractAll(html)
    tangelo.log(features)
    # 'entity_type' (not 'type') so we don't shadow the builtin.
    for entity_type, values in features.iteritems():
        connector.insert_entities(url, entity_type, values)
        if len(values) > 0:
            features_in_domain = connector.get_domain_entity_matches(
                domain, entity_type, values)
            if len(features_in_domain) > 0:
                connector.insert_domain_entities(
                    domain, url, entity_type, features_in_domain)
    for error in errors:
        tangelo.log("FEATURE EXTRACTION ERROR: " + error)
    # 'path_id' (not 'id') so we don't shadow the builtin.
    path_id = db.addBrowsePathData(org, url, userId, userName, trail,
                                   domain=domain)
    # get number of times this url appears in the database
    count = db.getUrlCount(org, url, domain=domain)
    result = dict(id=path_id, count=count)
    return json.dumps(result)
def scrape_page(html, url, userId, userName, trail, domain, org):
    """Publish a page visit to kafka and record it in the browse-path table.

    Returns a JSON string with the inserted row ``id`` and the ``count`` of
    times this url appears in the database for the org/domain.
    """
    domain = domain.encode('utf-8')
    org = org.encode('utf-8')
    html = urllib.unquote(html).encode('utf-8')
    url = url.encode('utf-8')

    # Hand the raw page contents to kafka for downstream consumers.
    kafka_producer.sendVisitingMessage(org, domain, str(userId), url, html)

    # Persist the visit, then look up how often this url has been seen.
    row_id = db.addBrowsePathData(org, url, userId, userName, trail,
                                  domain=domain)
    visit_count = db.getUrlCount(org, url, domain=domain)

    return json.dumps(dict(id=row_id, count=visit_count))