def batch_extract_content(websiteElementsPath, urlData):
    """Crawl every URL in ``urlData["URL"]`` and collect its selected keyterms.

    For each URL the page is crawled, candidate keyterms are extracted,
    features are computed for them, and (when the candidate frame is not
    empty) a relevance filter picks the keyterms to keep.

    Args:
        websiteElementsPath: Passed unchanged to ``WebsiteDataExtractor``.
        urlData: Object indexable by ``"URL"`` whose value can be iterated
            and handed to ``pd.DataFrame`` (presumably a DataFrame of URLs;
            verify against the caller).

    Returns:
        A ``pd.DataFrame`` built from ``urlData["URL"]`` with an extra
        ``"keyterms"`` column holding, per URL, the selected keyterms joined
        with ``","`` (an empty string when there were no candidates).
    """
    # NOTE: single-argument print(...) emits the same text on Python 2 and 3.
    ## 1) Extract webpage data
    print("[INFO] ==== Extracting webpage data ====")
    crawler = WebsiteDataExtractor(websiteElementsPath)
    result = pd.DataFrame(urlData["URL"])
    joined_keyterms = []

    for page_url in urlData["URL"]:
        print(page_url)
        page_data = crawler.crawlPage(page_url)

        ## 2) Extract candidate keyterms
        print("[INFO] ==== Extracting candidate keyterms ====")
        extractor = KeyTermExtractor(page_data)
        extractor.execute()

        ## 3) Compute candidate keyterm features
        print("[INFO] ==== Computing candidate keyterm features ====")
        candidates = KeyTermFeatures(
            page_url,
            page_data,
            extractor.result_dict,
            lang=utils.LANG_FR,
        ).compute_features()

        if candidates.empty:
            # Nothing to rank: this URL contributes an empty keyterm string.
            top_terms = []
        else:
            ## 4) Filter for relevancy, asking the filter for topk=10 keyterms
            print("[INFO] ==== Selecting relevant keyterms ====")
            top_terms = RelevanceFilter(
                candidates,
                "dataset/keyterm-classifier-model-v2.pickle",
                topk=10,
            ).select_relevant()

        joined_keyterms.append(",".join(top_terms))

    # One entry was appended per URL, so the list lines up with the frame rows.
    result["keyterms"] = joined_keyterms
    return result
from keyterm_features import KeyTermFeatures
from keyterm_classifier import RelevanceFilter


if __name__ == "__main__":
    # Demo run: crawl one article and dump the candidate keyterms produced by
    # both extractors so their outputs can be compared by eye.
    # NOTE: single-argument print(...) emits the same text on Python 2 and 3.
    url = 'http://www.generation-nt.com/blackview-a8-smartphone-petit-budget-pas-cher-mwc-2016-actualite-1925283.html'

    ## 1) Extract webpage data
    print("[INFO] ==== Extracting webpage data ====")
    data_extractor = WebsiteDataExtractor("dataset/WebsiteElementsPathDef.xml")
    data_dict = data_extractor.crawlPage(url)

    ## 2) Extract candidate keyterms with each extractor, in turn
    print("[INFO] ==== Extracting candidate keyterms ====")
    keyterm_extractor = KeyTermExtractor(data_dict)
    keyterm_extractor.execute()

    keyterm_extractor2 = KeyTermExtractor2(data_dict, lang="french")
    keyterm_extractor2.execute()

    ## 3) Show what each extractor found
    print("======== Results from Extractor 1 ========")
    pprint.pprint(keyterm_extractor.result_dict)
    # Debug: per-n-gram candidate counts for extractor 1.
    # print "Nr t1grams: " + str(len(keyterm_extractor.result_dict['t1gram']['term']))
    # print "Nr t2grams: " + str(len(keyterm_extractor.result_dict['t2gram']['term']))
    # print "Nr t3grams: " + str(len(keyterm_extractor.result_dict['t3gram']['term']))
    # print "Nr t4grams: " + str(len(keyterm_extractor.result_dict['t4gram']['term']))

    print("======== Results from Extractor 2 ========")
    pprint.pprint(keyterm_extractor2.result_dict)
    # Debug: distinct per-n-gram candidate counts for extractor 2.
    # print "Nr t1grams: " + str(len(set(keyterm_extractor2.result_dict['t1gram'])))
    # print "Nr t2grams: " + str(len(set(keyterm_extractor2.result_dict['t2gram'])))