def batch_extract_content(websiteElementsPath, urlData):
    ## 1) Extract webpage data
    print "[INFO] ==== Extracting webpage data ===="
    data_extractor = WebsiteDataExtractor(websiteElementsPath)

    out = pd.DataFrame(urlData["URL"])
    keyterms = []

    for url in urlData["URL"]:
        print url
        data_dict = data_extractor.crawlPage(url)

        ## 2) Extract candidate keyterms
        print "[INFO] ==== Extracting candidate keyterms ===="
        keyterm_extractor = KeyTermExtractor(data_dict)
        keyterm_extractor.execute()
        #print keyterm_extractor.result_dict

        ## 3) Compute candidate keyterm features
        print "[INFO] ==== Computing candidate keyterm features ===="
        keyterm_feat = KeyTermFeatures(url, data_dict, keyterm_extractor.result_dict, lang=utils.LANG_FR)
        candidate_keyterm_df = keyterm_feat.compute_features()

        selected_keyterms = []
        if not candidate_keyterm_df.empty:
            ## 4) Filter for relevancy and output top 10 keyterms
            print "[INFO] ==== Selecting relevant keyterms ===="
            relevance_filter = RelevanceFilter(candidate_keyterm_df, "dataset/keyterm-classifier-model-v2.pickle", topk=10)
            selected_keyterms = relevance_filter.select_relevant()

        keyterms.append(",".join(selected_keyterms))

    out["keyterms"] = keyterms
    return out
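
# Minimal usage sketch for batch_extract_content: it expects a pandas DataFrame
# with a "URL" column and returns that frame with a "keyterms" column appended.
# The CSV file names below are hypothetical; only the path definition XML is the
# one used elsewhere in this repo.
def example_batch_run():
    import pandas as pd

    url_data = pd.read_csv("dataset/url_batch.csv")  # hypothetical CSV with a "URL" column
    result = batch_extract_content("dataset/WebsiteElementsPathDef.xml", url_data)
    result.to_csv("dataset/url_batch_keyterms.csv", index=False, encoding="utf-8")  # hypothetical output path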
def create_raw_dataset(output_filename):
    df_raw = read_raw_data("dataset/preProc2_lower.json")
    page_scraper = WebsiteDataExtractor("dataset/WebsiteElementsPathDef.xml")

    # training dataset is made up only of pages published in 2015 or later
    df_dataset = df_raw.loc[df_raw['dateTime'].map(lambda x: x.year >= 2015)]

    # get all URLs
    all_urls = [x[1] for x in df_dataset['link'].iteritems()]

    # get urls common with grapeshot - these will be our test set
    test_urls = None
    with open("dataset/extracted_terms_grapeshot_common_v3.json") as fp:
        d = json.load(fp, encoding="utf-8")
        test_urls = d.keys()

    train_urls = [x for x in all_urls if x not in test_urls]
    train_urls = random.sample(train_urls, 4 * len(test_urls))

    # dataset urls are test + train
    dataset_urls = test_urls + train_urls
    dataset_dict = {}

    idx = 1
    for url in dataset_urls:
        print "[INFO] " + str(idx) + " :: Parsing URL: " + url
        page_data = page_scraper.crawlPage(url)
        dataset_dict[url] = page_data
        idx += 1

    with open(output_filename, mode="w") as fp:
        json.dump(dataset_dict, fp, encoding="utf-8")

    with open("dataset/test_url_list.json", mode="w") as fp:
        json.dump(test_urls, fp, encoding="utf-8")

    with open("dataset/train_url_list.json", mode="w") as fp:
        json.dump(train_urls, fp, encoding="utf-8")

    print "[INFO] Page scraping dataset created."
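
# Usage sketch: build the scraped-page dataset and the train/test URL splits.
# The output file name is a hypothetical choice; create_raw_dataset also writes
# dataset/test_url_list.json and dataset/train_url_list.json as side effects.
def example_create_dataset():
    import json

    create_raw_dataset("dataset/scraped_pages_2015.json")  # hypothetical output file name
    with open("dataset/scraped_pages_2015.json") as fp:
        pages = json.load(fp, encoding="utf-8")
    print "[INFO] Scraped %d pages" % len(pages)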
def get_candidate_keyterms_dataset(output_file, url_list):
    from website_data_extractor import WebsiteDataExtractor
    from keyterm_extractor import KeyTermExtractor2

    data_scraper = WebsiteDataExtractor("dataset/WebsiteElementsPathDef.xml")
    # `tagger` is expected to be a module-level TreeTagger instance defined elsewhere
    candidate_extractor = KeyTermExtractor2(tagger, lang="french")
    candidate_extractor.initialize()

    dataset_dict = {}
    for link in url_list:
        print "Processing URL: " + link
        data_dict = data_scraper.crawlPage(link)
        candidate_extractor.execute(data_dict)
        dataset_dict[link] = candidate_extractor.candidates

    candidate_extractor.cleanup()

    with open(output_file, "w") as fp:
        json.dump(dataset_dict, fp)
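
# Usage sketch: extract candidate keyterms for the training URLs written by
# create_raw_dataset above. The output file name here is a hypothetical choice;
# dataset/train_url_list.json is the path produced by create_raw_dataset.
def example_candidate_extraction():
    import json

    with open("dataset/train_url_list.json") as fp:
        train_urls = json.load(fp, encoding="utf-8")

    # note: get_candidate_keyterms_dataset relies on the module-level `tagger`
    # (TreeTagger) instance being set up beforehand
    get_candidate_keyterms_dataset("dataset/train_candidate_keyterms.json", train_urls)  # hypothetical output file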
__author__ = "alex" import pprint import utils.functions as utils from website_data_extractor import WebsiteDataExtractor from keyterm_extractor import KeyTermExtractor, KeyTermExtractor2 from keyterm_features import KeyTermFeatures from keyterm_classifier import RelevanceFilter if __name__ == "__main__": url = 'http://www.generation-nt.com/blackview-a8-smartphone-petit-budget-pas-cher-mwc-2016-actualite-1925283.html' ## 1) Extract webpage data print "[INFO] ==== Extracting webpage data ====" data_extractor = WebsiteDataExtractor("dataset/WebsiteElementsPathDef.xml") data_dict = data_extractor.crawlPage(url) ## 2) Extract candidate keyterms print "[INFO] ==== Extracting candidate keyterms ====" keyterm_extractor = KeyTermExtractor(data_dict) keyterm_extractor.execute() keyterm_extractor2 = KeyTermExtractor2(data_dict, lang="french") keyterm_extractor2.execute() print "======== Results from Extractor 1 ========" pprint.pprint(keyterm_extractor.result_dict) # print "Nr t1grams: " + str(len(keyterm_extractor.result_dict['t1gram']['term'])) # print "Nr t2grams: " + str(len(keyterm_extractor.result_dict['t2gram']['term'])) # print "Nr t3grams: " + str(len(keyterm_extractor.result_dict['t3gram']['term']))
class KeytermServerExtractor(object):
    def __init__(self, port = 8080, lang = utils.LANG_FR, topk = 10):
        print "Initializing Term Extractor Server"

        ## setup server port
        self.port = port
        self.topk = topk

        ## setup keyterm extraction service language
        self.lang = lang
        self.lang_abrev = utils.LANG_ABREV[lang]

        ## setup http request handling classes
        self.server_class = HTTPServer
        self.handler_class = makeServerHandlerClass(self)

        ## setup logging
        root_log = logging.getLogger()
        root_log.setLevel(logging.ERROR)

        stdout_log = logging.StreamHandler(sys.stdout)
        stdout_log.setLevel(logging.ERROR)

        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        stdout_log.setFormatter(formatter)
        root_log.addHandler(stdout_log)

        ## initialize keyterm extraction service modules
        self._initialize()

    def _initialize(self):
        self.tagger = ttw.TreeTagger(TAGLANG=self.lang_abrev, TAGDIR=KeyTermExtractor2.TREETAGGER_DIR)
        self.data_scraper = WebsiteDataExtractor("dataset/WebsiteElementsPathDef.xml")

        self.candidate_extractor = KeyTermExtractor2(self.tagger, lang = self.lang)
        self.candidate_extractor.initialize()

        self.feature_extractor = KeyTermFeatures2(self.tagger, lang = self.lang)

        #self.relevance_filter = RelevanceFilter("dataset/keyterm-classifier-model-v3.pickle", topk = self.topk)
        #self.relevance_filter = RelevanceFilter("dataset/keyterm-classifier-model-updated.pickle", topk = self.topk)
        self.relevance_filter = RelevanceFilter("dataset/keyterm-classifier-model-general.pickle", topk = self.topk)

        self.keytermClassifier = KeytermClassification(
            classesFile="dataset/top10-keywords-ecommerce-filtered.txt",
            classesClusterPath="dataset/keyterm_clustering/top_adv_keyterm_clusters.dump")

    def _cleanup(self):
        self.tagger = None
        self.data_scraper.cleanup()
        self.candidate_extractor.cleanup()
        self.feature_extractor.cleanup()
        self.relevance_filter.cleanup()

    def runServer(self):
        server_address = ('', self.port)
        httpd = self.server_class(server_address, self.handler_class)
        print 'Starting httpd...'
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            self._cleanup()
            sys.exit(0)
        except Exception as ex:
            logging.getLogger().exception("Error in keyterm extraction!")
            sys.exit(0)

    def extracTermsFromLink(self, link):
        default_return = {
            "available_domains": ["http://www.generation-nt.com/", "http://www.maison.com/",
                                  "http://www.journaldugeek.com/", "http://www.journaldugamer.com/",
                                  "http://www.jdubuzz.com/", "http://news.pixelistes.com/",
                                  "http://www.societe.com/", "http://www.pausecafein.fr/",
                                  "http://worldofwarcraft.judgehype.com/news/", "http://hearthstone.judgehype.com/news/",
                                  "http://diablo3.judgehype.com/news/", "http://www.judgehype.com/news/",
                                  "http://www.jeuxonline.info", "http://heroes.judgehype.com/news/",
                                  "http://overwatch.judgehype.com/news/", "http://film-warcraft.judgehype.com/news/",
                                  "http://judgehype.com/", "http://portail.free.fr/",
                                  "http://www.planet.fr/", "http://aliceadsl.closermag.fr/",
                                  "http://aliceadsl.lemonde.fr/", "http://aliceadsl.gqmagazine.fr/"],
            "defaultPath": False,
            "dataIntegrity": False,
            "keyTerms": []}

        try:
            ## 1) Extract webpage data
            print "[INFO] ==== Extracting webpage data ===="
            data_dict = self.data_scraper.crawlPage(link)

            default_return["defaultPath"] = data_dict["defaultPath"]
            default_return["dataIntegrity"] = data_dict["dataIntegrity"]

            if data_dict["defaultPath"] or not data_dict["dataIntegrity"]:
                return default_return
            #pprint.pprint(data_dict)

            ## 2) Extract candidate keyterms
            print "[INFO] ==== Extracting candidate keyterms ===="
            self.candidate_extractor.execute(data_dict)
            # print keyterm_extractor.result_dict

            ## 3) Compute candidate keyterm features
            print "[INFO] ==== Computing candidate keyterm features ===="
            candidate_keyterm_df = self.feature_extractor.compute_features(link, data_dict, self.candidate_extractor.candidates)

            ## 4) Filter for relevancy and output top 10 keyterms
            print "[INFO] ==== Selecting relevant keyterms ===="
            selected_keyterms = self.relevance_filter.select_relevant(candidate_keyterm_df, self.candidate_extractor.candidates)
            # print "[INFO] ==== FINAL SELECTION ====="

            default_return["keyTerms"] = selected_keyterms
            return default_return
        except:
            return default_return

    def extractTermsFromText(self, text):
        default_return = {
            "available_domains": ["http://www.generation-nt.com/", "http://www.maison.com/",
                                  "http://www.journaldugeek.com/", "http://www.journaldugamer.com/",
                                  "http://www.jdubuzz.com/", "http://news.pixelistes.com/",
                                  "http://www.societe.com/", "http://www.pausecafein.fr/",
                                  "http://worldofwarcraft.judgehype.com/news/", "http://hearthstone.judgehype.com/news/",
                                  "http://diablo3.judgehype.com/news/", "http://www.judgehype.com/news/",
                                  "http://www.jeuxonline.info", "http://heroes.judgehype.com/news/",
                                  "http://overwatch.judgehype.com/news/", "http://film-warcraft.judgehype.com/news/",
                                  "http://judgehype.com/", "http://portail.free.fr/",
                                  "http://www.planet.fr/", "http://aliceadsl.closermag.fr/",
                                  "http://aliceadsl.lemonde.fr/", "http://aliceadsl.gqmagazine.fr/"],
            "defaultPath": False,
            "dataIntegrity": False,
            "keyTerms": []}

        try:
            candidate_keyterms = self.candidate_extractor.execute_with_snippet(text)
            keyterms = self.filter_candidates_from_snippet(candidate_keyterms)

            default_return["keyTerms"] = keyterms
            return default_return
        except:
            return default_return

    def recommendKeytermsForBase(self, link):
        default_return = {
            "type": "Recommendations based on clustering.",
            "text_used_from_link": ["title", "description", "keywords", "urlTokens"],
            "keyTerms_recommandations": []}

        try:
            ## 1) Extract webpage data
            print "[INFO] ==== Extracting webpage data USING specific PathDef ===="
            data_dict = self.data_scraper.crawlPage(link, elementsPathDef="baseCluster")

            # check integrity of the crawled data
            if len(data_dict) <= 0:
                return default_return
            # #TEST
            # default_return["crawled_data"] = data_dict

            # Simple extraction of possible terms (not using the trained model):
            # concatenate all components into one text with sentence separation
            text_for_analysis = u''
            for key, value in data_dict.iteritems():
                if isinstance(value, basestring):
                    text_for_analysis = text_for_analysis + ". " + value + ". "
                elif isinstance(value, list):
                    text_for_analysis = text_for_analysis + ". ".join(value)
            # #TEST
            # default_return["text_for_analysis"] = text_for_analysis
            #pprint.pprint(data_dict)

            ## 2) Extract candidate keyterms
            print "[INFO] ==== Extracting candidate keyterms ===="
            candidates = self.candidate_extractor.execute_with_snippet(text_for_analysis)
            # SHOW CANDIDATES
            # default_return["keyTerms_candidates"] = candidates

            ## 3) Compute keyterm recommendations comparing cluster centroids
            print "[INFO] ==== Computing keyterm recommendations ===="
            orig_list, keyterm_recommendations = self.keytermClassifier.match_adv_keyterm_clusters_base(candidates, min_similarity_threshold=0.5)
            # print "[INFO] ==== FINAL SELECTION ====="

            default_return["keyTerms_recommandations"] = keyterm_recommendations
            return default_return
        except:
            return default_return

    def recommendKeytermsSimple(self, link):
        default_return = {
            "type": "Simple recommendation based only on individual extracted keyterms.",
            "defaultPath": False,
            "dataIntegrity": False,
            "keyTerms_recommandations": []}

        try:
            ## 1) Extract webpage data
            print "[INFO] ==== Extracting Terms From Link ===="
            keyterms = self.extracTermsFromLink(link)["keyTerms"]

            if len(keyterms) <= 0:
                return default_return

            ## 2) Compute keyterm recommendations comparing cluster centroids
            print "[INFO] ==== Computing keyterm recommendations ===="
            orig_list, keyterm_recommendations = self.keytermClassifier.match_adv_keyterm_website(keyterms, min_similarity_threshold=0.5, min_diff_distance=0.90, top=5)

            print "[INFO] ==== FINAL SELECTION ====="
            default_return["keyTerms_recommandations"] = list(keyterm_recommendations)
            return default_return
        except:
            return default_return

    def filter_candidates_from_snippet(self, candidate_keyterms):
        from functools import cmp_to_key

        ordered_keyterms = sorted(candidate_keyterms.itervalues(), key = lambda item: item['cvalue'], reverse = True)
        selected_keyterms = [item for item in ordered_keyterms if item['cvalue'] > 0]

        def pos_cmp(keyterm1, keyterm2):
            if not "NAM" in keyterm1['pos'] and "NAM" in keyterm2['pos']:
                return -1
            elif "NAM" in keyterm1['pos'] and "NAM" not in keyterm2['pos']:
                return 1
            else:
                return 0

        filtered_keyterms = sorted(selected_keyterms, key=cmp_to_key(pos_cmp), reverse=True)
        keyterms = [{'term': " ".join(t['words']), 'cvalue': t['cvalue'], 'lemma': t['lemma_string'], 'pos_tag': t['pos']} for t in filtered_keyterms]

        return keyterms