def get_text(self, text_type):
    """
    Return extracted text from the html, extracting it first if necessary.

    NOTE: this function's flow can be confusing because it does not only
    extract text but also caches the extracted text for each text_type.

    Parameters
    ----------
    text_type : string
        One of 'body', 'meta' or 'title'.
    """
    if not self.html:
        return ''
    if text_type == 'body':
        if not self.body:
            self.body = Text_Extractor.extract_body(self.html)
            self.body = URLUtility.clean_text(self.body)
        return self.body
    elif text_type == 'meta':
        if not self.meta:
            # The original called extract_body here, which looks like a
            # copy-paste slip; extract_meta is assumed to exist alongside it.
            self.meta = Text_Extractor.extract_meta(self.html)
            self.meta = URLUtility.clean_text(self.meta)
        return self.meta
    elif text_type == 'title':
        if not self.title:
            # Same assumption as above: extract_title for the title text.
            self.title = Text_Extractor.extract_title(self.html)
            self.title = URLUtility.clean_text(self.title)
        return self.title
    else:
        print "Wrong text_type: ", text_type
        return ''
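# Usage sketch (illustrative only; `page`, its constructor, and the fetching
# step are hypothetical stand-ins, not names from this module). It shows the
# caching behavior above: the first call per text_type extracts and cleans,
# later calls return the cached attribute without touching the HTML again.
#
#   page = Page(url)                    # hypothetical Page instance
#   page.html = fetched_html            # raw HTML obtained elsewhere
#   body = page.get_text('body')        # extracts, cleans, caches in self.body
#   body_again = page.get_text('body')  # cache hit, no re-extraction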
def _extract_keywords(self, sites, k=10):
    """
    Extract the top k most frequent keywords (unigrams and bigrams),
    skipping any keyword that was selected in a previous call.
    """
    stop = set(stopwords.words('english'))
    counter = Counter()
    for site in sites:
        for p in site:
            text = p.get_text('meta')
            text = URLUtility.clean_text(text)
            words = nltk.word_tokenize(text)
            words = [word for word in words
                     if word not in stop and len(word) > 2]
            bigram_words = [words[i] + ' ' + words[i + 1]
                            for i in xrange(len(words) - 1)]
            counter += Counter(words + bigram_words)

    # Take extra candidates so that previously selected keywords can be
    # skipped while still returning up to k fresh ones.
    top_words = counter.most_common(k + len(self.keywords))
    result = []  # list of keywords to return
    i = 0
    while len(result) < k and i < len(top_words):
        if top_words[i][0] not in self.keywords:
            result.append(top_words[i][0])
            self.keywords.add(top_words[i][0])
        i += 1
    print " List of selected keywords: ", result
    return result
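# Behavior sketch for the deduplication across calls (hedged: `ranker` is
# assumed to be an instance of this class with self.keywords starting as an
# empty set; `sites` is an iterable of sites, each an iterable of pages):
#
#   first = ranker._extract_keywords(sites, k=10)
#   second = ranker._extract_keywords(sites, k=10)
#   assert not set(first) & set(second)  # earlier picks live in self.keywords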
def extract_keywords(sites, k=10):
    """
    Extract the top k most frequent keywords from the meta text of all pages.
    """
    stop = set(stopwords.words('english'))
    counter = Counter()
    for site in sites:
        for p in site:
            text = p.get_text('meta')
            text = URLUtility.clean_text(text)
            words = word_tokenize(text)
            words = [word for word in words
                     if word not in stop and len(word) > 2]
            counter += Counter(words)

    # Keep only words seen more than once, then take the k most frequent.
    # heapq.nlargest accepts any iterable, so no heapify call is needed.
    counter = [(counter[w], w) for w in counter if counter[w] > 1]
    topk = heapq.nlargest(k, counter)
    print "Top extracted keywords: ", topk
    return [w[1] for w in topk]
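# Minimal demo of extract_keywords (a sketch under assumptions: _StubPage and
# the toy corpus are hypothetical, the __main__ guard keeps it out of imports,
# and the module's own dependencies (nltk with its corpora, URLUtility) resolve).
if __name__ == '__main__':
    class _StubPage(object):
        """Bare-bones stand-in exposing the get_text() interface used above."""
        def __init__(self, text):
            self._text = text

        def get_text(self, text_type):
            return self._text

    demo_sites = [[_StubPage('focused crawling focused crawling pages'),
                   _StubPage('crawling ranks frequent frequent terms')]]
    # Words that appear more than once across the toy pages are returned,
    # most frequent first, e.g. ['crawling', 'frequent', 'focused'].
    print extract_keywords(demo_sites, k=3)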