from urllib.parse import urljoin, urlparse

def extract_links(self, html):
    # Walk every anchor tag and keep only links that stay on the site.
    for anchor in html.findAll('a'):
        uri = urlparse(anchor['href'])

        # Internal links carry either the site's domain or no domain at all.
        if uri.netloc in ('neupy.com', ''):
            url = urljoin(self.url, uri.path)

            if uri.fragment:
                url = url + "#" + uri.fragment

            yield Link(uri=url, text=anchor.text)
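The generator above assumes two things defined elsewhere: a Link container with uri and text fields (text must be optional, since the indexing code below builds Link(document.uri) from a URI alone) and an HTML tree with a findAll method, as provided by BeautifulSoup. A minimal sketch of those pieces follows; the SimpleNamespace stand-in for the crawler object and the sample markup are illustrative assumptions, not part of the original code:

from collections import namedtuple
from types import SimpleNamespace

from bs4 import BeautifulSoup

# Assumed definition: text defaults to None so Link(document.uri) works.
Link = namedtuple('Link', ['uri', 'text'], defaults=(None,))

# Stand-in for the object that owns extract_links; it only needs a .url.
crawler = SimpleNamespace(url='http://neupy.com/docs/index.html')

html = BeautifulSoup('<a href="/pages/home.html#intro">Home</a>', 'html.parser')
print(list(extract_links(crawler, html)))
# [Link(uri='http://neupy.com/pages/home.html#intro', text='Home')]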
import logging
from collections import Counter

import nltk

vocabulary = {}             # term -> term id (column in the term-document matrix)
term_frequency = Counter()  # term id -> total number of occurrences
indices = []                # term id of every token occurrence
data = []                   # matrix value per occurrence (always 1 here)

logging.info("Collecting documents")
all_documents = collect_documents(SITE_DIR)

logging.info("Defining relations between documents")
webgraph = WebPageGraph.create_from_documents(all_documents)

for document in all_documents:
    logging.debug('Processing "%s"', document.uri)

    # Normalize the text a little before tokenization.
    text = document.text.lower().replace('.', ' ').replace('=', ' ')

    # Anchor texts of pages that link to this document are
    # treated as part of the document itself.
    anchor_texts = []
    for _, link in webgraph.page_linked_by(Link(document.uri)):
        if link.text:
            anchor_texts.append(link.text)

    text = ' '.join([text] + anchor_texts)

    for term in nltk.word_tokenize(text):
        # Assign a fresh id to every previously unseen term.
        if term not in vocabulary:
            vocabulary[term] = len(vocabulary)

        termid = vocabulary[term]
        term_frequency[termid] += 1

        indices.append(termid)
        data.append(1)
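The indices and data lists collected above are exactly the ingredients of a sparse term-document matrix in CSR form, except for the row boundaries. Below is a toy sketch of that assembly, assuming an indptr list that records where each document's entries start; that bookkeeping is not shown in the loop above:

from scipy import sparse

# Toy example: two documents over a three-term vocabulary.
data = [1, 1, 1, 1]     # one entry per token occurrence
indices = [0, 1, 1, 2]  # term id of each occurrence
indptr = [0, 2, 4]      # indices[indptr[d]:indptr[d + 1]] belongs to document d

matrix = sparse.csr_matrix((data, indices, indptr), shape=(2, 3))
print(matrix.toarray())
# [[1 1 0]
#  [0 1 1]]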