def vectorize(self):
    """
    Returns a pair of vectors representing this article.

    Articles are represented by:
        (bag of words vector, entities vector)
    """
    if self.vectors is None:
        bow_vec = vectorize(self.text)
        ent_vec = vectorize(' '.join(entities(self.text)))
        self.vectors = [bow_vec, ent_vec]
    return self.vectors
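# A minimal, runnable sketch of the caching behaviour above, assuming an
# Article-like object with a `text` attribute and a `vectors` slot. The real
# `vectorize`/`entities` helpers are not shown in this excerpt, so simple
# stand-ins are used here purely for illustration.

class _ArticleSketch:
    def __init__(self, text):
        self.text = text
        self.vectors = None

    def vectorize(self):
        if self.vectors is None:
            bow_vec = self.text.lower().split()                      # stand-in for vectorize(text)
            ent_vec = [w for w in self.text.split() if w.istitle()]  # stand-in for entities(text)
            self.vectors = [bow_vec, ent_vec]
        return self.vectors

a = _ArticleSketch('Senator Doe spoke in Washington on Tuesday.')
assert a.vectorize() is a.vectors  # the second call reuses the cached vectors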
def classify(self, docs, num_topics=5):
    """
    Classify a list of documents.

    Args:
        | docs (list) -- the documents to classify (a list of strings)
        | num_topics (int) -- number of top predicted topics to return for each doc.

    Returns:
        | list -- the list of lists of document topics.
    """
    docs_ = vectorize(docs)

    # Returns a 2d array, where each inner array is
    # a list of probabilities, one per label.
    probs = self.clf.predict_proba(docs_)

    # This sorts the *indices* of the inner arrays, instead of the actual values.
    # These indices correspond with labels, ordered from lowest to highest probability.
    probs_sorted = probs.argsort()

    # Slice each inner array to get the indices of the `num_topics` top probabilities.
    probs_top = probs_sorted[:, -num_topics:]

    # Convert the indices to the actual labels, and return.
    return [self.clf.classes_[prob_indices] for prob_indices in probs_top]
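# A standalone sketch of the argsort/slice step above, showing how the
# top-`num_topics` labels fall out of `predict_proba`-style output.
# The probability values and label names here are made up for illustration.
import numpy as np

probs = np.array([
    [0.05, 0.60, 0.10, 0.25],   # probabilities for doc 1
    [0.40, 0.10, 0.30, 0.20],   # probabilities for doc 2
])
classes = np.array(['politics', 'sports', 'tech', 'world'])

num_topics = 2
probs_top = probs.argsort()[:, -num_topics:]   # indices of the 2 highest probabilities per row
print([classes[idx] for idx in probs_top])
# [array(['world', 'sports'], ...), array(['tech', 'politics'], ...)]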
def train(self, docs, labels):
    """
    Train the classifier with documents and labels.

    The training can be online. That is, an existing classifier
    can be updated with new training data.

    Args:
        | docs (list) -- the documents to train on (a list of strings)
        | labels (list) -- the labels to train on (a list of lists of strings)
    """
    docs_ = vectorize(docs)
    self.clf.partial_fit(docs_, labels)
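# A hedged, standalone sketch of the online-training contract that
# `partial_fit` imposes in scikit-learn: the first call must declare the full
# set of classes, after which the model can be updated batch by batch. For
# simplicity this shows the single-label case; the method above passes lists
# of labels, which requires a multilabel-capable classifier.
import numpy as np
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier()
X_batch1 = np.array([[1.0, 0.0], [0.0, 1.0]])
clf.partial_fit(X_batch1, ['sports', 'politics'], classes=['politics', 'sports'])

# Later batches update the same model without retraining from scratch.
X_batch2 = np.array([[0.9, 0.2]])
clf.partial_fit(X_batch2, ['sports'])
print(clf.predict(np.array([[0.1, 0.8]])))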
def digest(self):
    """
    Will process this instance's dump.
    """
    # Check if the specified file exists.
    if not exists(self.file):
        logger.info('Specified file {0} not found, fetching...'.format(self.file))
        self.fetch_dump()

    logger.info('Beginning digestion of pages.')

    # Process pages and collect their text content ("docs").
    docs = [self._process_page(elem) for elem in self._iterate_pages()]

    logger.info('Vectorizing the page documents...')

    # Vectorize the docs.
    doc_vecs = brain.vectorize(docs)

    # Testing
    #outfile = open('/Users/ftseng/Desktop/test.pickle', 'wb')
    #import pickle
    #pickle.dump(doc_vecs, outfile)

    # Pickle the docs to save to Mongo.
    #_doc_vecs = self.db().pickle(doc_vecs)
    #processed_name = self.url if self.url else self.file
    #self.db().add({'dump': processed_name, 'docs': _doc_vecs})
    #self.db().close()

    # Generate TF-IDF representation
    # of all docs upon completion.
    #self._generate_tfidf(docs)

    logger.info('Digestion complete!')

    if not self.silent:
        processed_name = self.url if self.url else self.file
        notify('TF-IDF calculations complete for {0}!'.format(processed_name))
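# A hedged, standalone sketch of the kind of streaming iteration that
# `_iterate_pages` implies: walking a large XML dump page by page without
# loading the whole file into memory. `_iterate_pages` itself is not shown in
# this excerpt, and the 'page' tag check is an assumption.
from xml.etree import ElementTree as etree

def iterate_pages(path):
    for event, elem in etree.iterparse(path, events=('end',)):
        if elem.tag.endswith('page'):
            yield elem
            elem.clear()  # free memory held by already-processed elements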
def multisummarize(docs, summary_length=5):
    """
    Summarize multiple documents.

    Args:
        | docs (list) -- list of documents (i.e. texts)
        | summary_length (int) -- the preferred number of sentences in the summary (default=5)

    .. note::
        The current implementation is very naive, so the quality and coherence
        of its summaries is pretty rough. Its purpose for now is just to provide
        *some* API for multi-document summarization.

    Returns:
        | summary (list) -- list of sentences selected for the summary.

    .. note::
        This is also quite slow: it takes well over a minute for four
        moderately-sized documents.
    """
    # Collect all sentences from the input documents,
    # along with each sentence's position in its document.
    sents = []
    for doc in docs:
        sents += [(sent, vectorize(sent), pos + 1) for pos, sent in enumerate(sentences(doc))]

    # Cluster the sentences.
    clusters = []
    for sent in sents:
        # sent = (sentence, vector, position)

        # Keep track of the highest-scoring cluster
        # (above some minimum similarity) and its average similarity score.
        min_sim = 0.01
        max_cluster = None, min_sim
        for cluster in clusters:
            avg_sim = 0
            for sent_c in cluster:
                avg_sim += (1 - cosine(sent[1], sent_c[1]))
            avg_sim = avg_sim / len(cluster)
            if avg_sim >= max_cluster[1]:
                max_cluster = cluster, avg_sim

        # If a sufficiently similar cluster was found,
        # add the sentence to it.
        if max_cluster[0]:
            max_cluster[0].append(sent)

        # Otherwise, create a new cluster.
        else:
            clusters.append([sent])

    # Rank the clusters.
    # Assuming that clusters with more sentences are more important,
    # take the top `summary_length`.
    ranked_clusters = sorted(clusters, key=lambda x: -len(x))[:summary_length]

    # For each sentence cluster, select the highest-scoring sentence.
    # Again, very naive.
    ideal_length = 20
    summary_sentences = []
    for cluster in ranked_clusters:
        max_sent = None, 0
        for sent in cluster:
            avg_sim = 0
            for sent_c in cluster:
                avg_sim += 1 - cosine(sent[1], sent_c[1])
            avg_sim = avg_sim / len(cluster)

            pos = sent[2]
            length = fabs(ideal_length - len(tokenize(sent[0]))) / ideal_length

            # Score is the average similarity, penalized by distance from the
            # ideal length and weighted by the inverse of the position.
            score = (avg_sim - length / 2) / pos
            if score >= max_sent[1]:
                max_sent = sent[0], score

        summary_sentences.append(max_sent[0])

    return summary_sentences
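# A minimal usage sketch for `multisummarize`. The documents are made up;
# in practice each one would be a full article's text.
docs = [
    "The city council approved the new budget on Monday. The budget includes more funding for parks.",
    "On Monday the council passed a budget with increased park funding. Critics said it neglects transit.",
    "Local officials voted to fund parks in the new budget. Transit advocates were disappointed.",
]

summary = multisummarize(docs, summary_length=3)
for sentence in summary:
    print(sentence)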