def summary(vectorizer, file):
    """Summarize *file* with an MMR-style (maximal marginal relevance) criterion.

    The file's sentences are scored by tf-idf cosine similarity against the
    whole document, then up to 5 sentences are picked greedily, penalizing
    candidates that are similar to sentences already chosen.

    Args:
        vectorizer: object exposing ``transform_tfidf`` (assumed already
            fitted — TODO confirm against callers).
        file: path to the document; read with ISO-8859-1 encoding.

    Returns:
        List of the selected sentences, in selection order.
    """
    with open(file, 'r', encoding='iso-8859-1') as f:
        doc = f.read()
    sentences = sent_tokenize(preprocessor(doc))

    # tf-idf vectors for each sentence and for the document as a whole.
    vectors = vectorizer.transform_tfidf(sentences)
    doc_vector = vectorizer.transform_tfidf([doc])[0]

    # Relevance of each sentence to the full document.
    sim = [similarity(vector, doc_vector) for vector in vectors]

    selected = []
    var = 0.05  # redundancy penalty weight (1 - var weights relevance)
    # Cap at the number of sentences so short documents cannot force
    # duplicate selections (the original looped until 5 regardless).
    target = min(5, len(sentences))
    while len(selected) < target:
        mmr = []
        for s in range(len(sim)):
            mmr_value = (1 - var) * sim[s]
            for chosen in selected:
                mmr_value -= var * similarity(vectors[s], vectors[chosen])
            mmr.append(mmr_value)
        # Only consider sentences not yet selected (the original merely
        # zeroed their sim score, which could still win the max).
        best = max(
            (idx for idx in range(len(mmr)) if idx not in selected),
            key=lambda idx: mmr[idx],
        )
        selected.append(best)
        sim[best] = 0

    return [sentences[i] for i in selected]
def summary(vectorizer, file, alternativeApproach=False):
    """Summarize *file* by picking the 5 sentences most similar to the document.

    Args:
        vectorizer: object exposing ``fit``, ``transform_bm25`` and
            ``transform_tfidf``.
        file: path to the document; read with ISO-8859-1 encoding.
        alternativeApproach: when False (default) the vectorizer is fitted on
            the document's sentences and BM25 scoring is used; when True,
            tf-idf scoring is used (vectorizer assumed already fitted — TODO
            confirm against callers).

    Returns:
        List of the 5 top-scoring sentences, in document order.
    """
    with open(file, 'r', encoding='iso-8859-1') as f:
        doc = f.read()
    sentences = sent_tokenize(preprocessor(doc))

    if not alternativeApproach:
        # Simple approach: fit on the document sentences, score with BM25.
        vectorizer.fit(sentences)
        vectors = vectorizer.transform_bm25(sentences)
        doc_vector = vectorizer.transform_bm25([doc])[0]
    else:
        # tf-idf vectors for the sentences and the whole document.
        vectors = vectorizer.transform_tfidf(sentences)
        doc_vector = vectorizer.transform_tfidf([doc])[0]

    # Similarity of each sentence to the full document.
    sim = [similarity(vector, doc_vector) for vector in vectors]

    # Top 5 by similarity, then restored to document order.
    # (renamed from `summary`, which shadowed this function's own name)
    top = sorted(enumerate(sim), key=lambda s: s[1], reverse=True)[:5]
    top.sort()

    return [sentences[idx] for idx, _ in top]
def extract_features(source):
    """Extract per-sentence features from the document *source*.

    Features computed for each sentence:
      * position — 1-based index of the sentence in the document;
      * similarity — tf-idf cosine similarity to the whole document;
      * noun count — number of noun-group POS tags in the sentence.

    Returns:
        Tuple ``(sents, feature_pos, feature_sim, feature_nouns)`` of
        parallel lists, one entry per (filtered) sentence.
    """
    # POS-tag groups per feature label. Tag names look like the Mac-Morpho
    # (Portuguese) tagset — TODO confirm against customtagger's model.
    # NOTE(review): the 'verbs' group includes adverb tags (ADV*) — confirm
    # that is intentional before relying on it.
    tag_groups = {
        'nouns': {'N', 'NPROP', 'PROPESS'},
        'verbs': {'V', 'VAUX', 'ADV', 'ADV-KS', 'ADV-KS-REL'},
        'adjectives': {'ADJ'},
    }

    def count_tags(tags, label):
        # How many of the sentence's tags fall in the group named by *label*.
        group = tag_groups.get(label, set())
        return sum(1 for tag in tags if tag in group)

    sents = filter_list(sent_tokenize(source))

    # Position feature: 1-based sentence index.
    feature_pos = [i + 1 for i in range(len(sents))]

    # Similarity feature: cosine similarity of each sentence's tf-idf vector
    # to the whole document's vector.
    vectorizer = CustomVectorizer()
    vectorizer.fit(sents)  # whole document must be split before fitting TODO -> filter
    tfidf_source = vectorizer.transform_tfidf([source])[0]
    tfidf_sents = vectorizer.transform_tfidf(sents)
    feature_sim = [similarity(vec, tfidf_source) for vec in tfidf_sents]

    # Noun-count feature: POS-tag each sentence and count noun-group tags.
    # (verb/adjective counts were disabled in the original and are dropped;
    # re-add with count_tags(tags, 'verbs'/'adjectives') if needed.)
    tagger = customtagger.load_tagger()
    tagged = [tagger.tag(filter_list(word_tokenize(sent))) for sent in sents]
    sent_tags = [[tag for _word, tag in pairs] for pairs in tagged]
    feature_nouns = [count_tags(tags, 'nouns') for tags in sent_tags]

    return sents, feature_pos, feature_sim, feature_nouns
def summarizeDocument(file):
    """Return the 5 sentences of *file* most similar to the whole document.

    Each sentence is treated as its own "document" when fitting the
    vectorizer, so idf is computed from sentence frequency.

    NOTE(review): per the original author, this vectorizer's tf-idf deviates
    from the requested definition — tf is a raw count (not normalized by the
    sentence's maximal tf) and 1 is added to every idf value; a count
    vectorizer pass would be needed to match the spec exactly.

    Args:
        file: path to the document; read as UTF-8 text.

    Returns:
        List of the 5 top-scoring sentences, in document order.
    """
    # 'utf-8' spelled out (the original 'utf' is the same codec alias).
    with io.open(file, 'r', encoding='utf-8') as f:
        doc = f.read()
    doc = preprocessor(doc)
    sentences = sent_tokenize(doc)

    vectorizer = CustomVectorizer(
        input='content', stopwords=list(stopwords.words('english')))
    # fit() result was assigned and immediately overwritten in the original;
    # only the transform output is needed.
    vectorizer.fit(sentences)
    vectors = vectorizer.transform_tfidf(sentences)

    # Model the whole document in the same feature space.
    doc_vector = vectorizer.transform_tfidf([doc])[0]

    # Cosine similarity of every sentence to the whole document.
    sim = [similarity(vector, doc_vector) for vector in vectors]

    # Top 5 by similarity, restored to document order.
    top = sorted(enumerate(sim), key=lambda s: s[1], reverse=True)[:5]
    top.sort()

    return [sentences[idx] for idx, _ in top]
# Read the document and split it into sentences.
text = file.read()
sents = sent_tokenize(text)
file.close()

# tf-idf vector for every sentence.
vectorizer = CustomVectorizer(stopwords=stopwords.words())
vectorizer.fit(sents)
vecs = vectorizer.transform_tfidf(sents)

# Build an undirected similarity graph: one node per sentence, an edge
# between every pair whose similarity exceeds the threshold. Each unordered
# pair (i, j) is visited exactly once, so the adjacency lists are already
# duplicate-free (the original's set() round-trip was redundant).
graph = {i: [] for i in range(len(vecs))}
threshold = 0.1
for i in range(len(vecs)):
    for j in range(i + 1, len(vecs)):
        if similarity(vecs[i], vecs[j]) > threshold:
            graph[i].append(j)
            graph[j].append(i)

# Rank sentences with the modified PageRank; keep the 5 best, restored to
# document order. (Second return value of page_rank_mod is unused here.)
rank, _ = e1.page_rank_mod(graph)
summary = sorted(rank, key=rank.get, reverse=True)[:5]
summary.sort()
print('(done)')

print('\nBuilding html page...')
generateHTML(sents, summary)
print('(done)')
vectorizer.fit(sents) vecs = vectorizer.transform_tfidf(sents) source_score = vectorizer.transform_tfidf([source])[0] graph = defaultdict(lambda: []) weights_tfidf = defaultdict(lambda: {}) weights_alternative = [] # TODO # Build graph threshold = 0.1 for i, v1 in enumerate(vecs): for j, v2 in enumerate(vecs[i + 1:], start=i + 1): sim = similarity(v1, v2) if sim > threshold: graph[i].append(j) graph[j].append(i) weights_tfidf[i][j] = sim weights_tfidf[j][i] = sim l = len(graph.keys()) n = sum(l / (i + 1) for i in range(l)) # priors must be a probability distribution -> sum equals 1 priors_pos = {k: (l / (k + 1)) / n for k in graph.keys()} priors_tfidf = { k: similarity(vecs[k], source_score) for k in graph.keys()