Example no. 1
def summary(vectorizer, file):

    # read the file and split it into sentences
    with open(file, 'r', encoding='iso-8859-1') as f:
        doc = f.read()
    sentences = sent_tokenize(preprocessor(doc))

    # compute tf-idf vectors for the sentences and for the whole document
    vectors = vectorizer.transform_tfidf(sentences)
    docVector = vectorizer.transform_tfidf([doc])

    # compute each sentence's similarity to the whole document
    sim = []
    for vector in vectors:
        sim.append(similarity(vector, docVector[0]))

    # greedily select 5 sentences by Maximal Marginal Relevance (MMR)
    selected = []
    var = 0.05  # trade-off between relevance to the document and redundancy
    while len(selected) < 5:
        mmr = []
        for s in range(len(sim)):
            mmr_value = (1 - var) * sim[s]
            for sentence in selected:
                mmr_value -= var * similarity(vectors[s], vectors[sentence])
            mmr.append(mmr_value)
        indexOfMax = max(enumerate(mmr), key=lambda x: x[1])[0]
        selected.append(indexOfMax)
        sim[indexOfMax] = 0  # discount the chosen sentence so it is not favored again

    # returns the list of selected sentences
    res = []
    for i in selected:
        res.append(sentences[i])
    return res
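
The similarity, preprocessor and transform_tfidf helpers above are project-specific and not shown here. As a minimal, self-contained sketch of the same selection step, assuming dense numpy vectors and cosine similarity (function names are illustrative; the 0.05 trade-off mirrors var above):

import numpy as np

def cosine(a, b):
    # cosine similarity between two dense vectors; 0 if either is all-zero
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(a @ b) / denom if denom else 0.0

def mmr_select(sentence_vecs, doc_vec, k=5, lam=0.05):
    # greedy Maximal Marginal Relevance: relevance to the document minus
    # redundancy with the sentences already selected
    relevance = [cosine(v, doc_vec) for v in sentence_vecs]
    selected = []
    while len(selected) < min(k, len(sentence_vecs)):
        best, best_score = None, float('-inf')
        for i, v in enumerate(sentence_vecs):
            if i in selected:
                continue
            score = (1 - lam) * relevance[i]
            score -= lam * sum(cosine(v, sentence_vecs[j]) for j in selected)
            if score > best_score:
                best, best_score = i, score
        selected.append(best)
    return selected

# e.g. mmr_select(np.random.rand(10, 20), np.random.rand(20)) returns 5 sentence indices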
Example no. 2
def summary(vectorizer, file, alternativeApproach=False):

    # read the file and split it into sentences
    with open(file, 'r', encoding='iso-8859-1') as f:
        doc = f.read()
    sentences = sent_tokenize(preprocessor(doc))

    # default approach: fit on the document sentences and use BM25 weights
    if not alternativeApproach:
        vectorizer.fit(sentences)
        vectors = vectorizer.transform_bm25(sentences)
        docVector = vectorizer.transform_bm25([doc])
    else:
        # alternative approach: tf-idf vectors for the sentences and the whole document
        vectors = vectorizer.transform_tfidf(sentences)
        docVector = vectorizer.transform_tfidf([doc])

    # compute each sentence's similarity to the whole document
    sim = []
    for vector in vectors:
        sim.append(similarity(vector, docVector[0]))

    # pick the 5 sentences most similar to the document and restore their original order
    summary = sorted(enumerate(sim), key=lambda s: s[1], reverse=True)[:5]
    summary.sort()

    # returns the list of selected sentences
    res = []
    for s in summary:
        res.append(sentences[s[0]])
    return res
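
transform_bm25 belongs to the project's CustomVectorizer, whose implementation is not shown. For reference, a minimal sketch of standard Okapi BM25 term weighting over tokenized sentences (the non-negative idf variant and the k1/b defaults are conventional choices, not taken from the source):

import math
from collections import Counter

def bm25_weights(docs, k1=1.5, b=0.75):
    # docs: list of token lists; returns one {term: weight} dict per doc
    N = len(docs)
    avgdl = sum(len(d) for d in docs) / N
    df = Counter(t for d in docs for t in set(d))
    weights = []
    for d in docs:
        tf = Counter(d)
        w = {}
        for term, f in tf.items():
            idf = math.log(1 + (N - df[term] + 0.5) / (df[term] + 0.5))
            w[term] = idf * f * (k1 + 1) / (f + k1 * (1 - b + b * len(d) / avgdl))
        weights.append(w)
    return weights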
def extract_features(source):
    def count_tags(tags, label):
        # count how many of the sentence's POS tags fall in the requested class
        tag_counter = Counter(tags)
        counts = 0

        if label == 'nouns':
            for tag in tag_counter.keys():
                if tag in ['N', 'NPROP', 'PROPESS']:
                    counts += tag_counter.get(tag)

        elif label == 'verbs':
            for tag in tag_counter.keys():
                if tag in ['V', 'VAUX', 'ADV', 'ADV-KS', 'ADV-KS-REL']:
                    counts += tag_counter.get(tag)

        elif label == 'adjectives':
            for tag in tag_counter.keys():
                if tag in ['ADJ']:
                    counts += tag_counter.get(tag)

        return counts

    sents = filter_list(sent_tokenize(source))

    # position feature: 1-based index of each sentence
    feature_pos = [i + 1 for i in range(len(sents))]

    # the whole document must be split into sentences before fitting  TODO -> filter
    vectorizer = CustomVectorizer()
    vectorizer.fit(sents)

    tfidf_source = vectorizer.transform_tfidf([source])[0]
    tfidf_sents = vectorizer.transform_tfidf(sents)

    feature_sim = [
        similarity(tfidf_sent, tfidf_source) for tfidf_sent in tfidf_sents
    ]

    tagger = customtagger.load_tagger()

    tagged = [tagger.tag(filter_list(word_tokenize(sent))) for sent in sents]

    sent_tags = [list(map(lambda t: t[1], tags)) for tags in tagged]

    feature_nouns = [count_tags(tags, 'nouns') for tags in sent_tags]

    #feature_verbs = [count_tags(tags, 'verbs') for tags in sent_tags]

    #feature_adjectives = [count_tags(tags, 'adjectives') for tags in sent_tags]

    return sents, feature_pos, feature_sim, feature_nouns  #, feature_verbs, feature_adjectives
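
The POS labels tested in count_tags (N, NPROP, PROPESS, V, ADJ, ...) come from the tagger loaded by customtagger, which is not shown here. The counting itself reduces to a Counter lookup; an isolated sketch with illustrative data:

from collections import Counter

NOUN_TAGS = {'N', 'NPROP', 'PROPESS'}

def count_in(tags, wanted):
    # count how many of a sentence's tags belong to the wanted set
    counts = Counter(tags)
    return sum(c for t, c in counts.items() if t in wanted)

print(count_in(['N', 'V', 'NPROP', 'ADJ', 'N'], NOUN_TAGS))  # prints 3

The feature lists returned by extract_features are parallel to sents, so zip(sents, feature_pos, feature_sim, feature_nouns) yields one feature row per sentence.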
def summarizeDocument(file):
    '''
    Parse the file for sentences
    '''
    with io.open(file, 'r', encoding='utf-8') as f:
        doc = f.read()
    doc = preprocessor(doc)
    sentences = sent_tokenize(doc)
    '''
    Compute vector-space representations of every sentence.
    Each sentence is treated as a document, so the inverse
    frequencies are computed over sentences (sentence frequency).
    !!!
    The tf-idf values computed by this vectorizer are not in
    accordance with what is required. The documentation states
    that tf is simply the count of each word in each doc/sentence
    (and so, not normalized), and that 1 is added to all idf values.
    To meet the requirements, for each term we would need to replace
    that raw tf with tf divided by the maximal tf in its doc/sentence.
    I think we need to use a CountVectorizer first.
    !!!
    '''
    vectorizer = CustomVectorizer(input='content',
                                  stopwords=list(stopwords.words('english')))
    vectorizer.fit(sentences)
    vectors = vectorizer.transform_tfidf(sentences)
    '''
    Transform the document into a single sentence and use
    the vectorizer to model it in the same feature space.
    '''
    docVector = vectorizer.transform_tfidf([doc])
    '''
    For each sentence vector, reduce the document vector to the
    same dimensional space so the dot product (similarity) can
    be computed.
    '''
    sim = []
    for vector in vectors:
        sim.append(similarity(vector, docVector[0]))

    summary = sorted(enumerate(sim), key=lambda s: s[1], reverse=True)[:5]
    summary.sort()
    '''
    Returns the list of selected sentences
    '''
    res = []
    for s in summary:
        res.append(sentences[s[0]])
    return res
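
Regarding the normalization issue flagged in the docstring above, a minimal sketch of the tf part only, assuming sklearn's CountVectorizer for the raw counts (the sentences are illustrative and idf handling is left out):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

sentences = ["the cat sat on the mat", "the dog barked"]
counts = CountVectorizer().fit_transform(sentences).toarray().astype(float)
max_tf = counts.max(axis=1, keepdims=True)
# each row is divided by its own maximal count, i.e. tf / max-tf per sentence
normalized_tf = counts / np.where(max_tf == 0, 1, max_tf)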
text = file.read()
sents = sent_tokenize(text)

file.close()

vectorizer = CustomVectorizer(stopwords=stopwords.words())

vectorizer.fit(sents)
vecs = vectorizer.transform_tfidf(sents)

graph = {i: [] for i in range(len(vecs))}

# link sentences whose similarity exceeds the threshold
threshold = 0.1
for i in range(len(vecs)):
    for j in range(i + 1, len(vecs)):
        if similarity(vecs[i], vecs[j]) > threshold:
            graph[i].append(j)
            graph[j].append(i)

graph = {k: list(set(graph[k])) for k in graph.keys()}

rank, i = e1.page_rank_mod(graph)

summary = sorted(rank.keys(), key=lambda k: rank[k], reverse=True)[:5]
summary.sort()

print('(done)')

print('\nBuilding html page...')
generateHTML(sents, summary)
print('(done)')
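
e1.page_rank_mod comes from another module of the project and is not shown. As an illustrative stand-in (not the project's implementation), a plain power-iteration PageRank over the same adjacency-list graph, returning the rank dict and the number of iterations used:

def page_rank(graph, d=0.85, tol=1e-6, max_iter=100):
    # graph: {node: list of neighbour nodes}, undirected as built above
    nodes = list(graph.keys())
    n = len(nodes)
    rank = {k: 1.0 / n for k in nodes}
    for it in range(1, max_iter + 1):
        new = {}
        for k in nodes:
            incoming = sum(rank[j] / len(graph[j]) for j in graph[k])
            new[k] = (1 - d) / n + d * incoming
        if sum(abs(new[k] - rank[k]) for k in nodes) < tol:
            return new, it
        rank = new
    return rank, max_iter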
Example no. 6
        vectorizer.fit(sents)

        vecs = vectorizer.transform_tfidf(sents)
        source_score = vectorizer.transform_tfidf([source])[0]

        graph = defaultdict(list)

        weights_tfidf = defaultdict(dict)
        weights_alternative = []  # TODO

        # Build graph
        threshold = 0.1
        for i, v1 in enumerate(vecs):
            for j, v2 in enumerate(vecs[i + 1:], start=i + 1):
                sim = similarity(v1, v2)
                if sim > threshold:
                    graph[i].append(j)
                    graph[j].append(i)

                    weights_tfidf[i][j] = sim
                    weights_tfidf[j][i] = sim

        l = len(graph)
        n = sum(l / (i + 1) for i in range(l))

        # priors must be a probability distribution -> sum equals 1
        priors_pos = {k: (l / (k + 1)) / n for k in graph.keys()}
        priors_tfidf = {
            k: similarity(vecs[k], source_score)
            for k in graph.keys()