# NewTextSummarizer.py
import json
import logging
import os
import urllib.parse

from django.http import HttpResponse

import textract
from gensim.summarization import summarize  # available in gensim < 4.0 only

logging.basicConfig()
logger = logging.getLogger(__name__)

def add(request):
    """Summarize the text passed in the POST body and return it as JSON."""
    logger.info('add metric=' + str(request.POST))
    text = request.POST.get('text', '')
    # gensim's summarize() expects one sentence per line, so split on '.'.
    text = '\n'.join(text.split('.'))
    try:
        logger.info('text=' + repr(text))
        summary = summarize(text)
        if not summary:
            # Too little text to summarize; fall back to the first line.
            summary = ''.join(text.splitlines()[0:1])
    except Exception as e:
        # Surface the error text in the response; for a TypeError
        # (e.g. empty input) fall back to the first line instead.
        summary = str(e)
        if isinstance(e, TypeError):
            summary = ''.join(text.splitlines()[0:1])
    logger.info('summary=' + repr(summary))
    return HttpResponse(json.dumps({'status': 'success',
                                    'msg': 'added',
                                    'summary': summary}))
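
# A minimal sketch of exercising add() with Django's test client; the
# /summarize/add/ route is an assumption, since this module's urls.py is
# not shown:
#
#   from django.test import Client
#   resp = Client().post('/summarize/add/',
#                        {'text': 'First point. Second point. Third point.'})
#   print(json.loads(resp.content)['summary'])
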
def upload(request):
    """Extract text from a previously uploaded file and summarize it."""
    name = urllib.parse.unquote(request.POST.get('name', ''))
    text = ''
    path = os.path.dirname(os.path.abspath(__file__)) + "/" + "../../../../ui/summarizer/public/uploads/"
    summary = ""
    try:
        h = ''
        if name:
            print('opening file=' + path + name)
            h = textract.process(path + name)
        if h:
            text = '\n'.join(x.strip() for x in h.decode('utf-8').splitlines())
        else:
            # textract produced nothing; try reading the file as plain text.
            # (read() rather than readlines() + join, which doubles newlines)
            with open(path + name, 'r') as fin:
                text = fin.read()
        if text:
            summary = summarize(text)
            if summary:
                print("summarized:" + name)
            else:
                summary = ''.join(text.splitlines()[0:1])
    except Exception as e:
        summary = str(e)
        if isinstance(e, TypeError):
            summary = ''.join(text.splitlines()[0:1])
    return HttpResponse(json.dumps({'status': 'success',
                                    'msg': 'added',
                                    'summary': summary}))
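
# Likewise for upload(): the client sends only a file name, which is resolved
# against the uploads directory above. The route and the file name are again
# assumptions for illustration:
#
#   resp = Client().post('/summarize/upload/', {'name': 'report.pdf'})
#   print(json.loads(resp.content)['summary'])
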
def test(request):
    return HttpResponse('', status=200)

WEIGHT_THRESHOLD = 0.5  # assumed similarity cutoff; not defined in the original

def word_similarity_graph(word_vectors, words):
    """Build an undirected graph connecting words whose cosine similarity
    (under word_vectors) is at least WEIGHT_THRESHOLD."""
    import networkx as nx
    G = nx.Graph()  # undirected
    G.add_nodes_from(words)
    for i in range(len(words)):
        for j in range(i + 1, len(words)):  # visit each unordered pair once
            weight = word_vectors.similarity(words[i], words[j])
            if weight < WEIGHT_THRESHOLD:
                continue
            G.add_edge(words[i], words[j], weight=weight)
    return G
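
# Quick illustration with a toy model (hypothetical two-sentence corpus;
# min_count=1 so every word survives into the vocabulary):
#
#   from gensim.models import word2vec
#   m = word2vec.Word2Vec([['cat', 'dog'], ['dog', 'bird']], size=10, min_count=1)
#   g = word_similarity_graph(m.wv, list(m.wv.vocab))
#   print(g.number_of_nodes(), g.number_of_edges())
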
def pagerank_summarize(text):
    """Extractive summary: rank words with PageRank over a similarity graph,
    then keep the sentences that score highest on the top-ranked words."""
    import operator
    import networkx as nx
    from gensim.models import word2vec, KeyedVectors
    sentences = text.split('\n')
    # Word2Vec expects tokenized sentences (lists of words), not raw strings.
    model = word2vec.Word2Vec([s.split() for s in sentences], size=200)
    # Round-trip the vectors through disk, as the original did.
    model.wv.save_word2vec_format('/tmp/vectors.bin', binary=True)
    word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True)
    words = list(word_vectors.vocab)
    G = word_similarity_graph(word_vectors, words)
    pr = nx.pagerank(G, tol=1e-10)
    # Score sentences by the summed PageRank of the top-10 words they contain.
    sorted_pr = sorted(pr.items(), key=operator.itemgetter(1), reverse=True)
    important = [word for word, score in sorted_pr[:10]]
    scored_sentences = {}
    for sentence in sentences:
        matches = set(sentence.split()).intersection(important)
        score = 0
        for match in matches:
            score += pr[match]
        scored_sentences[sentence] = score
    top = sorted(scored_sentences.items(), key=operator.itemgetter(1), reverse=True)[:10]
    reordered_sentences = [s for s, score in top]
    # Re-emit the chosen sentences in their original document order.
    ordered_sentences = [s for s in sentences if s in reordered_sentences]
    return '\n'.join(ordered_sentences)
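
# pagerank_summarize() is an unused alternative to the gensim summarize()
# call in the views above; wiring it in would be a one-line swap, e.g.:
#
#   summary = pagerank_summarize(text)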