forked from Divyansh00/Scientific-Content-Categorization
hello.py
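"""Flask front end for the scientific-content categorization model.

Serves an upload form, cleans and lemmatizes the submitted document, extracts
RAKE keywords, predicts a subject label with a pre-trained classifier, and
scrapes SCImago for matching journal recommendations.
"""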
from __future__ import print_function
import flask
from flask import Flask, render_template, request
# sklearn.externals.joblib has been removed from scikit-learn; the standalone
# joblib package is a drop-in replacement for load().
import joblib
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from nltk.corpus import wordnet
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
import rake
import operator
import six
import mechanicalsoup
from bs4 import BeautifulSoup
import requests

app = Flask(__name__)


@app.route("/")
@app.route("/index")
def index():
return flask.render_template('index.html')
def get_simple_pos(tag):
    # Map a Penn Treebank POS tag to the WordNet constant expected by the lemmatizer.
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def clean_text(words):
    # Drop stopwords and punctuation, then lemmatize each remaining token.
    output_words = []
    stop = stopwords.words('english')
    stop += list(string.punctuation)
    lemmatizer = WordNetLemmatizer()
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words


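# /predict pipeline: read the uploaded CSV, tokenize and clean the text,
# extract RAKE keywords, vectorize the cleaned text with the saved
# weight_model, classify it with the saved model, and scrape SCImago for
# journal recommendations matching the predicted label.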
@app.route('/predict', methods=['POST'])
def make_prediction():
    if request.method == 'POST':
        # Get the uploaded document.
        file = request.files['uploaded_file']
        if not file:
            return render_template('index.html', label="No file uploaded")
        test = pd.read_csv(file)
        # Iterating over the DataFrame yields its column labels, i.e. the first
        # row of the uploaded file; join them into a single string of text.
        line = list()
        for i in test[test.columns.tolist()]:
            line.append(i)
        X_test = ''.join(line)
        text = word_tokenize(X_test)
        cleaned_text = clean_text(text)
        joined_text = " ".join(cleaned_text)

        # RAKE keyword extraction over the cleaned text.
        rake_object = rake.Rake('stopwords.txt')
        sentenceList = rake.split_sentences(joined_text)
        # Renamed from "stopwords" to avoid shadowing the nltk.corpus import.
        rake_stopwords = rake.load_stop_words('stopwords.txt')
        stopwordpattern = rake.build_stop_word_regex('stopwords.txt')
        phraseList = rake.generate_candidate_keywords(sentenceList, stopwordpattern, rake_stopwords)
        wordscores = rake.calculate_word_scores(phraseList)
        keywordcandidates = rake.generate_candidate_keyword_scores(phraseList, wordscores)
        sortedKeywords = sorted(six.iteritems(keywordcandidates), key=operator.itemgetter(1), reverse=True)
        totalKeywords = len(sortedKeywords)
        for keyword in sortedKeywords[:5]:
            print("Keyword: ", keyword[0], ", score: ", keyword[1])

        # Vectorize the cleaned text and predict its category label.
        x_test_mat = weight_model.transform(joined_text.split('.'))
        predict = model.predict(x_test_mat)
        print(predict)
        label = str(np.squeeze(predict[0]))
        print(label)
        #read_dict = np.load('final.npy').item()
        #id = read_dict[label]["doi"]
        #title = read_dict[label]["title"]
        #recommend = zip(id, title)

        # Search SCImago for the predicted category and scrape the result page.
        browser = mechanicalsoup.StatefulBrowser()
        q = label
        browser.open("https://www.scimagojr.com/journalsearch.php?q=" + q)
        soup = browser.get_current_page()
        divTag = soup.find_all("div", {"class": "search_results"})
        l = len(divTag)
        divTag = str(divTag)
        # Pull the title and link out of the first five search results.
        recommend = []
        for i in range(1, 6):
            s = divTag.split('</a>\n')[i]
            a = s.split('>')
            b = a[0].split('"')
            link = b[1]
            c = a[2].split('<')
            title = c[0]
            recommend.append((title, "https://www.scimagojr.com/" + link))
        recommend = [(y, x.replace('amp;', '')) for y, x in recommend]
        print(recommend)
        return render_template('index.html', label=label, keyword=sortedKeywords[:5], recommendations=recommend)


if __name__ == '__main__':
    # Load the saved vectorizer and classifier before starting the dev server.
    weight_model = joblib.load('weight_model.pkl')
    model = joblib.load('hacksvmmodel.pkl')
    app.run(host='127.0.0.1', port=8000, debug=True)