Example #1
def search_news():
    # Get the query string
    query_line = request.args.get("query")
    # Return the matching URLs
    data = TextRank.Get_sample_news(query_line, url_list, dictionary,
                                    tfidf_vectors)
    return jsonify(data)
Example #2
def main(url, log):
    log1 = log
    while True:
        if not os.path.exists(log1):
            os.makedirs(log1)
            break
        else:
            log1 = log1 + '_'
    content = content_extractor.get_content(url)

    logFile = open(log1+'/metadata',"a")
    logFile.write("URL : "+url+"\n\n")
    logFile.write("Title : "+content['title']+"\n\n")
    logFile.write("Meta Desc. : "+content['meta']+"\n\n")
    logFile.write("Content : "+content['content'].encode("utf8")+"\n\n")
    logFile.close()

    d = TextRank.text_rank(content['content'])
    sortd = sorted(d.iteritems(), key = operator.itemgetter(1), reverse=True)

    logtext = open(log1+'/textrank_result',"a")
    logtext.write(str(sortd))
    logtext.close()

    final=[]
    for i in sortd:
        for j in range(len(source_probs)):
            final.append((i[0], j, i[1]*source_probs[j]))

    fsort = sorted(final, key = operator.itemgetter(2), reverse=True)
    logres = open(log1+'/result',"a")
    logres.write(str(fsort))
    logres.close()
    server.run_server(fsort[:10])
Example #3

def get_keyphrases_textrank():
    text = request.form['textrank_text']
    top_n = request.form['top_n_textrank']
    top_keywords = TextRank.extractKeyphrases(text, int(top_n))
    context = dict()
    context['keywords'] = top_keywords
    return render_template("keyword_textrank.html", **context)
Example #4
    def init(self):
        self.__linkDict = dict()
        self.__sentenceDict = dict()
        self.__keywordDict = dict()
        self.__distanceDict = dict()
        self.__validation = Validation.Validation()
        self.__validation.init_dic()
        self.__validation.init_base_normalized()
        self.__sentenceTokenizer = TextRank.SentenceTokenizer()
Example #5
def get_keyphrases_textrank():
    filename = request.form['textrank_name']
    top_n = request.form['top_n_textrank']
    surveys = pd.read_excel(filename, header=0)
    col_name = request.form['textrank_question']
    text = ""
    col = surveys[col_name]
    for i in range(len(col)):
        text = text + " " + col[i]
    top_keywords = TextRank.extractKeyphrases(text, int(top_n))
    context = dict()
    context['keywords'] = top_keywords

    return render_template("keyword_textrank.html", **context)
Example #6
    def base_vectorize(self, index, link):
        try:
            Basesummarizes = []
            print(link)
            textrank = TextRank.TextRank(link)

            summarizes = textrank.summarize(10)
            keywords = textrank.keywords()

            for sentence in summarizes:
                Basesummarizes.append(sentence)

            for sentence in textrank.sentences:
                for word in sentence.split(" "):
                    if word in self.__keyword:
                        Basesummarizes.append(sentence)
                        break

            flag = 0
            for keyword in keywords:
                if keyword in self.__keyword:
                    flag = 1
                    break

            if flag == 0:
                print("검색어가 키워드에 없습니다.")
                return

            self.__validation.sum_str(
                self.__sentenceTokenizer.get_nouns(Basesummarizes))
            self.__validation.set_dic(index, 0)
        except Exception as e:
            print(e)
            print('textrank not working')
            return

        self.printCommand(index, link, summarizes, keywords)

        self.__linkDict[index] = link
        self.__sentenceDict[index] = summarizes
        self.__keywordDict[index] = keywords

        self.__distanceDict = self.__validation.get_dic()

        self.__observer.resultToGui()
Example #7
def summarymain(domain, prodID, choice, ch_token, token=4):

    summary = ""

    if choice == 1:
        rankedText = TextRank.summaryGen(prodID, domain, debugging=True)

    if choice == 2:
        if ch_token == "y" or ch_token == "yes":
            rankedText = TFIDFSummary.summaryGen(prodID,
                                                 domain,
                                                 gram=token,
                                                 debug=True)
        else:
            rankedText = TFIDFSummary.summaryGen(prodID, domain, debug=True)

    return rankedText
    # keys = keywords.extract_keywords(domain, prodslist[ch])
Example #8
def textRank(inpath,outpath):
    string = codecs.open(inpath, 'r', 'utf-8',errors='ignore').read()
    textrank_results = TextRank.extractKeyphrases(string)
    sorted_keywords = sorted(textrank_results.items(), key=lambda x: x[1], reverse=True)
    print(sorted_keywords)
    outString=''
    for i in range(len(sorted_keywords)):
        print(sorted_keywords[i])
        print(sorted_keywords[i][0])
        print(sorted_keywords[i][1])
        outString+=sorted_keywords[i][0]
        outString+=':'
        outString+=str(sorted_keywords[i][1])
        outString+='\n'

    # out_path='output/5AbstractsGroup-test1/Business/0401.txt'
    with open(outpath,'w',encoding='utf-8') as f:
        f.write(outString)
Example #9
def ezLaunchDoc(selection):
    """Uses the Docs API.
        Makes a new document with a summery from POST request.
        """
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists(os.path.join(THIS_FOLDER, 'token.pickle')):
        with open(os.path.join(THIS_FOLDER, 'token.pickle'), 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                os.path.join(THIS_FOLDER, 'credentials.txt'), SCOPES)
            creds = flow.run_local_server()
        # Save the credentials for the next run
        with open(os.path.join(THIS_FOLDER, 'token.pickle'), 'wb') as token:
            pickle.dump(creds, token)

    service = build('docs', 'v1', credentials=creds)

    # Summarize the selection and create a new document containing it.
    summs = eZ.ezRank(selection)
    body = {
        'title' : "New Summary"
    }
    requests = [
        {
            'insertText': {
                'location': {
                    'index': 1,
                },
                'text': "\n" + summs
            }
        }
    ]
    document = service.documents().create(body=body).execute()
    result = service.documents().batchUpdate(
        documentId=document.get("documentId"),
        body={'requests': requests}).execute()
Example #10
    def run(self):
        # Read the corpus and run the summarization routine
        doc, index = dataset.read_sogou(1)

        # debug module
        # doc, index = dataset.read_test()
        summary = tr.summary(doc, index)

        # delete the text
        self.text_answer.delete(1.0, tk.END)
        self.text_summary.delete(1.0, tk.END)
        self.result_text.delete(1.0, tk.END)

        # fill the text with the doc, summary
        self.fill_doc(doc)
        self.fill_summary(summary)

        self.window.update()  # show the result
        self.write_file_name = None
        self.write_flag = False
        return
Example #11
def textrank(init_prob, strings, log):
    """
        Returns the updated priorities based on initial priorities ('init_probs') and list of strings ('strings').
        Uses text rank to do so.
        'log' is the name of file in which log should be saved.
    """
    d = {}
    for j in range(len(strings)):                   #Merging the outputs of text rank applied on each string in strings.
        d1 = TextRank.text_rank(strings[j])
        for i in d1.keys():
            temp = i.lower()
            if d.has_key(temp) :
                d[temp][j] = d[temp][j] + d1[i]
            else :
                d[temp] = np.zeros(len(strings))
                d[temp][j] = d1[i]
    source_probs = update(init_prob, dict2term_doc_matx(d), 0, log+"_update_results")
    sorted_dict = sorted(d.iteritems(), key = lambda x: x[1].dot(np.array(source_probs)), reverse=True)
    logFile = open(log+'_sorted_dict','a')
    print >>logFile, sorted_dict
    logFile.close()
    return source_probs
Example #12
    def target_vectorize(self, targetIndex, targetLink):
        try:
            textrank = TextRank.TextRank(targetLink)
            summarizes = textrank.summarize(10)
            keywords = textrank.keywords()

            flag = 0
            for keyword in keywords:
                if keyword in self.__keyword:
                    flag = 1
                    break

            if flag == 0:
                print("검색어가 키워드에 없습니다.")
                return

            self.__validation.target_vectorizing(
                self.__sentenceTokenizer.get_nouns(summarizes))

            distance = self.__validation.dist_norm()

            if math.isnan(distance):
                raise ValueError

            self.__validation.set_dic(targetIndex, distance)
        except Exception:
            print('textrank not working')
            return

        self.printCommand(targetIndex, targetLink, summarizes, keywords,
                          distance)

        self.__linkDict[targetIndex] = targetLink
        self.__sentenceDict[targetIndex] = summarizes
        self.__keywordDict[targetIndex] = keywords
        self.__distanceDict = self.__validation.get_dic()

        self.__observer.resultToGui()
Example #13
    def test_summary_gen(self):
        self.assertEqual(TextRank.summaryGen("abc.txt", "abc"),
                         "Domain not in dataset")
Example #14

def summarize_with_TextRank(sentences, matrix):
    return TextRank.extractSentencesFromSentenceTokens(sentences)
Example #15

def main(dom_choice, domain_list):
    if (dom_choice > len(domain_list)):
        print "Wrong choice"
        return "Wrong choice"
    domain = domain_list[dom_choice - 1]
    f = open("../datasets/Brands/" + domain.lower() + ".pickle", 'rb')
    object_file = pickle.load(f)
    prodslist = {}
    c = 0
    brandslist = {}
    for brand in object_file.keys():
        #brand.append(line.split('|')[0])
        brandslist[c + 1] = brand
        print str(c + 1) + ". " + brand + "\n"
        c += 1

    print "Enter your choice"
    ch = int(raw_input())
    #ch=ch-1
    selectedBrand = brandslist[ch]
    print selectedBrand
    c = 0
    for prods in range(len(object_file[selectedBrand])):
        for prod in object_file[selectedBrand][prods].keys():
            prodslist[c + 1] = object_file[selectedBrand][prods][prod]
            print str(c + 1) + ". " + prod + "\n"
        c += 1

    print "Enter your choice"
    ch = int(raw_input())
    #ch=ch-1
    print "1.Summary using Text Rank"
    print "2.Summary using TF-IDF"
    print "Enter your choice"
    choice = int(raw_input())

    summary = ""

    if choice == 1:
        print "Do you want to enable debugging (Y/N)?"
        ch_debug = raw_input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            rankedText = TextRank.summaryGen(prodslist[ch],
                                             domain,
                                             debugging=True)
        else:
            rankedText = TextRank.summaryGen(prodslist[ch], domain)

        f.close()
        sleep(3)
        #rankedText=rankedText[:len(rankedText)/3]

    if choice == 2:
        print "Do you want to enable debugging (Y/N)?"
        ch_debug = raw_input().lower()
        print "Do you want to enter the token size (Y/N)?"
        ch_token = raw_input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            if ch_token == "y" or ch_token == "yes":
                print "Enter token size"
                token = int(raw_input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch],
                                                     domain,
                                                     gram=token,
                                                     debug=True)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch],
                                                     domain,
                                                     debug=True)
        else:
            if ch_token == "y" or ch_token == "yes":
                print "Enter token size"
                token = int(raw_input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch],
                                                     domain,
                                                     gram=token)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain)

    keys = keywords.extract_keywords(domain, prodslist[ch])
    rankedSummary = ""
    for i in range(len(rankedText)):
        rankedSummary += rankedText[i]
    stopwords = load_stop_words("../stoplist.txt")
    tokenizer = RegexpTokenizer(r"[\w']+", flags=re.UNICODE)
    tokens = tokenizer.tokenize(rankedSummary)
    tokens = [token for token in tokens if token.lower() not in stopwords]
    precision = float(len(set(tokens).intersection(set(keys)))) / float(
        len(tokens))
    recall = float(len(set(tokens).intersection(set(keys)))) / float(len(keys))
    fmeasure = 2 * (precision * recall) / (precision + recall)
    print "\n\n"
    print "Precision =", precision
    print "Recall =", recall
    print "F-Measure =", fmeasure
Example #16
"""
author: Tongxin Wong, Jie Liu
create time: 2020-07-19
update time: 2020-07-24
"""

from flask import Flask, request, jsonify
import web
import TextRank
import opinion_perception

app = Flask(__name__)

# Load the dictionary and the TF-IDF model
dictionary, tfidf_vectors = TextRank.load_source()
# Load the URL list
url_list = TextRank.get_url_list()


@app.route('/api/hot_news', methods=['POST'])
def hot_news():
    received_data = request.get_json()
    tag_values = received_data['tag_values']
    data = web.get_hot_news(tag_values)
    return jsonify(data)


@app.route('/api/news_content', methods=['POST'])
def news_content():
    received_data = request.get_json()
Example #17

# Read a compression percentage from the user (reconstructed input loop).
while True:
    try:
        compression = int(input())
    except ValueError:
        print("Input must be a natural number 0-100!")
        continue

    if not (compression > 0 and compression < 100):
        print("Out of bounds, try again")
        continue
    break

num_of_sentences = int((compression/100) * article_dict["LENGTH"])
if num_of_sentences == 0:
    print("The desired compression rate for this article resulted in a zero sentence summary. Please try"
          " again with a higher rate of compression")
    exit()

edmundson = Edmundson(article_dict)
rhetoric = ExtractedArticle(article_dict)
textrank = TextRank(article_dict["BODY"])

master_scores = list(map(sum, zip(edmundson.get_sent_scores(custom_settings),
                                  rhetoric.get_sent_scores(custom_settings),
                                  textrank.get_sent_scores())))
preliminary_indices = sorted(range(len(master_scores)),
                             key=lambda i: master_scores[i])[-num_of_sentences:]
master_indices = sorted(preliminary_indices)


print("Display Summary: \n")

summary = ""
for index in master_indices:
    print(article_dict["BODY"][index])
    summary += article_dict["BODY"][index]

summary = (summary + "\n\nThis summary was generated using: " + active_pickle_file
           + "\nSource shrunk from " + str(article_dict['LENGTH']) + " sentences to "
           + str(num_of_sentences) + " sentences (" + str(compression) + "%)")

os.chdir(Summarypath)
Example #18
nerInPyltp = loadNerDictFromPyltp('pyltp_savebox.txt')

partOfSpeechDict = loadWordsPartOfSpeech("spdict.txt")
nerDict = loadPreTrainEntityDict('lexiconAndNerDictWithInfo.txt')

# Open the training dataset
f = codecs.open("coreEntityEmotion_train.txt", 'r', 'utf-8')

# Set up the output files
outputname = "entityOutPut_originCut-pyltp_full_v3"
fout = codecs.open(outputname + ".txt", 'w', 'utf-8')
fout_cache = codecs.open(outputname + "_datacache.txt", 'w', 'utf-8')

# Load TextRank
trDemo = TextRank.TextRank()

# Analysis loop
i = 0
for rawline in f.readlines():
    # Process one JSON record per line
    rawline_json = json.loads(rawline)
    # Get the title
    titleline = rawline_json['title']
    # Get the core entities
    entity = set()
    eec = rawline_json["coreEntityEmotions"]
    for key in eec:
        entity.add(key["entity"])
    # Segment the title into words
    titleWords = segmentor.segment(titleline)
Example #19

    def runTest(self):
        print "\nRunning TextRank Summary test cases...\n"
        self.assertEqual(TextRank.summaryGen("abc.txt", "abc"),
                         "Domain not in dataset")
Example #20
__author__ = 'nikhil'
import tfidf
import TextRank, utilities_tests


ltf = utilities_tests.test_corpus_probs_update(
    ["test_corpus_file_1.txt", "test_corpus_file_2.txt", "test_corpus_file_3.txt"],
    mode_of_operation=2, return_term=0)
atf = utilities_tests.test_corpus_probs_update(
    ["test_corpus_file_1.txt", "test_corpus_file_2.txt", "test_corpus_file_3.txt"],
    mode_of_operation=3, return_term=0)
text = open("test_corpus_file_1.txt").read()
tr_list = TextRank.text_rank(text)
print ltf
print atf
print tr_list

#---- Imported TextRank module ----#

#
#print(tr_list)
#for k,g in tr_list:
#    print k,g
Example #21
        reviewNo += 1
"""
Notes on using Gensim:
- Reviews are already pretty short. Stripping stopwords can reduce a sentence to a
corpus that is too short to discern (and train) a topic, which is why many of the
generated topics do not make sense.
- Ideally, a corpus (a sentence or more) should contain several words.
"""

# if not TEXTRANK:

# 	import gensim
# 	import hw2module as LDA

# 	# run preprocess(), which takes a list of words (sentence) and removes
# 	# all punctuation and stopwords from each word, returning the same structure.
# 	preprocessed_sentences_raw = [LDA.preprocess(s) for s in sentences]

# 	# create a gensim dictionary, save it to file
# 	gdict = LDA.saveInitialDictionary(preprocessed_sentences_raw)

# 	# experiment with number of topics
# 	LDA.make_and_show_lda_model(sentences, gdict, 15, show_docs = True)
import TextRank as tr

for asin, reviewlist in reviews.items():
    print("********* " + asin + " **********")
    for scoring in tr.score_keyphrases_by_textrank(' '.join(reviewlist),
                                                   n_keywords=0.25):
        print(scoring)
    print()
Example #22
def cal_textrank(window, alpha):
    # with open('停用词表.txt', 'r', encoding='utf-8') as ban:
    #     banlist = ban.read().splitlines()
    win = int(window)
    alpha = float(alpha)
    with open('./original/corpus1.txt', 'r', encoding='utf-8') as f:
        s = f.read().replace('\n', '').strip()
        tr = TextRank(s, win, alpha, 700)
        tr.cutSentence()
        tr.createNodes()
        tr.createMatrix()
        tr.calPR()
        tr.output_matrix()
        res = tr.printResult()
    textrank = ''
    for item in res:
        # if item[0].strip() in banlist:
        #     continue
        s = (str(tr.word_index[item[0]]) + ',' +
             str(item).replace('(', '').replace(')', '').replace("'", '') + '\n')
        textrank += s
    with open('./textrank.txt', 'w', encoding='utf-8') as w:
        w.write(textrank)
Example #23

def main(dom_choice, domain_list):
    if (dom_choice > len(domain_list)):
        print("Wrong choice")
        return "Wrong choice"
    domain = domain_list[dom_choice - 1]
    object_file = load(BRANDS_PARSED_PATH + '/' + domain.lower() + ".npz",
                       allow_pickle=True)
    object_file = object_file['arr_0'].tolist()
    print(object_file)
    prodslist = {}
    c = 0
    brandslist = {}
    for brand in object_file.keys():
        brandslist[c + 1] = brand
        print(str(c + 1) + ". " + brand + "\n")
        c += 1

    print("Enter your choice")
    ch = int(input())
    selectedBrand = brandslist[ch]
    print(selectedBrand)
    c = 0
    for prods in range(len(object_file[selectedBrand])):
        for prod in object_file[selectedBrand][prods].keys():
            prodslist[c + 1] = object_file[selectedBrand][prods][prod]
            print(str(c + 1) + ". " + prod + "\n")
        c += 1

    print("Enter your choice")
    ch = int(input())
    print("1.Summary using Text Rank")
    print("2.Summary using TF-IDF")
    print("Enter your choice")
    choice = int(input())

    summary = ""

    if choice == 1:
        print("Do you want to enable debugging (Y/N)?")
        ch_debug = input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            rankedText = TextRank.summaryGen(prodslist[ch],
                                             domain,
                                             debugging=True)
        else:
            rankedText = TextRank.summaryGen(prodslist[ch], domain)

        sleep(3)

    if choice == 2:
        print("Do you want to enable debugging (Y/N)?")
        ch_debug = input().lower()
        print("Do you want to enter the token size (Y/N)?")
        ch_token = input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            if ch_token == "y" or ch_token == "yes":
                print("Enter token size")
                token = int(input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch],
                                                     domain,
                                                     gram=token,
                                                     debug=True)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch],
                                                     domain,
                                                     debug=True)
        else:
            if ch_token == "y" or ch_token == "yes":
                print("Enter token size")
                token = int(input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch],
                                                     domain,
                                                     gram=token)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain)

    keys = keywords.extract_keywords(domain, prodslist[ch])
    rankedSummary = ""
    for i in range(len(rankedText)):
        rankedSummary += rankedText[i]
    stopwords = load_stop_words("../stoplist.txt")
    tokenizer = RegexpTokenizer(r"[\w']+", flags=re.UNICODE)
    tokens = tokenizer.tokenize(rankedSummary)
    tokens = [token for token in tokens if token.lower() not in stopwords]
    precision = float(len(set(tokens).intersection(set(keys)))) / float(
        len(tokens))
    recall = float(len(set(tokens).intersection(set(keys)))) / float(len(keys))
    fmeasure = 2 * (precision * recall) / (precision + recall)
    print("\n\n")
    print("Precision =", precision)
    print("Recall =", recall)
    print("F-Measure =", fmeasure)
		if not sentence_raw: break
		category_raw = inf.readline()[:-1]
		if category_raw == "\n":
			category_raw = None # indicates that no category was assigned.
		else:
			categorized_sentences[category_raw] += " " + sentence_raw
		sentences_tagged.append((sentence_raw, category_raw))

# list of all sentences that have assigned categories. These sentences are 
# converted into lists, split by whitespace.
split_sentences_raw = [x[0].split() for x in sentences_tagged if x[1] is not None]

if not TEXTRANK:

	# run preprocess(), which takes a list of words (sentence) and removes
	# all punctuation and stopwords from each word, returning the same structure.
	preprocessed_sentences_raw = [LDA.preprocess(s) for s in split_sentences_raw]

	# create a gensim dictionary, save it to file
	gdict = LDA.saveInitialDictionary(preprocessed_sentences_raw)

	# experiment with number of topics
	LDA.make_and_show_lda_model(preprocessed_sentences_raw, gdict, 20, show_docs = True)

else:
	import TextRank as tr
	for category, conjoined_sentences in categorized_sentences.items():
		print("********* " + category + " *********")
		for scoring in tr.score_keyphrases_by_textrank(conjoined_sentences):
			print(scoring)
		print()