Ejemplo n.º 1
0
def get_string(img_path):
    """Run OCR on an image file and hand the recognised text to the summarizer.

    The image is read with OpenCV, converted to grayscale, passed through a
    dilate/erode step and written to ``src_path`` twice (``removed_noise.png``
    and ``thres.png``). Tesseract then reads ``thres.png`` back from disk and
    the resulting text is passed to ``Summarizer.main``.

    Parameters:
        img_path: path of the image to read.

    Raises:
        IOError: if OpenCV cannot read ``img_path``.
    """
    # cv2.imread reports failure by returning None rather than raising; check
    # here so the caller gets a clear error instead of one from cvtColor.
    img = cv2.imread(img_path)
    if img is None:
        raise IOError("Could not read image: {}".format(img_path))

    # Convert to gray
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply dilation and erosion to remove some noise
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)

    # Write image after removed noise
    cv2.imwrite(src_path + "removed_noise.png", img)

    # NOTE: adaptive thresholding is currently disabled, so "thres.png" holds
    # the same pixels as "removed_noise.png".
    thresholded_path = src_path + "thres.png"
    cv2.imwrite(thresholded_path, img)

    # Recognize text with tesseract for python
    result = pytesseract.image_to_string(Image.open(thresholded_path))

    Summarizer.main(result)
Ejemplo n.º 2
0
 def __init__(self):
     """Initialise the base ActivityWriter and reset all per-activity state."""
     Importer.ActivityWriter.__init__(self)
     self.summarizer = Summarizer.Summarizer()

     # No activity is in progress yet.
     self.current_activity_id = \
         self.current_activity_type = \
         self.current_activity_start_time = None

     # Analyzers start out unset / empty.
     self.location_analyzer = None
     self.sensor_analyzers = list()
Ejemplo n.º 3
0
	def __init__(self, url, keyword_limit, sentence_limit):
		"""
		Create the summarizer, fetch the page and clear the metadata fields.

		Parameters:
			1) url - url to visit to gather information
			2) keyword_limit - use this many keywords when using algorithm
			3) sentence_limit - floating percentage that limits the amount of sentences gathered
		"""

		self.url = url
		self.summary = Summarizer(url, keyword_limit, sentence_limit)

		# NOTE(review): open_url() is defined elsewhere and presumably reads
		# self.url, so it has to run before the reset below - confirm.
		self.soup = self.open_url()

		# All metadata, including the url, starts out empty.
		self.title = self.author = self.description = self.url = None
Ejemplo n.º 4
0
def summarize():
    """Summarize the ``text`` field of a posted JSON body.

    Returns the plain string "Please post a JSON" when the request's content
    type is not ``application/json``. Otherwise returns a JSON response of
    the form ``{"text": <summary>}`` carrying a permissive CORS header.
    """
    # Compare only the media type: the header may be missing entirely or may
    # carry parameters such as "; charset=utf-8".
    content_type = request.headers.get('Content-Type', '')
    media_type = content_type.split(';', 1)[0].strip().lower()
    if media_type != 'application/json':
        return "Please post a JSON"

    # request.json is already the parsed body; no dumps/loads round trip needed.
    text = request.json["text"].strip()
    resp = jsonify({"text": summarizer.summarize_text(text)})

    resp.headers.add('Access-Control-Allow-Origin', '*')
    return resp
Ejemplo n.º 5
0
    def __init__(self):
        """Set default geometry, colours and summary state, then build the UI."""

        # window geometry and font
        self.WIDTH, self.HEIGHT = 600, 800
        self.FONT, self.FONT_SIZE = "helvetica", 12

        # colours specified as RGB fractions
        self.bg_input = [1, 1, 1]
        self.fg_input = [0, 0, 0]

        self.bg_article = [0, 0, 0]
        self.fg_min_article = [0.5, 0.5, 0.5]
        self.fg_max_article = [0.9, 0.9, 0.9]
        self.fg_solution_article = [1, 1, 1]

        # flip to True to invert every colour except fg_solution_article
        invert = False
        if invert:
            for attr in ("bg_input", "fg_input", "bg_article",
                         "fg_min_article", "fg_max_article"):
                setattr(self, attr, [1. - v for v in getattr(self, attr)])

        # text currently shown in the box / text of the entire article
        self.text = ""
        self.allText = ""
        # list of sentences in the article
        self.sentences = []
        # mapping from summary size to a k-hot encoding indicating which
        # sentences are in the summary
        self.solutions = []
        # (not used) how much weight is put on each sentence
        self.weights = []

        # summary controls
        self.only_summary = True
        self.summary_size = 1
        self.summary_coherence = 0.0
        self.summary_independence = 0.8

        self.summarizer = Summarizer(parent=self)

        self.root = Tk()

        self.draw(init=True)
Ejemplo n.º 6
0
def processinput(story,headline):
    """Run the saved headline model in inference mode on the given inputs.

    Parameters:
        story: passed to ``preprocess_texts_and_summaries`` as the texts.
        headline: passed to ``preprocess_texts_and_summaries`` as the summaries.

    Returns:
        The three values returned by ``summarizer_model_utils.sample_results``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; these vocabulary
    # files must only ever come from a trusted source.
    with open('w2iA.pickle','rb') as handle:
        word2ind=pickle.load(handle)
    with open('i2wA.pickle','rb') as handle:
        ind2word=pickle.load(handle)

    p_texts, p_summaries, _ = summarizer_data_utils.preprocess_texts_and_summaries(
        story, headline, keep_most=False)

    # keep only pairs where both the text and the summary are non-empty
    pairs = [(t, s) for t, s in zip(p_texts, p_summaries) if t != [] and s != []]
    p_texts_clean = [t for t, _ in pairs]
    p_summaries_clean = [s for _, s in pairs]

    # converts words in texts and summaries to indices
    c_texts, _ = summarizer_data_utils.convert_to_inds(
        p_texts_clean, word2ind, eos=False)
    c_summaries, _ = summarizer_data_utils.convert_to_inds(
        p_summaries_clean, word2ind, eos=True, sos=True)

    # model hyperparameters (only the ones inference actually uses)
    num_layers_encoder = 4
    num_layers_decoder = 4
    rnn_size_encoder = 300
    rnn_size_decoder = 300
    clip = 5

    model_path = '/home/mitali/models/headlines/my_modelA'
    pretrained_embeddings_path = '/home/mitali/tf_hub_embedding_headlinesA.npy'

    summarizer_model_utils.reset_graph()
    summarizer = Summarizer.Summarizer(word2ind,
                                       ind2word,
                                       model_path,
                                       'INFER',
                                       num_layers_encoder=num_layers_encoder,
                                       num_layers_decoder=num_layers_decoder,
                                       batch_size=len(c_texts),
                                       clip=clip,
                                       keep_probability=1.0,
                                       learning_rate=0.0,
                                       beam_width=5,
                                       rnn_size_encoder=rnn_size_encoder,
                                       rnn_size_decoder=rnn_size_decoder,
                                       inference_targets=False,
                                       pretrained_embeddings_path=pretrained_embeddings_path)
    summarizer.build_graph()
    preds = summarizer.infer(c_texts, restore_path=model_path, targets=c_summaries)

    # show results
    at, acs, cs = summarizer_model_utils.sample_results(
        preds, ind2word, word2ind, c_summaries, c_texts)
    return at, acs, cs
Ejemplo n.º 7
0
def getModelApi():
    """Return a ``(preprocessorApi, modelApi)`` pair of callables.

    Both closures share one Preprocessor and one Summarizer instance that are
    created once, when this function is called.
    """
    preprocessor = Preprocessing.Preprocessor()
    summarizer = Summarizer.Summarizer('./vocab', './')

    def preprocessorApi(article):
        # tokenize, re-join with spaces, then split on the '*N*' marker
        joined = ' '.join(preprocessor.tokenize(article))
        parts = joined.split('*N*')
        adjusted = preprocessor.adjust_article(parts)
        return adjusted.encode('utf-8')

    def modelApi(preprocessed_articles):
        summary = summarizer.summarize(preprocessed_articles)
        return summary

    return preprocessorApi, modelApi
Ejemplo n.º 8
0
def cli_interface():
    """Prompt for a reviews URL, fetch the reviews and summarize them.

    Prints the banner, reads a URL from standard input, fetches reviews with
    ``amazon.get_all_reviews_in_all_pages`` (``limit=25``) and passes the
    texts of the 5-score and the <=2-score reviews to ``s.get_summary``.
    """
    # Raw string: the ASCII art contains backslashes that are not valid escape
    # sequences; a raw literal yields the same text without warnings.
    print(r"""
    ____               _                       __  ___              __  
   / __ \ ___  _   __ (_)___  _      __ _____ /  |/  /____ _ _____ / /__
  / /_/ // _ \| | / // // _ \| | /| / // ___// /|_/ // __ `// ___// //_/
 / _, _//  __/| |/ // //  __/| |/ |/ /(__  )/ /  / // /_/ // /   / ,<   
/_/ |_| \___/ |___//_/ \___/ |__/|__//____//_/  /_/ \__,_//_/   /_/|_|  
                                                                        
<!--       _
       .__(.)< (Please, paste the URL to the reviews!)
        \___)   
 ~~~~~~~~~~~~~~~~~~-->""")
    url = input(">>> ").strip()
    reviews = amazon.get_all_reviews_in_all_pages(url, limit=25)

    # 5 counts as positive, 1-2 as negative; 3-4 are ignored
    prod_positives = [
        review['Text'] for review in reviews if review['Score'] == 5
    ]
    prod_negatives = [
        review['Text'] for review in reviews if review['Score'] <= 2
    ]

    s.get_summary(prod_positives, prod_negatives)
Ejemplo n.º 9
0
class TextBox:
    """Tkinter window showing an article and an adjustable summary of it.

    The user pastes text into the input box; the text is handed to the
    Summarizer and the article widget shows either the selected summary
    sentences or the whole article, depending on the toggle button.
    """

    def __init__(self):
        """Set default geometry, colours and summary state, then build the UI."""

        self.WIDTH = 600
        self.HEIGHT = 800
        self.FONT = "helvetica"
        self.FONT_SIZE = 12

        # colours specified as RGB fractions
        self.bg_input = [1, 1, 1]
        self.fg_input = [0, 0, 0]

        self.bg_article = [0, 0, 0]
        self.fg_min_article = [0.5, 0.5, 0.5]
        self.fg_max_article = [0.9, 0.9, 0.9]
        self.fg_solution_article = [1, 1, 1]  #[0.3, 0.5, 1.0] #[1, 0.7, 0.4]

        invert = False
        if invert:
            self.bg_input = [1. - v for v in self.bg_input]
            self.fg_input = [1. - v for v in self.fg_input]

            self.bg_article = [1. - v for v in self.bg_article]
            self.fg_min_article = [1. - v for v in self.fg_min_article]
            self.fg_max_article = [1. - v for v in self.fg_max_article]

        self.text = ""  # what is shown in the box
        self.allText = ""  # the text for the entire article
        self.sentences = []  # list of sentences in article
        # dictionary mapping from size to k-hot encoding indicating
        # which sentences are in the summary
        self.solutions = []
        # (not used) how much weight is put on each sentence
        self.weights = []

        self.only_summary = True
        self.summary_size = 1
        self.summary_coherence = 0.0
        self.summary_independence = 0.8

        self.summarizer = Summarizer(parent=self)

        self.root = Tk()

        self.draw(init=True)

        # The Tk main loop is not started here.

    def draw(self, init=False):
        """Create and configure all widgets when ``init`` is true.

        With ``init`` false this method does nothing.
        """

        if init:
            # show main article body
            self.tk_article = ScrolledText(self.root)

            # let user paste and enter text
            self.tk_user_input = ScrolledText(self.root)

            self.tk_summary_size_scale = Scale(self.root)
            self.tk_summary_size_scale_label = Label(self.root, text="Length")

            self.tk_summary_coherence_scale = Scale(self.root)
            self.tk_summary_coherence_scale_label = Label(self.root,
                                                          text="Coherence")

            self.tk_summary_independence_scale = Scale(self.root)
            self.tk_summary_independence_scale_label = Label(
                self.root, text="Independence")

            self.tk_toggle_view = Button(self.root,
                                         text="more",
                                         command=self.handleToggleView)
            self.tk_recalculate = Button(self.root,
                                         text="Update",
                                         command=self.handleRecalculate)

            self.root.geometry("%dx%d" % (self.WIDTH, self.HEIGHT))
            self.root.title("QuickReader V4")

            self.tk_article.configure(width=25,
                                      height=6,
                                      bd=0,
                                      highlightthickness=0,
                                      wrap="word",
                                      font=self.FONT)

            self.tk_user_input.configure(width=25,
                                         height=3,
                                         bd=0,
                                         highlightthickness=0,
                                         wrap="word",
                                         font=self.FONT)

            # The scale callbacks ignore the value Tk passes in and read the
            # current value from the widget instead.
            self.tk_summary_size_scale.configure(
                bd=0,
                from_=0,
                to=20,
                orient=HORIZONTAL,
                sliderrelief=FLAT,
                command=lambda event: self.handleSlider(
                    self.tk_summary_size_scale.get()))

            self.tk_summary_coherence_scale.configure(
                bd=0,
                from_=0,
                to=1,
                orient=HORIZONTAL,
                sliderrelief=FLAT,
                resolution=0.05,
                command=lambda event: self.handleCoherenceSlider(
                    self.tk_summary_coherence_scale.get()))

            self.tk_summary_coherence_scale.set(self.summary_coherence)

            self.tk_summary_independence_scale.configure(
                bd=0,
                from_=0,
                to=1.5,
                orient=HORIZONTAL,
                sliderrelief=FLAT,
                resolution=0.05,
                command=lambda event: self.handleIndependenceSlider(
                    self.tk_summary_independence_scale.get()))

            self.tk_summary_independence_scale.set(self.summary_independence)

            # set colours
            self.root.configure(background="black")

            self.tk_summary_size_scale.configure(troughcolor="#444444",
                                                 fg="black",
                                                 background="white",
                                                 activebackground="#bbbbbb")

            self.tk_summary_coherence_scale.configure(
                troughcolor="#444444",
                fg="black",
                background="white",
                activebackground="#bbbbbb")

            self.tk_summary_independence_scale.configure(
                troughcolor="#444444",
                fg="black",
                background="white",
                activebackground="#bbbbbb")

            self.tk_article.configure(bg=toHex(self.bg_article),
                                      fg="white",
                                      insertbackground="blue")
            self.tk_article.vbar.configure(bg="white",
                                           width=10,
                                           troughcolor="black")

            self.tk_user_input.configure(bg=toHex(self.bg_input),
                                         fg=toHex(self.fg_input),
                                         insertbackground="blue")
            self.tk_user_input.vbar.configure(bg="white",
                                              width=10,
                                              troughcolor="black")

            self.tk_user_input.focus()
            self.tk_user_input.bind("<KeyRelease-Return>",
                                    (lambda event: self.handleUserInput(
                                        self.tk_user_input.get("0.0", END))))
            self.root.bind("<Configure>", self.resize)

    def setText(self, text, redraw=False):
        """Store the text to display; refresh the article widget if ``redraw``."""
        self.text = text
        if redraw: self.updateArticleInfo()

    def setSentences(self, sentences, redraw=False):
        """Store the sentence list; refresh the article widget if ``redraw``."""
        self.sentences = sentences
        if redraw: self.updateArticleInfo()

    def setSolutions(self, solutions, redraw=False):
        """Store the solutions; refresh the article widget if ``redraw``."""
        self.solutions = solutions
        if redraw: self.updateArticleInfo()

    def setWeights(self, weights, redraw=False):
        """Store the sentence weights; refresh the article widget if ``redraw``."""
        self.weights = weights
        if redraw: self.updateArticleInfo()

    def handleToggleView(self):
        """Switch between summary-only and full-article view."""

        print("View toggle!")

        self.only_summary = not self.only_summary

        if self.only_summary:
            self.tk_toggle_view.configure(text="more")
        else:
            self.tk_toggle_view.configure(text="less")

        self.updateSummary()

    def handleRecalculate(self):
        """Re-run the summarizer on the current article text."""
        print("Update!")

        self.handleUserInput(self.allText)

    def handleSlider(self, value):
        """Store the new summary size and refresh the displayed summary."""

        print("Slider:", value)

        self.summary_size = value

        self.updateSummary()

    def handleCoherenceSlider(self, value):
        """Store the new coherence weight (takes effect on the next update)."""

        print("Coherence Slider:", value)

        self.summary_coherence = value

    def handleIndependenceSlider(self, value):
        """Store the new independence weight (takes effect on the next update)."""

        print("Independence Slider:", value)

        self.summary_independence = value

    def updateSummary(self):
        """Show the summary for the current size, or the whole article."""

        size = self.summary_size

        if self.only_summary and size != 0:
            # self.solutions is indexed inside the comprehension on purpose:
            # with no sentences loaded it is never touched.
            self.setText('\n\n'.join([
                sentence for i, sentence in enumerate(self.sentences)
                if self.solutions[size][i] == 1
            ]))
        else:
            self.setText(self.allText, redraw=False)

        self.updateArticleInfo()

        self.setWeights([0. for _ in self.sentences], redraw=True)

        # scroll back to the top of the article
        self.tk_article.yview_moveto(0)

    def handleUserInput(self, inStr):
        """Summarize ``inStr`` and display the result.

        The input box is cleared first. Blank input is ignored. Non-ASCII
        characters are dropped before the text is passed to the summarizer.
        """
        self.tk_user_input.delete("0.0", END)

        if inStr.strip() == "": return

        text = ''.join([ch for ch in inStr if ord(ch) < 128])

        self.setText(text, redraw=False)
        self.setSolutions([], redraw=False)
        self.setWeights([], redraw=True)

        # text: all the text in one long string
        # sentences: the text split up into a list of sentences
        # solutions: dictionary mapping summary size to a one-hot vector over
        #   the sentences, indicating which sentences are in the summary
        text, sentences, solutions = self.summarizer.summarize(
            text,
            coherence_weight=self.summary_coherence,
            independence_weight=self.summary_independence,
            size_weight=1.,
            beam_width=3,
            hard_size_limit=None)

        # the text should be the same, but update it anyway since it needs to
        # contain exactly the same content as the sentences
        self.allText = text
        self.sentences = sentences
        self.solutions = solutions

        # size 0 means "every sentence"
        self.solutions[0] = [1. for _ in sentences]

        # get max length for summary
        max_len = max(solutions.keys())
        set_len = min(max_len, 3)
        self.tk_summary_size_scale.configure(from_=0, to=max_len)
        self.tk_summary_size_scale.set(set_len)
        self.summary_size = set_len

        self.updateSummary()

    def resize(self, event=None):
        """Lay out every widget for the current window size.

        Bound to the root window's ``<Configure>`` event; ``event`` is unused.
        """
        LINEH = 20.0

        pixelX = self.root.winfo_width()
        pixelY = self.root.winfo_height()

        # set toggle and recalculate button
        toggleW = 50
        toggleH = 35 * 1
        self.tk_toggle_view.place(x=pixelX - toggleW,
                                  y=0,
                                  width=toggleW,
                                  height=toggleH)

        updateW = 50
        updateH = 35 * 2
        self.tk_recalculate.place(x=pixelX - updateW,
                                  y=toggleH,
                                  width=updateW,
                                  height=updateH)

        labelW = 90

        # set position of size scale
        scaleW = pixelX - updateW - labelW
        scaleH = 35
        self.tk_summary_size_scale.place(x=labelW,
                                         y=0,
                                         width=scaleW,
                                         height=scaleH)

        self.tk_summary_size_scale_label.place(x=0,
                                               y=0,
                                               width=labelW,
                                               height=scaleH)

        # set position of coherence scale
        coherenceH = 35
        self.tk_summary_coherence_scale.place(x=labelW,
                                              y=scaleH,
                                              width=scaleW,
                                              height=scaleH)

        self.tk_summary_coherence_scale_label.place(x=0,
                                                    y=scaleH,
                                                    width=labelW,
                                                    height=coherenceH)

        # set position of independence scale
        independenceH = 35
        self.tk_summary_independence_scale.place(x=labelW,
                                                 y=scaleH + coherenceH,
                                                 width=scaleW,
                                                 height=scaleH)

        self.tk_summary_independence_scale_label.place(x=0,
                                                       y=scaleH + coherenceH,
                                                       width=labelW,
                                                       height=independenceH)

        # update user input
        inputH = int(3.0 * LINEH)
        self.tk_user_input.place(x=0,
                                 y=pixelY - inputH,
                                 width=pixelX,
                                 height=inputH)

        # update article: it fills the space between the scales and the input
        controlsH = scaleH + coherenceH + independenceH
        self.tk_article.place(x=0,
                              y=controlsH,
                              width=pixelX,
                              height=pixelY - inputH - controlsH)

    def updateArticleInfo(self):
        """Redraw the article widget from ``self.text`` and recolour it."""

        self.articleClear()

        self.articleCat(self.text)

        if self.weights != []:
            self.articleColour()

        self.root.update()

    def articleClear(self):
        """Remove all text from the article widget."""
        self.tk_article.delete("1.0", END)
        self.tk_article.update()

        self.root.update()

    def articleCat(self, inStr):
        """Append ``inStr`` to the article widget and scroll to the end."""

        self.tk_article.insert(END, inStr)

        self.tk_article.yview(END)

    def articleColour(self):
        """Tag each displayed sentence with its foreground colour and font.

        Sentences in the current solution get ``fg_solution_article``; the
        others get ``fg_min_article``. In summary-only view, sentences outside
        the solution are skipped.
        """

        solution = self.solutions[self.summary_size]

        for i, sentence in enumerate(self.sentences):
            in_summary = solution[i] == 1
            if self.only_summary and not in_summary: continue

            countVar = StringVar(self.root)
            pos = self.tk_article.search(sentence,
                                         "1.0",
                                         stopindex="end",
                                         count=countVar)
            # search returns an empty string when the sentence is not in the
            # widget; tagging from an empty index would raise a TclError.
            if not pos:
                continue

            tagNameA = ''.join([
                random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(10)
            ])

            L_Colour = (self.fg_solution_article
                        if in_summary else self.fg_min_article)

            self.tk_article.tag_add(tagNameA, pos,
                                    "{} + {}c".format(pos, countVar.get()))

            font = (self.FONT, self.FONT_SIZE, "normal")
            self.tk_article.tag_config(tagNameA,
                                       foreground=toHex(L_Colour),
                                       font=font)

        self.root.update()
Ejemplo n.º 10
0
def main():
    """Train the headline summarizer on ``./Data.csv`` and show sample results.

    Steps: load and filter the data, preprocess it, build the vocabulary,
    save TF-Hub word embeddings to ``./tf_hub_embedding_headlines.npy``, train
    a model saved under ``./models/headlines/my_model``, then rebuild the
    graph in inference mode and pass predictions for the first five examples
    to ``summarizer_model_utils.sample_results``.
    """

    data = pd.read_csv('./Data.csv')

    # keep only texts whose length is strictly between 100 and 2000
    raw_texts = []
    raw_summaries = []
    for text, summary in zip(data.Text, data.Summary):
        if 100 < len(text) < 2000:
            raw_texts.append(text)
            raw_summaries.append(summary)

    processed_texts, processed_summaries, words_counted = summarizer_data_utils.preprocess_texts_and_summaries(
        raw_texts,
        raw_summaries,
        keep_most=False
    )

    specials = ["<EOS>", "<SOS>", "<PAD>", "<UNK>"]
    word2ind, ind2word, missing_words = summarizer_data_utils.create_word_inds_dicts(
        words_counted, specials=specials)
    print(len(word2ind), len(ind2word), len(missing_words))

    # embed every vocabulary word, in word2ind's iteration order
    embed = hub.Module("https://tfhub.dev/google/Wiki-words-250/1")
    emb = embed(list(word2ind))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        embedding = sess.run(emb)

    pretrained_embeddings_path = './tf_hub_embedding_headlines.npy'
    np.save(pretrained_embeddings_path, embedding)

    converted_texts, _ = summarizer_data_utils.convert_to_inds(
        processed_texts, word2ind, eos=False)
    converted_summaries, _ = summarizer_data_utils.convert_to_inds(
        processed_summaries, word2ind, eos=True, sos=True)

    # model hyperparameters
    num_layers_encoder = 4
    num_layers_decoder = 4
    rnn_size_encoder = 250
    rnn_size_decoder = 250

    batch_size = 10
    epochs = 2
    clip = 2
    keep_probability = 0.8
    learning_rate = 0.0005
    max_lr = 0.005
    learning_rate_decay_steps = 100
    learning_rate_decay = 0.90
    use_cyclic_lr = True

    model_path = './models/headlines/my_model'

    # build graph and train the model
    summarizer_model_utils.reset_graph()
    summarizer = Summarizer.Summarizer(word2ind,
                                       ind2word,
                                       save_path=model_path,
                                       mode='TRAIN',
                                       num_layers_encoder=num_layers_encoder,
                                       num_layers_decoder=num_layers_decoder,
                                       rnn_size_encoder=rnn_size_encoder,
                                       rnn_size_decoder=rnn_size_decoder,
                                       batch_size=batch_size,
                                       clip=clip,
                                       keep_probability=keep_probability,
                                       learning_rate=learning_rate,
                                       max_lr=max_lr,
                                       learning_rate_decay_steps=learning_rate_decay_steps,
                                       learning_rate_decay=learning_rate_decay,
                                       epochs=epochs,
                                       pretrained_embeddings_path=pretrained_embeddings_path,
                                       use_cyclic_lr=use_cyclic_lr)

    summarizer.build_graph()
    summarizer.train(converted_texts,
                     converted_summaries)

    # rebuild in inference mode and predict for the first five examples
    sample_texts = converted_texts[:5]
    sample_summaries = converted_summaries[:5]

    summarizer_model_utils.reset_graph()
    summarizer = Summarizer.Summarizer(word2ind,
                                       ind2word,
                                       model_path,
                                       'INFER',
                                       num_layers_encoder=num_layers_encoder,
                                       num_layers_decoder=num_layers_decoder,
                                       batch_size=len(sample_texts),
                                       clip=clip,
                                       keep_probability=1.0,
                                       learning_rate=0.0,
                                       beam_width=5,
                                       rnn_size_encoder=rnn_size_encoder,
                                       rnn_size_decoder=rnn_size_decoder,
                                       inference_targets=False,
                                       pretrained_embeddings_path=pretrained_embeddings_path)

    summarizer.build_graph()
    preds = summarizer.infer(inputs=sample_texts,
                             restore_path=model_path,
                             targets=sample_summaries)

    # show results
    summarizer_model_utils.sample_results(preds,
                                          ind2word,
                                          word2ind,
                                          sample_summaries,
                                          sample_texts)
Ejemplo n.º 11
0
class Info:
	"""Class that handles the information for the article and is also responsible
	for obtaining the summary for the article.
	"""

	def __init__(self, url, keyword_limit, sentence_limit):
		"""
		Initialization function for this class.

		Parameters:
			1) url - url to visit to gather information
			2) keyword_limit - use this many keywords when using algorithm
			3) sentence_limit - floating percentage that limits the amount of sentences gathered
		"""

		self.url = url
		self.summary = Summarizer(url, keyword_limit, sentence_limit)
		self.soup = self.open_url()
		self.title = None
		self.author = None
		self.description = None
		# The fetch address is dropped once the page has been read;
		# gather_info() stores the article link it finds in the page.
		self.url = None


	def run(self):
		"""Driver function for this class."""

		#Gather relevant info
		self.gather_info()

		#Print relevant info to text file
		self.print_info()

		self.summary.fill_ignored()

		#Grab text from contents and pass to Summarizer module
		text = self.summary.clean_text()

		#Obtain summary
		summary = self.summary.grab_summary(text)

		#Print summary to text file
		self.summary.print_summary(summary)


	def open_url(self, max_retries=5):
		"""Open the url and return the parsed contents.

		A 503 response is retried up to max_retries times. Any other error,
		or a 503 that persists after the retries, is re-raised to the caller.
		"""
		attempts = 0
		while True:
			try:
				response = urlopen(self.url)
				html = response.read()
			except OSError as e:
				# Not every OSError carries an HTTP status code, so read it
				# defensively instead of assuming e.code exists.
				if getattr(e, "code", None) != 503 or attempts >= max_retries:
					raise
				attempts += 1
				print("Querying site again!")
				continue

			return BeautifulSoup(html, "html.parser")


	@staticmethod
	def _first_word(text):
		"""Return the part of text before its first space (all of it if there is none)."""
		return text.partition(" ")[0]


	def gather_info(self):
		"""
		Parse the contents of the soup in order to obtain the title and url
		for the article.

		The title is the first word of the page's <title>. The url is the href
		of the first <link> that contains both the title and
		"https://en.wikipedia.org". Either is None when it cannot be found.
		"""

		title_tag = self.soup.find("title")

		self.title = None
		if title_tag is not None and title_tag.contents:
			self.title = self._first_word(title_tag.contents[0])

		self.url = None
		if self.title is None:
			# Without a title there is nothing to match the links against.
			return

		for link in self.soup.findAll("link"):
			href = link.get('href')
			if not href:
				continue
			if self.title in href and "https://en.wikipedia.org" in href:
				self.url = href
				break


	def print_info(self):
		"""Print out the relevant info where applicable."""

		#Write all the information to a text file that can handle utf-8 to avoid encoding errors.
		with open("summary.txt", "wb") as f:
			f.write(("Title: {}\n".format(self.title)).encode('utf-8'))
			f.write(("Url: {}\n".format(self.url)).encode('utf-8'))
Ejemplo n.º 12
0
def _normalizeArticleDate(rawDate):
    """Return rawDate as a 'YYYY-MM-DD' string, or None if no known format fits."""
    try:
        # Already in the target format: keep the string exactly as given.
        datetime.strptime(rawDate, "%Y-%m-%d")
        return rawDate
    except ValueError:
        pass

    # strptime's %b only understands three-letter month abbreviations.
    cleaned = rawDate.replace("Sept", "Sep")
    candidates = (
        (cleaned, "%b. %d, %Y"),
        # Timestamp with a UTC offset
        (cleaned, "%Y-%m-%dT%H:%M:%S%z"),
        # Last resort for other ISO-like timestamps (e.g. fractional seconds):
        # the first ten characters are the calendar date.
        (cleaned[:10], "%Y-%m-%d"),
    )
    for value, fmt in candidates:
        try:
            return datetime.strptime(value, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return None


def ripArticlesChild(fargs):
    """
    Fetch, summarize and store keywords for a list of article links.

    fargs is an indexable of four items:
        fargs[0] - stockName: name used for the saved JSON file
        fargs[1] - links: article urls to read
        fargs[2] - refArt: if true, discard saved articles and start from scratch
        fargs[3] - upArt: if true, add new articles to the saved ones

    Results are kept in summarizedArticles/<stockName>.json as
    [ {date: [keywords, ...]}, [included urls] ].  When saved data exists and
    neither refArt nor upArt is set, it is returned without fetching anything.

    Links whose page has no <time> element, or whose date cannot be parsed,
    are recorded as seen but contribute no keywords.

    Returns the tuple (stockName, data).
    """
    stockName = fargs[0]
    links = fargs[1]
    refArt = fargs[2]  # Clear out saved articles and summarize from scratch
    upArt = fargs[3]  # Add new articles to saved

    # https://github.com/miso-belica/sumy
    # XPATH of parent container of article body=    //*[@id="story"]/section
    # XPATH of first body subsection=     //*[@id="story"]/section/div[1]
    savePath = 'summarizedArticles/' + stockName + '.json'

    if not refArt and os.path.isfile(savePath):
        with open(savePath, 'r') as saved:
            # [ {datetime:[summarized article]}, [included urls] ]
            # [ dict of lists, list ]
            data = json.load(saved)
        if not upArt:
            # If not updating the articles
            return stockName, data
    else:
        # Resetting article data, or nothing saved yet
        data = [defaultdict(list), []]

    # Compiled once; it does not depend on the link being processed.
    bodyClass = re.compile('.*StoryBodyCompanionColumn.*')

    for link in links:
        if link in data[1]:
            continue
        data[1].append(link)
        r = requests.get(link)
        soup = BeautifulSoup(r.content, 'html.parser')

        # Get date of publication
        timeTag = soup.find('time')
        if timeTag is None:
            # Not an article
            continue
        rawDate = timeTag.get('datetime')
        articleDate = _normalizeArticleDate(rawDate) if rawDate else None
        if articleDate is None:
            # Skip this one page instead of losing the whole batch to an
            # unrecognised date format.
            continue

        # Concatenate the text of every paragraph in every body column.
        columns = soup.find_all('div', attrs={'class': bodyClass})
        text = ''.join(p.text for column in columns for p in column.find_all('p'))

        summarizedText = Summarizer.getSummary(text.strip())
        keywords = Summarizer.getKeywords(summarizedText)
        # setdefault works for both the fresh defaultdict and the plain dict
        # that json.load returns.
        data[0].setdefault(articleDate, []).append(keywords)

    # The output directory may not exist on a first run.
    os.makedirs('summarizedArticles', exist_ok=True)
    with open(savePath, 'w') as out:
        json.dump(data, out, separators=(',', ':'))

    print("Finished %s" % stockName)
    return stockName, data
import ConfigParser

# Settings come from config.ini next to the script.
config = ConfigParser.ConfigParser()
config.read("config.ini")
module_name = "Summarizer_Loop"
logger_format = "%(asctime)s [%(module)s]: %(message)s"


def _file_handler(path, level):
    # Build a log-file handler that records `level` and above in the shared format.
    handler = logging.FileHandler(path)
    handler.setLevel(level)
    handler.setFormatter(logging.Formatter(logger_format))
    return handler


logging.basicConfig(level=logging.DEBUG, format=logger_format)
logger = logging.getLogger(config.get("Logger", "logger_name"))

# Two log files: one capturing everything from DEBUG up, one from INFO up.
debug_log = _file_handler("Summarizer_Loop Debug.log", logging.DEBUG)
info_log = _file_handler("Summarizer_Loop Info.log", logging.INFO)
logger.addHandler(debug_log)
logger.addHandler(info_log)

logger.info("---------- Begin Summarizer_Loop execution ----------")

# Run the summarizer forever, pausing between runs.
while True:
    logger.info("----- Excecuting Summarizer -----")
    Summarizer.main()

    # Append a timestamped line to the plain-text execution log.
    with open("Summarizer_Loop Execution Log.txt", "a") as execution_log:
        finished_at = time.strftime("%m/%d/%y %H:%M:%S", time.localtime())
        execution_log.write("Execution completed at %r" % finished_at)
        execution_log.write("\n")

    # wait_time is read on every pass and is expressed in minutes.
    wait_minutes = config.getfloat(module_name, "wait_time")
    logger.info("Waiting for %r minutes before next execution." % wait_minutes)
    logger.info("Start wait...")
    time.sleep(wait_minutes * 60)
Ejemplo n.º 14
0
Python 3.4.1 (v3.4.1:c0e311e010fc, May 18 2014, 10:38:22) [MSC v.1600 32 bit (Intel)] on win32
Type "copyright", "credits" or "license()" for more information.
>>> ================================ RESTART ================================
>>> 
>>> import Summarizer as summe
>>> summe.getSentences("""	Since the launch of the Nintendo Entertainment System in 1983, Nintendo has been offering the world unique and original entertainment products under the development concept of hardware and software integration. In the field of home entertainment, the video game industry is one of the few industries established in Japan that spread around the world, and Nintendo has established itself as a well-known brand truly representing video game culture throughout the world.

	With the belief that the raison d'etre of entertainment is to put smiles on people's faces around the world through products and services, what we have focused on for the last decade is our basic strategy of expanding the gaming population by offering products which can be enjoyed by everyone regardless of age, gender or gaming experience. In addition, as the business environment around us has shifted with the times, we have decided to redefine entertainment as something that improves people's quality of life ("QOL") in enjoyable ways and expand our business areas. What Nintendo will try to achieve in the next 10 years is a platform business that improves people's QOL in enjoyable ways.

	We believe that we can capitalize the most on our strengths through a hardware-software integrated platform business, and therefore this type of dedicated video game platforms will remain our core focus. We will continue to value the spirit of originality described in our motto "The True Value of Entertainment Lies in Individuality," and will continue to provide products and services which pleasantly surprise people.

	With a platform business that improves people's QOL in enjoyable ways, we will attempt to establish a new business area apart from our dedicated video game business. We have set "health" as the theme for our first step and we will try to use our strength as an entertainment company to create unique approaches that expand this business. Through our new endeavors with the QOL-improving platform, we strive to further promote our existing strategy of expanding our user base, create an environment in which more people are conscious about their health and in turn expand Nintendo's overall user base.

	After Nintendo started the manufacture and sale of Hanafuda (traditional Japanese playing cards) 125 years ago, it has innovated itself from a playing card company to a toy company, a toy company to an electronic toy company and finally from an electronic toy company to a company developing video game platforms. Nintendo has continued to try new things, and with a history of experiencing many failures and small successes, we managed to pioneer the home video game market. What has remained the same from the past is that we have always tried to create something new from materials and technologies available at that time, to position entertainment as our core business and to improve people's QOL in enjoyable ways. We will continue to value self-innovation in line with the times and aim for growth.

	Nintendo intends to make progress with the support and encouragement of its shareholders and investors.""")
en Since the launch of the Nintendo Entertainment System in 1983, Nintendo has been offering the world unique and original entertainment products under the development concept of hardware and software integration.
en In the field of home entertainment, the video game industry is one of the few industries established in Japan that spread around the world, and Nintendo has established itself as a well-known brand truly representing video game culture throughout the world.
en With the belief that the raison d'etre of entertainment is to put smiles on people's faces around the world through products and services, what we have focused on for the last decade is our basic strategy of expanding the gaming population by offering products which can be enjoyed by everyone regardless of age, gender or gaming experience.
en In addition, as the business environment around us has shifted with the times, we have decided to redefine entertainment as something that improves people's quality of life ("QOL") in enjoyable ways and expand our business areas.
en What Nintendo will try to achieve in the next 10 years is a platform business that improves people's QOL in enjoyable ways.
en We believe that we can capitalize the most on our strengths through a hardware-software integrated platform business, and therefore this type of dedicated video game platforms will remain our core focus.
en We will continue to value the spirit of originality described in our motto "The True Value of Entertainment Lies in Individuality," and will continue to provide products and services which pleasantly surprise people.
en With a platform business that improves people's QOL in enjoyable ways, we will attempt to establish a new business area apart from our dedicated video game business.
en We have set "health" as the theme for our first step and we will try to use our strength as an entertainment company to create unique approaches that expand this business.
en Through our new endeavors with the QOL-improving platform, we strive to further promote our existing strategy of expanding our user base, create an environment in which more people are conscious about their health and in turn expand Nintendo's overall user base.
en After Nintendo started the manufacture and sale of Hanafuda (traditional Japanese playing cards) 125 years ago, it has innovated itself from a playing card company to a toy company, a toy company to an electronic toy company and finally from an electronic toy company to a company developing video game platforms.
en Nintendo has continued to try new things, and with a history of experiencing many failures and small successes, we managed to pioneer the home video game market.
en What has remained the same from the past is that we have always tried to create something new from materials and technologies available at that time, to position entertainment as our core business and to improve people's QOL in enjoyable ways.
en We will continue to value self-innovation in line with the times and aim for growth.
en Nintendo intends to make progress with the support and encouragement of its shareholders and investors.
Ejemplo n.º 15
0
inference_targets = True

# Index that splits the data: the first ~90% is used for training, the rest
# for validation.
d = round(0.9 * len(converted_summaries))

summarizer_model_utils.reset_graph()

# Keyword settings for the model, gathered in one place.
model_options = dict(
    save_path='./models/sogou/my_model',
    mode='TRAIN',
    num_layers_encoder=num_layers_encoder,
    num_layers_decoder=num_layers_decoder,
    rnn_size_encoder=rnn_size_encoder,
    rnn_size_decoder=rnn_size_decoder,
    batch_size=32,
    clip=clip,
    keep_probability=keep_probability,
    learning_rate=learning_rate,
    max_lr=max_lr,
    learning_rate_decay_steps=learning_rate_decay_steps,
    learning_rate_decay=learning_rate_decay,
    epochs=epochs,
    pretrained_embeddings_path=None,  # pretrained_embeddings_path,
    use_cyclic_lr=use_cyclic_lr,
    summary_dir=None,  # summary_dir
)
summarizer = Summarizer.Summarizer(word2ind, ind2word, **model_options)

summarizer.build_graph()

# Train on the head of the data and validate on the held-out tail.
summarizer.train(
    converted_texts[:d],
    converted_summaries[:d],
    validation_inputs=converted_texts[d:],
    validation_targets=converted_summaries[d:],
)
Ejemplo n.º 16
0
from Summarizer import *

# enter links into the 'links.txt' file. If you just want to use the summarizer, you can download that specific file.
# Read one url per line from 'links.txt'; the with-block closes the file even
# if the loop is interrupted.
with open('links.txt', 'r') as f:
    for url in f:
        # Lines keep their trailing newline, which is not part of the url.
        url = url.strip()
        if not url:
            continue

        #input error handling
        try:
            summaryLength = int(input('Enter how many lines do you want in the summary: '))
            print(url, '\n')
            summary, polarity, subjectivity = Summarizer(url, summaryLength).summarize_text()
            print(summary)
            print('\nPolarity (-1 to 1):', polarity)
            # Report the subjectivity score itself, not polarity a second time.
            print('Subjectivity (-1 to 1):', subjectivity, '\n')
        except Exception:
            # Exception (not a bare except) so Ctrl-C still stops the script.
            print('Input gave an error!')
Ejemplo n.º 17
0
    def handleSummarizeDocument(self, form):
        """
        Summarize an uploaded course PDF and write the result as JSON.

        Reads two fields from `form`:
            pdfName - file name of the course document under upload/Cours/
            dsNames - optional '+'-separated file names under upload/DS/;
                      absent, empty or the literal "null" means none

        Writes to self.wfile a JSON list with one object per section of the
        document (the "Sommaire" and "Table des matières" sections and any
        section whose summary is the string "false" are left out).  Each
        object has the keys "titre", "motsImportants" and "ideesCles".

        When pdfName is missing, writes "No object pdfName" and returns
        without producing a summary.
        """
        try:
            pdfName = form.getvalue("pdfName") or ""
        except (AttributeError, KeyError, TypeError):
            pdfName = ""

        if not pdfName:
            # Nothing to summarize: report it and stop, rather than going on
            # to open the bare upload directory.
            self.wfile.write(bytes("No object pdfName", "utf-8"))
            return

        # NOTE(review): pdfName and dsNames come straight from the request and
        # are concatenated into filesystem paths; confirm they are validated
        # upstream (e.g. no "../" components).
        classSummarizer = Summarizer.SummaryMaker("upload/Cours/" + pdfName)
        classSummarizer.organizePoly()

        # getvalue() returns None for a missing field instead of raising.
        dsNames = form.getvalue("dsNames")
        if dsNames and dsNames != "null":
            lesDS_Path = ["upload/DS/" + d for d in dsNames.split('+')]
            classSummarizer.addNewDS(lesDS_Path)

        list_jsonElement = []
        for partie in classSummarizer.map_nomParties_contenu:
            # The table of contents is not summarized.
            if partie in ("Sommaire", "Table des matières"):
                continue

            # Generate the summary; "false" marks a section to leave out.
            summary = classSummarizer.summarisePartie(partie)
            if summary == "false":
                continue

            list_jsonElement.append({
                # Title
                "titre": partie,
                # Important words (looked up after the summary is generated);
                # a single empty string when there are none
                "motsImportants": (
                    classSummarizer.map_nomParties_motsImportants[partie] or [""]),
                # Key ideas; a single empty string when the summary is empty
                "ideesCles": list(summary) if summary else [""],
            })

        self.wfile.write(
            json.dumps(list_jsonElement, ensure_ascii=True).encode())