Example #1
def summarize(filename,option):
    #LexicalChain(os.getcwd()+'/../filename')
    summary = textrank(filename,original=option,words=100)
    op_name='summary'+filename[4:]
    text_file = open('../RougeEval/syssum/'+option+'/'+op_name, "w")
    text_file.write(summary)
    text_file.close()
    return
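For reference, the same write-out can be expressed with a context manager. This is a minimal sketch only: the ranking call is passed in as a stub because the project's textrank implementation is not shown, and the paths and slicing are illustrative assumptions.
import os

def summarize_to_file(filename, option, textrank_fn):
    # textrank_fn stands in for the project's textrank(filename, original=..., words=...) helper
    summary = textrank_fn(filename, original=option, words=100)
    op_name = 'summary' + filename[4:]                      # mirrors the naming scheme above
    out_path = os.path.join('../RougeEval/syssum', option, op_name)
    with open(out_path, 'w') as text_file:                  # closed even if the write fails
        text_file.write(summary)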
Example #2
def doMagic(userId, channelId, token):
    # Get all unseen messages
    data = getMessageData(channelId, token)

    messageTexts = []

    for message in data:
        txt = message['text'].decode('utf-8', 'ignore')
        messageTexts.append(txt)

    # # Group into clusters
    msgs = generateListOfMessages(data)
    clusterIndicies = getClusters(msgs)
    labels = clusterIndicies.keys()

    messageClusters = []
    for label in labels:
        indicies = clusterIndicies.get(label)
        cluster = []
        for index in indicies:
            cluster.append(messageTexts[index])
        messageClusters.append(cluster)

    # Find important clusters
    res = []
    for cluster in messageClusters:
        document = ""
        for message in cluster:
            document += message + " "

        text_ranks = textrank(document)

        numMessages = math.ceil(len(text_ranks) * .3)

        importantMessagesInCluster = []
        for i in range(int(numMessages)):
            item = {"text": text_ranks[i][1]}

            importantMessagesInCluster.append(item)
        res.append(importantMessagesInCluster)
    return res
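A self-contained sketch of the grouping and trimming steps above: cluster labels map to message indices, and roughly the top 30% of ranked sentences are kept per cluster. The toy messages, cluster indices, and the stand-in for the textrank output are assumptions for illustration.
import math

messageTexts = ["deploy at 5pm", "lunch?", "deploy is done", "pizza or sushi?"]
clusterIndices = {0: [0, 2], 1: [1, 3]}          # label -> message indices (toy data)

messageClusters = [[messageTexts[i] for i in idxs] for idxs in clusterIndices.values()]

for cluster in messageClusters:
    # stand-in for textrank(document): (score, sentence) pairs, best first
    ranks = [(1.0 / (n + 1), msg) for n, msg in enumerate(cluster)]
    keep = math.ceil(len(ranks) * 0.3)
    top = [{"text": sent} for _, sent in ranks[:keep]]
    print(top)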
Example #3
def summarize(text, topn=None):
    stop_words = load_stopwords('data/stopwords.txt')
    sentences = tokenize_into_sentences(text)

    topn = len(sentences) // 3 if not topn else topn
    print(
        "Generating top {} most relevant sentences out of {} total sentences".
        format(topn, len(sentences)))

    sentences_processed = list(map(process_text, sentences))
    remove_stop = partial(remove_stopwords, stop_words)
    sentences_tokenized = [
        sentence for sentence in map(lambda x: remove_stop(x.split()),
                                     sentences_processed) if sentence
    ]

    matrix, ranks = textrank(sentences_tokenized)
    res = []
    for tup in ranks[:topn]:
        idx = tup[0]
        res.append((sentences[idx], tup[1]))
    return res
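The stopword pass above can be illustrated with a small self-contained sketch; the stopword set and sentences here are toy assumptions.
from functools import partial

def remove_stopwords(stop_words, tokens):
    return [t for t in tokens if t.lower() not in stop_words]

stop_words = {"the", "a", "is"}
sentences_processed = ["the cat is hungry", "a", "dogs bark"]

remove_stop = partial(remove_stopwords, stop_words)
sentences_tokenized = [s for s in map(lambda x: remove_stop(x.split()), sentences_processed) if s]
print(sentences_tokenized)   # [['cat', 'hungry'], ['dogs', 'bark']]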
Example #4
def sentence_weight(sentence, word, nodes):
    # NOTE: the original snippet begins mid-function; this signature (taken from the
    # sentence_weight(...) call in the later example) and the sentenceWeight
    # initialization are reconstructed assumptions.
    sentenceWeight = [0] * len(sentence)
    for i, x in enumerate(word):
        l = len(sentence[i].split(' '))
        #print(l)
        sums = 0
        j = 0
        for y in x:
            sums += nodes[nodes[:, 0] == y, 2][0]
            j += 1
        if j > 0:
            #sentenceWeight[i]=sums/j ### average based
            #sentenceWeight[i]=sums ### score based
            #sentenceWeight[i]=sums/(1+math.log10(j)) ### log based
            sentenceWeight[i] = sums / l  ### sentence average based
            #print(sums)

    sentence = [[i, sentence[i], sentenceWeight[i]]
                for i, x in enumerate(word)]
    #sentence = np.sort(np.array(sentence,dtype=object),axis=-0)
    sentence = sorted(sentence, key=lambda x: -x[2])
    sentence = np.array(sentence)
    return sentence


if __name__ == "__main__":
    path = "data/body/"
    filens = os.listdir(path)
    stops = stopwords.words('english')
    sentence, word = preprocess(path + filens[0])
    nodes = textrank(word)
    generate_summary_bylength(sentence, word, nodes, filens[0])
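A compact sketch of the sentence-averaging idea used above: a sentence's weight is the sum of its words' node scores divided by the sentence length, and sentences are then sorted by weight. The tiny nodes array and sentences are invented for illustration.
import numpy as np

# nodes: rows of [word, <unused>, score], mirroring the lookup nodes[nodes[:, 0] == y, 2]
nodes = np.array([["cats", 0, 0.9], ["sleep", 0, 0.4], ["dogs", 0, 0.7]], dtype=object)
sentences = ["cats sleep all day", "dogs bark"]
words = [["cats", "sleep"], ["dogs"]]            # graph words found in each sentence

weights = []
for sent, ws in zip(sentences, words):
    sums = sum(nodes[nodes[:, 0] == w, 2][0] for w in ws)
    weights.append(sums / len(sent.split(' ')))  # sentence-average variant

ranked = sorted(zip(sentences, weights), key=lambda x: -x[1])
print(ranked)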
Example #5
def summarize(text):
    # SPLIT TO PARAGRAPHS
    pre_paragraphs = text.split('\n')
    paragraphs = []
    for i, p in enumerate(pre_paragraphs):
        if not re.match(r'^\s*$', p) and (i == len(pre_paragraphs) - 1
                                          or re.match(r'^\s*$',
                                                      pre_paragraphs[i + 1])):
            paragraphs.append(p)
    # print(f'Num of paragraphs: {len(paragraphs)}')
    # for i, p in enumerate(paragraphs):
    #     print(f'par#{i+1}: {p}')

    # SPLIT TO SENTENCES
    sentences = separator.separate(text)
    print(f'Num of sentences: {len(sentences)}')
    for i, s in enumerate(sentences):
        print(f'#{i+1}: {s}')

    # TOKENIZE
    stem = False
    if stem:
        tokenized_sentences = [[
            czech_stemmer.cz_stem(word, aggressive=False) for word in sentence
        ] for sentence in tokenize(sentences)]
    else:
        tokenized_sentences = tokenize(sentences)

    # REMOVE STOPWORDS
    tokenized_sentences_without_stopwords = remove_stop_words(
        tokenized_sentences, keep_case=False)
    sentences_without_stopwords_case = remove_stop_words(
        sentences, keep_case=True, is_tokenized=False, return_tokenized=False)
    print('===Sentences without stopwords===')
    for i, s in enumerate(tokenized_sentences_without_stopwords):
        print(f'''#{i+1}: {' '.join(s)}''')

    print('===Sentences without stopwords CASE===')
    for i, s in enumerate(sentences_without_stopwords_case):
        print(f'''#{i+1}: {s}''')

    # POS-TAG
    tagged_sentences = pos_tag(sentences_without_stopwords_case)
    print('=====Tagged_sentences=====')
    for i, s in enumerate(tagged_sentences):
        print(f'''#{i+1}: {s}''')

    # 1. THEMATICITY FEATURE
    thematicity_feature_scores = thematicity_feature(
        tokenized_sentences_without_stopwords)

    # 2. SENTENCE POSITION FEATURE - NOTE: shitty!
    sentence_position_scores = sentence_position_feature(len(sentences))

    # 3. SENTENCE LENGTH FEATURE
    sentence_length_scores = sentence_length_feature(tokenized_sentences)

    # 4. SENTENCE PARAGRAPH POSITION FEATURE

    # 5. PROPER_NOUN FEATURE
    proper_noun_scores = proper_noun_feature(tagged_sentences)

    # 6. NUMERALS FEATURE
    numerals_scores = numerals_feature(tokenized_sentences)

    # 7. NAMED ENTITIES FEATURE - very similar to PROPER_NOUN FEATURE

    # 8. TF_ISF FEATURE - NOTE: TextRank instead of TS_ISF ??? ts_isf_orig is meh
    tf_isf_scores = tf_isf_orig_feature(tokenized_sentences_without_stopwords)

    # 9. CENTROID SIMILARITY FEATURE
    centroid_similarity_scores = centroid_similarity_feature(
        sentences, tf_isf_scores)

    # 10. UPPER-CASE FEATURE (not in the paper)
    upper_case_scores = upper_case_feature(tokenized_sentences)

    # 11. QUOTES FEATURE (not in the paper)
    quotes_scores = quotes_feature(sentences)

    # 12. REFERENCES FEATURE (not in the paper)
    references_scores = references_feature(tokenized_sentences)

    # 13. TEXTRANK FEATURE (not in the paper)
    textrank_scores = textrank.textrank(tokenized_sentences, True,
                                        '4-1-0.0001')

    feature_matrix = []
    feature_matrix.append(thematicity_feature_scores)
    feature_matrix.append(sentence_position_scores)
    feature_matrix.append(sentence_length_scores)
    feature_matrix.append(proper_noun_scores)
    feature_matrix.append(numerals_scores)
    feature_matrix.append(tf_isf_scores)
    feature_matrix.append(centroid_similarity_scores)
    feature_matrix.append(upper_case_scores)

    features = [
        '  thema', 'sen_pos', 'sen_len', '  propn', '    num', ' tf_isf',
        'cen_sim', '  upper'
    ]

    feature_matrix_2 = np.zeros((len(sentences), len(features)))
    for i in range(len(features)):
        for j in range(len(sentences)):
            feature_matrix_2[j][i] = feature_matrix[i][j]

    feature_sum = list(np.sum(feature_matrix_2, axis=1))

    print('=====Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{"{:2d}".format(i + 1)}: {s[:30]}', end='|')
        for f_s in feature_matrix:
            print('{: .4f}'.format(round(f_s[i], 4)), end='|')
        print('{: .4f}'.format(round(feature_sum[i], 4)))

    print('Training rbm...')
    rbm_trained = rbm.test_rbm(dataset=feature_matrix_2,
                               learning_rate=0.1,
                               training_epochs=14,
                               batch_size=5,
                               n_chains=5,
                               n_hidden=len(features))
    # another implementation of rbm, from sklearn
    # rbm2 = BernoulliRBM(n_components=len(features), n_iter=14, batch_size=5, learning_rate=0.1)
    # rbm_trained = rbm2.fit_transform(feature_matrix_2)
    # print(rbm_trained)
    rbm_trained_sums = np.sum(rbm_trained, axis=1)

    print('=====RBM Enhanced Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{"{:2d}".format(i + 1)}: {s[:30]}', end='|')
        for f_s in rbm_trained[i]:
            print('{: .4f}'.format(round(f_s, 4)), end='|')
        print('{: .4f}'.format(round(rbm_trained_sums[i], 4)))

    enhanced_feature_sum = []
    feature_sum = []

    for i in range(len(np.sum(rbm_trained, axis=1))):
        enhanced_feature_sum.append([np.sum(rbm_trained, axis=1)[i], i])
        feature_sum.append([np.sum(feature_matrix_2, axis=1)[i], i])

    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    enhanced_feature_sum.sort(key=lambda x: x[0])
    feature_sum.sort(key=lambda x: -1 * x[0])
    print('=====Sorted=====')
    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    # print('=====The text=====')
    # for x in range(len(sentences)):
    #     print(sentences[x])

    extracted_sentences_rbm = []
    extracted_sentences_rbm.append([sentences[0], 0])
    extracted_sentences_simple = []
    extracted_sentences_simple.append([sentences[0], 0])

    summary_length = max(min(round(len(sentences) / 4), 12),
                         3)  # length between 3-12 sentences
    for x in range(summary_length):
        if enhanced_feature_sum[x][1] != 0:
            extracted_sentences_rbm.append([
                sentences[enhanced_feature_sum[x][1]],
                enhanced_feature_sum[x][1]
            ])
        if feature_sum[x][1] != 0:
            extracted_sentences_simple.append(
                [sentences[feature_sum[x][1]], feature_sum[x][1]])

    extracted_sentences_rbm.sort(key=lambda x: x[1])
    extracted_sentences_simple.sort(key=lambda x: x[1])

    final_text_rbm = ''
    for i in range(len(extracted_sentences_rbm)):
        final_text_rbm += extracted_sentences_rbm[i][0] + '\n'
    final_text_simple = ''
    for i in range(len(extracted_sentences_simple)):
        final_text_simple += extracted_sentences_simple[i][0] + '\n'

    print('=====Extracted Final Text RBM=====')
    print(final_text_rbm)
    print()
    print('=====Extracted Final Text simple=====')
    print(final_text_simple)

    return final_text_rbm
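The feature-combination step lends itself to a short sketch: per-feature score lists are stacked into an (n_sentences, n_features) matrix and sentences are ranked by their row sums. The scores below are made up and the RBM re-weighting is omitted.
import numpy as np

thematicity = [0.2, 0.8, 0.5]
position    = [1.0, 0.5, 0.1]
length      = [0.3, 0.9, 0.6]

feature_matrix = np.column_stack([thematicity, position, length])  # shape: (3 sentences, 3 features)
feature_sum = feature_matrix.sum(axis=1)

# sentence indices sorted by combined score, best first
order = sorted(range(len(feature_sum)), key=lambda i: -feature_sum[i])
print(feature_sum, order)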
Example #6
def download_articles_from_url(api_url, download_directory):
	# Use the API URL to get a list of articles
	api_req = requests.get(api_url)
	article_list = api_req.text.split('\n')

	# Shuffle the article list to avoid being blocked
	random.shuffle(article_list)
	
	# Creates a Simple Summarizer for summarizing articles
	ss = summarize.SimpleSummarizer()

	for articleURL in article_list:
		if (articles.find_one({ "url": articleURL }) == None):
			#print 'Trying: ' + articleURL.encode('ascii', 'ignore')

			if (validate_url(articleURL) is False):
				continue
		
			# For each URL, assign its md5 as a unique identifier
			m = hashlib.md5()
			m.update(articleURL)
			code = m.hexdigest()
			first_level = code[0:2]
			second_level = code[2:4]
					
			# This code also becomes the filename for the full file path
			articleFileDirectory = download_directory + first_level + "/" + second_level + "/"
			articleFilePath = articleFileDirectory + code

			# TODO: Parse title from article

			# Download full article and use full-text (if available) for keyword extraction
			fullArticleText = download_article_file(articleURL, articleFileDirectory, code)
			
			if (fullArticleText is not None):
				keyword_set = textrank(fullArticleText) 
				#articleFeatures = get_article_features(fullArticleText, articleURL)
				articleFeatures = None
				guessed_date = guess_date(fullArticleText)
				summaryText = ss.summarize(fullArticleText,5) # 2nd input is number of lines in summary
			else:
				guessed_date = ""
				# TODO: Fix
				print "ERROR: Full article text not available"
				#keyword_set = textrank(summaryText)
				#articleFeatures = get_article_features(summaryText, articleURL)
				articleFeatures = None
				continue
				
			keywords = list(keyword_set)

			print "Downloaded: " + articleURL.encode('ascii', 'ignore')

			processed_date = datetime.now().strftime("%Y-%m-%d")
			if (guessed_date is not None):
				publish_date = guessed_date
			else:
				publish_date = processed_date			

			article = [{
				#"q": query, # TODO: Fix
				"_id": code,
				"c": code,
				"f": articleFeatures,
				"pubd": publish_date,
				"procd": processed_date,
				"url": articleURL,
				#"t": articleTitle, # TODO: Fix
				"abs": summaryText, # TODO: Fix
				#"sr": articleSource, # TODO: Fix
				"k": keywords,
				"fp": articleFilePath,
				"m": None
				}]
			
			# Write article to MongoDB collection
			try:
				article_id = articles.insert(article)
			except MongoException.DuplicateKey:
				print "Duplicate key: " + code
	
			#print "Inserted into articles: " + articleTitle.encode('ascii', 'ignore')
			title = '' # TODO: Fix
			abstract = '' # TODO: Fix
			json_str = mk_es_json(code, fullArticleText, articleURL, title, abstract, publish_date) 
			#print json_str
			index = 'article'
			index_type = 'text'
			es_url = 'http://localhost:9200'
			r = post_to_elastic_search(es_url, index, index_type, code, json_str)
			print r
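The md5-based path sharding used above (the first two hex digits, then the next two, become nested directories) fits in a few lines. This sketch is Python 3, while the example itself is Python 2, and the base directory is an assumption.
import hashlib
import os

def article_path(article_url, download_directory="downloads/"):
    code = hashlib.md5(article_url.encode("utf-8")).hexdigest()   # stable id for the URL
    first_level, second_level = code[0:2], code[2:4]              # two-level fan-out keeps directories small
    directory = os.path.join(download_directory, first_level, second_level)
    return os.path.join(directory, code)

print(article_path("http://example.com/some-article"))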
Example #7
def parse_webpages(php_directory, term, option, excludes):
	api_base_url = AAFTER_URL
	api_args = "&wt=xml&fl=*,score"
	file_name = (term + '-' + option).replace(" ", "_").replace("/", "_")
	webpageFileDirectory = php_directory + file_name + "--webpages" + "/"
	url_term = urllib2.quote('"' + term + '" ' + option + ' ' + excludes)

	try:
		api_url = api_base_url + url_term + api_args
		#print "Downloading XML from " + api_url
		xml_response = urllib2.urlopen(api_url)

	except urllib2.HTTPError:
		print "ERROR: HTTPError at " + api_url.encode('ascii', 'ignore')
		xml_response = ""

	except urllib2.URLError:
		print "ERROR: URLError at " + api_url.encode('ascii', 'ignore')
		xml_response = ""

	# Parse the XML responses
	xml_tree = etree.parse(xml_response)

	query = xml_tree.xpath("//response/lst[@name='responseHeader']/lst[@name='params']/str[@name='q']/text()")
	num_result = xml_tree.xpath("//response/result")[0].attrib['numFound']

	# Each website result will be stored as a list
	titles = xml_tree.xpath("//response/result/doc/str[@name='name']/text()")
	urls = xml_tree.xpath("//response/result/doc/str[@name='url_s']/text()")
	scores = xml_tree.xpath("//response/result/doc/float[@name='score']/text()")

	# Count the number of urls passed in XML and use that as the basis for how many results are on the page
	url_count = xml_tree.xpath("count(//response/result/doc/str[@name='url_s'])")

	meta_descriptions, meta_keywords, summaries = [], [], []  # separate lists (chained assignment would alias a single list)

	# Add summary and meta information from Subhankar's API
	# Use loop to avoid IndexError if field does not exist
	for i in range(len(urls)):
		try:
			md = xml_tree.xpath("//response/result/doc/arr[@name='features']/str[1]/text()")[i]
			meta_descriptions.append(md)
		except IndexError:
			meta_descriptions.append("")

		try:
			mk = xml_tree.xpath("//response/result/doc/arr[@name='features']/str[2]/text()")[i]
			meta_keywords.append(mk)
		except IndexError:
			meta_keywords.append("")

		try:
			s = xml_tree.xpath("//response/result/doc/arr[@name='features']/str[3]/text()")[i]
			summaries.append(s)
		except IndexError:
			summaries.append("")
		

	for i in range(len(urls)):
		# Check to see if webpage has already been inserted. If it has, don't do anything
		if (webpages.find_one({ "url": urls[i] }) == None):
			
			fullWebpageText = None
			#code = base64.urlsafe_b64encode(os.urandom(18))
			m = hashlib.md5()
			m.update(urls[i])
			code = m.hexdigest()
			webpageFilePath = webpageFileDirectory + code

			# Download full webpage and use full-text (if available) for keyword extraction
		
			# If a directory for files doesn't exist, create it
			dir = os.path.dirname(webpageFileDirectory)

			if not os.path.isdir(dir):
				#print "Created directory: " + dir
				os.makedirs(dir)
			
			try:	
				#fullWebpage = urllib2.urlopen(urls[i])
				#print "Opening website URL: " + str(urls[i])
				#fullWebpageHTML = fullWebpage.read()


				# Use boilerpipe to clean text
				extractor = Extractor(extractor='ArticleExtractor', url=urls[i])
				#fullWebpageHTML = extractor.getHTML()
				fullWebpageText = extractor.getText()

				# Use lxml's HTML cleaner to remove markup
				#htmltree = lxml.html.fromstring(fullWebpageText)		
				#cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=True)
				#cleaned_tree = cleaner.clean_html(htmltree)
				#fullWebpageText = cleaned_tree.text_content()

				outfile = open(webpageFilePath, 'w+')
				outfile.write(fullWebpageText.encode('ascii', 'ignore'))
				outfile.close()

			except urllib2.HTTPError:
				print "HTTPError: Webpage file download skipped: " + urls[i]
				return None

			except urllib2.URLError:
				print "URLError: Webpage file download skipped: " + urls[i]
				return None

			except UnicodeDecodeError:
				print "UnicodeDecodeError: Webpage file download skipped: " + urls[i]
				return None

			except lxml.etree.ParserError:
				print "lxml.etree.ParserError: Webpage file download skipped: " + urls[i]
				return None

			except LookupError:
				print "LookupError: Webpage file download skipped: " + urls[i]
				return None

			if (fullWebpageText is not None):
				keyword_set = textrank(fullWebpageText) 
			else:
				keyword_set = textrank(summaries[i])
		
			keywords = list(keyword_set)
		
			webpage = [{
			"q": query,
			"nr": num_result,
			"url": urls[i],
			"t": titles[i],
			"c": code,
			"md": meta_descriptions[i],
			"mk": meta_keywords[i],
			"abs": summaries[i],
			"s": scores[i],
			"k": keywords,
			"f": webpageFilePath
			}]

			webpage_id = webpages.insert(webpage)
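One detail worth noting: parse_webpages initializes its three result lists with tuple unpacking rather than chained assignment, because `a = b = c = []` binds every name to the same list object, so each append shows up in all of them. A quick Python 3 demonstration:
a = b = []          # both names point at one list object
a.append("x")
print(b)            # ['x'] -- not empty

a, b = [], []       # separate lists, as intended
a.append("x")
print(b)            # []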
Example #8
def parse_news_articles(php_directory, download_directory, file_name, query):
	# Note: Assumes that path is stored as <query>.php/
	inpath = php_directory + file_name + "/"
	file_list = [ f for f in listdir(inpath) if isfile(join(inpath,f)) ]

	# For each file, get the article Titles and URLs
	for file in file_list:
		# Clear out any variables from last file
		articleURL = articleTitle = articleSource = summaryText = keywords = score = code = ""
	
		try:	
			intext = open(inpath + file, 'r').read()
			html = etree.HTML(intext)
		except lxml.etree.XMLSyntaxError:
			print "ERROR: XMLSyntaxError when reading " + inpath + file
			break

		for element in html.iter():
			if (element.tag == "p" and element.text == "News Result"):
				# Do nothing
				pass
	
			elif (element.tag == "a"):
				articleURL = element.attrib["href"]
				articleTitle = element.text

			elif (element.tag == "br"):
				if (element.tail != None):
					summaryText = element.tail

			elif (element.tag == "strong"):
				if (element.tail != "\n"):
					articleSource = element.tail

			elif (element.tag == "p"):
				# Check to see if article already exists using URL. If it exists, don't do anything
				if (articles.find_one({ "url": articleURL }) is not None):
					print "INFO: Duplicate article found"
				else:
					print "Processing: " + articleURL			
	
					# For each URL, assign its md5 as a unique identifier
					#code = base64.urlsafe_b64encode(os.urandom(18))
					m = hashlib.md5()
					m.update(articleURL)
					code = m.hexdigest()
					first_level = code[0:2]
					second_level = code[2:4]
					
					# This code also becomes the filename for the full file path
					#articleFileDirectory = php_directory + file + "--news/"
					articleFileDirectory = download_directory + first_level + "/" + second_level + "/"
					articleFilePath = articleFileDirectory + code

					# Download full article and use full-text (if available) for keyword extraction
					fullArticleText = download_article_file(articleURL, articleFileDirectory, code)
			
					if (fullArticleText is not None):
						keyword_set = textrank(fullArticleText) 
						#articleFeatures = get_article_features(fullArticleText, articleURL)
						articleFeatures = None
						guessed_date = guess_date(fullArticleText)
					else:
						keyword_set = textrank(summaryText)
						#articleFeatures = get_article_features(summaryText, articleURL)
						articleFeatures = None
						guessed_date = guess_date(summaryText)
				
					keywords = list(keyword_set)
                        		
					processed_date = datetime.now().strftime("%Y-%m-%d")
					if (guessed_date is not None):
						publish_date = guessed_date
					else:
						publish_date = processed_date
	
					article = [{
					"q": query,
					"c": code,
					"f": articleFeatures,
					"pubd": publish_date,
					"procd": processed_date,
					"url": articleURL,
					"t": articleTitle,
					"abs": summaryText,
					"sr": articleSource,
					"k": keywords,
					"fp": articleFilePath,
					"m": None
					}]
		
					# Write article to MongoDB collection
					try:
						article_id = articles.insert(article)
					except MongoException.DuplicateKey:
						print "Duplicate key: " + code

					#print "Inserted into articles: " + articleTitle.encode('ascii', 'ignore')

					if (fullArticleText is None):
						fullArticleText = summaryText

					# Insert into ElasticSearch
					json_str = mk_es_json(code, fullArticleText, articleURL, articleTitle, summaryText, publish_date)
					#print json_str
					index = 'article'
					index_type = 'text'
					es_url = 'http://localhost:9200'
					r = post_to_elastic_search(es_url, index, index_type, code, json_str)
					print r
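The tag-dispatch loop above walks every element of the parsed PHP page, picking up the link, summary, and source as it goes. A stripped-down Python 3 sketch of the same lxml traversal over a toy snippet (the HTML string is invented):
from lxml import etree

snippet = '<div><a href="http://example.com/a1">Title one</a><br/>Summary text here.<p/></div>'
html = etree.HTML(snippet)

article_url = article_title = summary_text = ""
for element in html.iter():
    if element.tag == "a":
        article_url = element.attrib.get("href", "")
        article_title = element.text or ""
    elif element.tag == "br" and element.tail:
        summary_text = element.tail
    elif element.tag == "p":
        print(article_url, article_title, summary_text)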
Example #9
def results(algo=None):
    print("algorithm:", algo if algo in ALGOS else None)

    samples = 500  # up to 2000
    print("sample size:", samples)

    keys = glob.glob('Inspec/keys/*.key')
    res = [0] * samples

    if algo == 'textrank':
        # load a spaCy model, depending on language, scale, etc.
        nlp = spacy.load("en_core_web_sm")
        # add PyTextRank to the spaCy pipeline
        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    elif algo == 'sentiment_pos' or algo == 'sentiment_pos_tfidf':
        sid = SentimentIntensityAnalyzer()

    for i, key in enumerate(keys[:samples]):
        # get actual keywords
        key_file = open(key)
        whitespace = re.compile(r"\s+")
        # remove whitespace and convert to lowercase
        actual = [
            whitespace.sub(" ", w).strip().lower()
            for w in key_file.readlines()
        ]

        # get text document corresponding to current key
        num = re.findall(r'\d+', key)[0]
        doc = 'Inspec/docsutf8/{}.txt'.format(num)

        # get extracted keywords
        if algo == 'rake':
            extracted = rake(doc)
        elif algo == 'textrank':
            extracted = textrank(doc, nlp)
        elif algo == 'window':
            extracted = window(doc)
        elif algo == 'window_w_tf_idf':
            extracted = window_w_tf_idf(doc)
        elif algo == 'tf_idf':
            extracted = tf_idf(doc)
        elif algo == 'sentiment_pos':
            extracted = sentiment_pos(doc, sid)
        elif algo == 'sentiment_pos_tfidf':
            extracted = sentiment_pos_tfidf(doc, sid)
        else:
            extracted = extract(doc)

        # calculate results
        tp = len(set(extracted).intersection(
            set(actual)))  # number of true positives
        precision = tp / len(extracted)
        recall = tp / len(actual)
        f_measure = (2 * precision * recall) / (
            precision + recall) if precision + recall else 0
        res[i] = (precision, recall, f_measure)

    # calculate average results
    avg_res = [sum(x) / len(x) for x in zip(*res)]
    print("precision: {}, recall: {}, F-measure: {}".format(*avg_res))
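The per-document scoring in results() reduces to set overlap. A self-contained sketch with toy keyword sets (the values are invented):
actual = {"text summarization", "graph ranking", "keywords"}
extracted = {"graph ranking", "keywords", "noise term"}

tp = len(extracted & actual)                      # true positives
precision = tp / len(extracted)
recall = tp / len(actual)
f_measure = (2 * precision * recall) / (precision + recall) if precision + recall else 0
print(precision, recall, f_measure)               # 0.666..., 0.666..., 0.666...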
Example #10
def linkedin_summary(user):
	text = ''
	for line in user['ln']:
		text += ' ' + line
	return textrank(text)
Example #11
def summarize(text):
    # SPLIT TO PARAGRAPHS
    pre_paragraphs = text.split('\n')
    paragraphs = []
    for i, p in enumerate(pre_paragraphs):
        if not re.match(r'^\s*$', p) and (i == len(pre_paragraphs) - 1
                                          or re.match(r'^\s*$',
                                                      pre_paragraphs[i + 1])):
            paragraphs.append(p)
    # print(f'Num of paragraphs: {len(paragraphs)}')
    # for i, p in enumerate(paragraphs):
    #     print(f'par#{i+1}: {p}')

    # SPLIT TO SENTENCES
    sentences = separator.separate(text)
    print(f'Num of sentences: {len(sentences)}')
    for i, s in enumerate(sentences):
        print(f'#{i+1}: {s}')

    # TOKENIZE
    stem = False
    if stem:
        tokenized_sentences = [[
            czech_stemmer.cz_stem(word, aggressive=True) for word in sentence
        ] for sentence in tokenize(sentences)]
    else:
        tokenized_sentences = tokenize(sentences)

    # REMOVE STOPWORDS
    tokenized_sentences_without_stopwords = remove_stop_words(
        tokenized_sentences, keep_case=False)
    sentences_without_stopwords_case = remove_stop_words(
        sentences, keep_case=True, is_tokenized=False, return_tokenized=False)
    print('===Sentences without stopwords===')
    for i, s in enumerate(tokenized_sentences_without_stopwords):
        print(f'''#{i+1}: {' '.join(s)}''')

    print('===Sentences without stopwords CASE===')
    for i, s in enumerate(sentences_without_stopwords_case):
        print(f'''#{i+1}: {s}''')

    # POS-TAG
    tagged_sentences = pos_tag(sentences_without_stopwords_case)
    print('=====Tagged_sentences=====')
    for i, s in enumerate(tagged_sentences):
        print(f'''#{i+1}: {s}''')

    counter = 0
    summary_length = max(min(round(len(sentences) / 4), 15),
                         3)  # length between 3-15 sentences
    ranked_sentence_indexes = textrank.textrank(tokenized_sentences, True,
                                                '3-1-0.0001')
    print(f'ranked_sentence_indexes: {ranked_sentence_indexes}')
    # summary = ''
    # # add 1st sentence always
    # summary += f'{sentences[0]}\n'
    # counter += 1
    # ranked_sentence_indexes.remove(0)
    # # # add also 2nd sentence if it is in top 50%
    # if 1 in ranked_sentence_indexes[:len(ranked_sentence_indexes) // 2]:
    #     summary += f'{sentences[1]}\n'
    #     counter += 1
    #     ranked_sentence_indexes.remove(1)
    # for sentence_index in sorted(ranked_sentence_indexes[:summary_length - counter]):
    #     if counter == summary_length:
    #         break
    #     summary += f'{sentences[sentence_index]}\n'
    #     counter += 1
    # summary += f'::::: Sentences in original: {len(sentences)}. Sentences in summary: {summary_length}. :::::'
    # add 1st sentence always
    summary = []
    summary.append(sentences[0])
    counter += 1
    ranked_sentence_indexes.remove(0)
    # # add also 2nd sentence if it is in top 50%
    if 1 in ranked_sentence_indexes[:len(ranked_sentence_indexes) // 2]:
        summary.append(sentences[1])
        counter += 1
        ranked_sentence_indexes.remove(1)
    for sentence_index in sorted(ranked_sentence_indexes[:summary_length -
                                                         counter]):
        if counter == summary_length:
            break
        summary.append(sentences[sentence_index])
        counter += 1
    return summary
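The selection policy above (always keep sentence 0, keep sentence 1 only if it ranks in the top half, then fill up to the length cap in document order) can be sketched with toy ranks; the sentences and ranking order below are assumptions.
sentences = ["Intro.", "Second.", "Key fact.", "Filler.", "Another key fact.", "End."]
ranked = [2, 4, 1, 0, 5, 3]                       # toy TextRank order, best first

summary_length = max(min(round(len(sentences) / 4), 15), 3)   # clamp to 3-15 sentences
summary, counter = [sentences[0]], 1
ranked.remove(0)
if 1 in ranked[:len(ranked) // 2]:                # sentence 1 only if it is in the top half
    summary.append(sentences[1])
    counter += 1
    ranked.remove(1)
for idx in sorted(ranked[:summary_length - counter]):
    if counter == summary_length:
        break
    summary.append(sentences[idx])
    counter += 1
print(summary)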
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-q",
                        "--query",
                        nargs='?',
                        default='Airbus Subsidies',
                        type=str,
                        help='query')
    args = parser.parse_args()

    corpus_path = 'apnews_sen/apnews_sen.dat'

    with open(corpus_path, 'r') as corpus:
        combined_document = corpus.read()
        corpus.seek(0)
        documents = corpus.readlines()

    combined_document = combined_document[:1000]
    documents = documents[:1695]

    print(len(documents))
    print(len(set(documents)))

    N_docs = len(documents)

    # run BM25
    searcher = Searcher('apnews-config.toml')
    search_results = searcher.search(args.query, num_results=N_docs)

    dupe_dict = dict()
    for (doc_id, _) in search_results:
        if doc_id in dupe_dict:
            print('oh no')
            return

        dupe_dict[doc_id] = True

    combined_document = searcher.get_stringified_list(search_results)

    # run textrank from law__--less
    tokenized_sentences = tokenizer.remove_stopwords_and_clean(
        combined_document)
    # M_adj = graph_builder.create_sentence_adj_matrix(tokenized_sentences).astype(float)

    # word_model = ModelGen.train_model(tokenized_sentences)
    word_model = gensim.models.doc2vec.Doc2Vec.load(
        'model/apnews_sen_model.model')
    graph_model = DocumentGraph.DocumentGraph(tokenized_sentences, word_model)
    M_adj = graph_model.similarity_matrix

    M_adj = M_adj / np.sum(M_adj, axis=1)
    eigen_vectors = np.array(textrank.textrank(M_adj, d=.85))
    scores = textrank.get_sentence_scores(tokenized_sentences, eigen_vectors)

    assert (len(combined_document) == len(scores))

    print(scores)

    all_scores = np.ndarray((scores.shape[0], 2))
    all_scores[:, 1] = scores
    all_scores[:, 0] = [result[1] for result in search_results]
    z_scores = (all_scores - np.mean(all_scores, axis=0)) / np.std(all_scores,
                                                                   axis=0)
    averaged_scores = np.mean(z_scores, axis=1)

    # https://stackoverflow.com/questions/6618515/sorting-list-based-on-values-from-another-list
    sorted_docs = [
        doc
        for (avg_score, doc
             ) in sorted(zip(averaged_scores, combined_document), reverse=True)
    ]

    print(sorted_docs)
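The score fusion at the end standardizes the BM25 and TextRank columns separately and averages the z-scores. A toy numpy sketch (the score values are invented):
import numpy as np

bm25_scores = np.array([12.0, 7.5, 9.1])
textrank_scores = np.array([0.31, 0.42, 0.27])

all_scores = np.column_stack([bm25_scores, textrank_scores])
z_scores = (all_scores - all_scores.mean(axis=0)) / all_scores.std(axis=0)
averaged = z_scores.mean(axis=1)                  # one combined score per document

order = np.argsort(-averaged)                     # best document first
print(averaged, order)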
Example #13
else:
    os.makedirs(outDir)

path = "data/body/"
filens = os.listdir(path)
stops = stopwords.words('english')
for idf, filen in enumerate(filens):
    '''
    idf=0
    filen=filens[idf]
    '''
    print(idf)
    fpath = path + filen
    sentence, word, textlength, posword = preprocess(fpath, stops)
    #nodes = textrank(word)
    nodes = textrank(posword)
    #sentences = sentence_weight(sentence,word,nodes)
    sentences = sentence_weight(sentence, posword, nodes)
    #generate_summary_bylength(sentences,word,nodes,filen, outDir,l)
    generate_summary_bycompression(sentences, filen, outDir, l, textlength)
    #generate_summary_bylength_mmr(sentences,word,nodes,filen, outDir,l,0.7)
    #generate_summary_bycompression_mmr(sentences,word,nodes,filen, outDir,l,textlength,0.9)

#dataset 2
# l = [70,80,90,95,98]
# #l = [50,100,150,200]
# outDir = 'data2/data2-sys-summary-sen_avg-compression-300/'
# if os.path.exists(outDir):
#     shutil.rmtree(outDir)
#     os.makedirs(outDir)
# else:
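The surrounding setup (visible in the commented dataset-2 block) wipes and recreates the output directory before each run. A minimal, equivalent sketch of that pattern with a placeholder path:
import os
import shutil

outDir = 'data/sys-summary/'      # placeholder path, not from the original
if os.path.exists(outDir):
    shutil.rmtree(outDir)         # start from a clean directory
os.makedirs(outDir)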