def data_pre_train(data_path='data/data.json', train_path='data/train.txt'):
    """Pre-process articles into pre-training text.

    from=0   # starting article id
    limit=10 # number of articles returned
    >>> data_pre_train(from=0, limit=10)

    Marker tokens:
    [unused5] marks keywords
    [unused6] marks the title
    [unused7] marks the preceding title
    [unused8] marks the body text
    """
    LANGUAGE = "chinese"
    SENTENCES_COUNT = 10
    article_max_len = 500

    ttext = tkitText.Text()
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    ie = tkitNlp.TripleIE(model_path="/mnt/data/dev/model/ltp/ltp_data_v3.4.0")
    f1 = open(train_path, 'w')
    articles = []
    # TF-IDF keyword extraction interface
    tfidf = analyse.extract_tags
    # TextRank keyword extraction interface
    textrank = analyse.textrank
    # (disabled in the original: sumy sentence summaries and TripleIE /
    # tkit keyword extraction over title+content)
    with open(data_path, 'r', encoding='utf-8') as data:
        for art_i, it in tqdm(enumerate(data)):
            item = json.loads(it[:-1])
            segs_pre = []
            segs_end = []
            # Extract keywords with TextRank
            keywords = textrank(item['title'] + '\n' + item['content'],
                                topK=10, withWeight=False,
                                allowPOS=('ns', 'n', 'vn', 'v'))
            segs_pre.append(' [KW] ' + ','.join(keywords) + ' [/KW] ')
            try:
                segs_pre.append(' [TT] ' + item['title'] + " [/TT] ")
                segs_end.append(' [PT] ' + item['title'] + " [/PT] ")
            except KeyError:
                pass
            segs = sentence_seg(" [CLS] " + item['content'] + " [END] ")
            article = "".join(segs_pre + segs + segs_end)
            one = []
            for i in range(len(article) // article_max_len + 1):
                # slice the article into chunks of at most article_max_len chars
                one.append(article[i * article_max_len:(i + 1) * article_max_len])
            articles.append("\n".join(one))
            if art_i % 100 == 0:
                print('arti', art_i)
                # flush a batch; the "\n\n" separator was missing in the
                # original write, fusing consecutive batches together
                f1.write("\n\n".join(articles) + "\n\n")
                articles = []
            del segs
    f1.write("\n\n".join(articles))
    f1.close()
    gc.collect()
    del stemmer
    del summarizer
    del ie
    gc.collect()
    return
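# A minimal sketch (invented toy values) of the training-line format that
# data_pre_train above emits: keywords wrapped in [KW]..[/KW], the title in
# [TT]..[/TT], the body between [CLS] and [END], and the title repeated at
# the end in [PT]..[/PT].
keywords = ["keyword1", "keyword2"]           # placeholder keywords
title = "Sample title"                        # placeholder title
body = " [CLS] Sample body text. [END] "      # placeholder body
article = (" [KW] " + ",".join(keywords) + " [/KW] "
           + " [TT] " + title + " [/TT] "
           + body
           + " [PT] " + title + " [/PT] ")
print(article)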
def scrape(request):
    if request.method == 'POST':
        y = json.loads(request.body)
        url = y.get("url", None)
        print(url)
        driver = webdriver.PhantomJS(
            executable_path='../phantomjs/bin/phantomjs')
        driver.get(url)
        el = driver.find_element_by_tag_name("body")
        textContent = el.text
        driver.close()

        imageSourceUrls = list(imageDB.objects.values_list('sourceUrl', flat=True))
        textSourceUrls = list(textDB.objects.values_list('sourceUrl', flat=True))
        summary = textContent

        if url not in textSourceUrls or url not in imageSourceUrls:
            LANGUAGE = "english"
            SENTENCES_COUNT = 10
            # parser = PlaintextParser.from_string(textContent, Tokenizer("english"))
            # summarizer = LuhnSummarizer()
            # summary = ''
            # for sentence in summarizer(parser.document, SENTENCES_COUNT):
            #     summary = summary + str(sentence)
            # print("Summary ", summary)
            parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summaryText = ""
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                summaryText = summaryText + str(sentence)

            r = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            x = urlopen(r)
            codebase = BeautifulSoup(x, 'html.parser')
            title = codebase.title.string
            if not title:
                domain = urlparse(url)
                title = domain.hostname
            print(title)
            iconLink = codebase.find("link", rel="shortcut icon")
            if not iconLink:
                iconLink = ' '
            else:
                iconLink = urljoin(url, iconLink.get('href'))
            textDB.objects.create(summaryText=summaryText, summary=summary,
                                  dateTime=timezone.now(), sourceUrl=url,
                                  title=title, icon=iconLink)
            scraper = Scraper()
            scraper.scrape(url)
        else:
            textDB.objects.filter(sourceUrl=url).delete()
            imageDB.objects.filter(sourceUrl=url).delete()
            print("DELETED")
            LANGUAGE = "english"
            SENTENCES_COUNT = 10
            parser = PlaintextParser.from_string(textContent, Tokenizer("english"))
            summarizer = LuhnSummarizer()
            summary = ''
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                summary = summary + str(sentence)
            parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summaryText = ""
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                summaryText = summaryText + str(sentence)
            r = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            x = urlopen(r)
            codebase = BeautifulSoup(x, 'html.parser')
            title = codebase.title.string
            iconLink = codebase.find("link", rel="shortcut icon")
            if not iconLink:
                iconLink = ' '
            else:
                iconLink = urljoin(url, iconLink.get('href'))
            textDB.objects.create(summaryText=summaryText, summary=summary,
                                  dateTime=timezone.now(), sourceUrl=url,
                                  title=title, icon=iconLink)
            scraper = Scraper()
            scraper.scrape(url)
    return HttpResponse("Successful")
def __summarize(self, parser):
    summarizer = LsaSummarizer(Stemmer(self.__language))
    summarizer.stop_words = get_stop_words(self.__language)
    final_sentences = summarizer(parser.document, self.__sentences_count)
    return self.__join_sentences(final_sentences)
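# For context, a standalone sketch equivalent to __summarize above, with the
# private attributes replaced by plain parameters and __join_sentences assumed
# to concatenate the selected sentences; the sumy calls themselves are the
# library's documented API.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

def summarize_text(text, language='english', sentences_count=5):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LsaSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    return ' '.join(str(s) for s in summarizer(parser.document, sentences_count))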
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 10
# nltk.download()

if __name__ == "__main__":
    url = "http://money.cnn.com/2015/12/01/investing/premarket-stocks-trading/index.html?iid=hp-stack-dom"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
def crawl():
    print(datetime.now())
    cursor = conn.cursor()
    cursor.execute("SET NAMES utf8mb4")
    cursor.execute(
        'select id, name, feedUrl, lang, form, content_rss from sources where mod(id, 30)=mod(%s, 30)',
        (datetime.now().minute, ))
    sources = cursor.fetchall()
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    for source in sources:
        print(source[0])
        source = {
            'id': source[0],
            'name': source[1],
            'feedUrl': source[2].replace("188.131.178.76", "127.0.0.1"),
            'lang': source[3],
            'form': source[4],
            'content_rss': source[5]
        }
        print(source['name'])
        LANGUAGE = 'chinese'
        if source['lang'] == 2:
            LANGUAGE = 'english'
        items = feedparser.parse(source['feedUrl'])['items']
        for item in items:
            cursor.execute('select 1 from entries where link = %s limit 1',
                           (item['link'], ))
            results = cursor.fetchall()
            if not results:
                entry = {
                    'title': item['title'],
                    'link': item['link'],
                    'source_id': source['id'],
                    'source_name': source['name'],
                    'time': datetime.fromtimestamp(mktime(item['published_parsed'])) + timedelta(hours=TZ_DELTA),
                    'crawl_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'photo': '',
                    'lang': source['lang'],
                    'author': item['author'],
                    'description': '',
                    'digest': '',
                    'content': ''
                }
                if 'content' in item:
                    entry['content'] = item['content'][0]['value']
                if entry['content'] == '':
                    entry['content'] = item['summary']
                if entry['content'] != '':
                    entry['photo'] = getImg(entry['content'])
                if source['form'] == 1:
                    # the two original branches differed only in how the
                    # parser was built, so they are merged here
                    if source['content_rss'] == 1 and entry['content'] != '':
                        parser = HtmlParser.from_string(entry['content'], "", Tokenizer(LANGUAGE))
                    else:
                        parser = HtmlParser.from_url(entry['link'], Tokenizer(LANGUAGE))
                    stemmer = Stemmer(LANGUAGE)
                    summarizer = Summarizer(stemmer)
                    summarizer.stop_words = get_stop_words(LANGUAGE)
                    for sentence in summarizer(parser.document, SENTENCES_COUNT):
                        entry['digest'] += str(sentence)
                        if len(entry['digest']) >= 500:
                            break
                    entry['digest'] = entry['digest'][0:500]
                cursor.execute(add_entry, entry)
                conn.commit()
    elapsed = time.perf_counter() - start
    print('time used: ' + str(elapsed))
    # Close cursor and connection
    cursor.close()
import pickle

from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

summarizer = LsaSummarizer(Stemmer('english'))
summarizer.stop_words = get_stop_words('english')


def lsa_summarizer(parser, sent_count):
    lsa_s = summarizer(parser.document, sent_count)
    summary_str = ""
    summary = []
    for sent in lsa_s:
        summary_str += str(sent)
        summary.append(summary_str)
    len_summary = 0
    for sent in summary:
        len_summary += len(sent.split())
    return summary, len_summary


def lsa_summaries(filepath, word_count):
    with open(filepath, 'rb') as filehandle:
        texts_str = pickle.load(filehandle)
    lsa_summary = []
    for t in texts_str:
        parser = PlaintextParser(t, Tokenizer('english'))
        for i in range(len(t.split('.'))):
            summary, len_summary = lsa_summarizer(parser, i)
            # The original snippet is truncated here; a plausible completion
            # (assumption): grow the summary sentence-by-sentence until it
            # reaches the requested word budget, then keep it.
            if len_summary >= word_count:
                lsa_summary.append(summary)
                break
    return lsa_summary
def __init__(self, input_texts: List[str]):  # was annotated `str`, but it is iterated as a list of texts
    nltk.download('punkt')
    self.nlp = spacy.load('en_core_web_lg')
    self.summarizer = LsaSummarizer(Stemmer('english'))
    self.summarizer.stop_words = get_stop_words('english')
    self.cleaner = CleaningProcessor()
    self.synonyms: Dict[str, Optional[List[str]]] = {}
    if path.isfile('src/syns.yaml'):
        with open('src/syns.yaml', 'r') as f:
            self.synonyms = yaml.safe_load(f)
        if self.synonyms is None:
            self.synonyms = {}
    self.patterns: Dict[str, str] = OrderedDict()
    self.rev_patterns: Dict[str, str] = OrderedDict()
    with open('src/spreadr_shreddr/data.yaml', 'r') as f:
        data = yaml.safe_load(f)
    self.patterns.update(data['shorten'])
    self.patterns.update(data['expand'])
    data['filler'].extend(
        pycorpora.get_file('humans', 'prefixes')['prefixes'])
    self.patterns.update({k: '' for k in data['filler']})
    for obj in pycorpora.get_file('words', 'compounds')['compounds']:
        key = '{} {}'.format(obj['firstWord'], obj['secondWord'])
        if key not in self.patterns:
            self.patterns[key] = obj['compoundWord']
    self.patterns.update(
        {k.capitalize(): v.capitalize() for k, v in self.patterns.items()})
    self.brits = data['brit_am']
    self.murcans = {v: k for k, v in self.brits.items()}
    changed = False
    api = Datamuse()
    for text in input_texts:
        text >>= self.cleaner
        for sent in sent_tokenize(text):
            for index, word in enumerate(self.nlp(sent)):
                orth = word.orth_.lower()
                key = self.separator.join((orth, word.tag_))
                if key not in self.synonyms:
                    changed = True
                    syns: List[str] = []
                    if (word.pos_ in UNIVERSAL_TO_DATAMUSE
                            and len(wn.synsets(orth)) <= 1):
                        res = api.words(ml=orth)
                        if len(res) > 0:
                            syns = self._get_synonyms(
                                ' '.join(sent), (index, word), res)
                    if len(syns) > 1:
                        self.synonyms[key] = syns
                    else:
                        self.synonyms[key] = None
                    if changed:
                        changed = False
                        # append the newly resolved key to the synonym cache
                        with open('src/syns.yaml', 'a') as f:
                            f.write(yaml.dump({key: self.synonyms[key]}))
def test_empty_document():
    document = build_document()
    summarizer = TextRankSummarizer(Stemmer("english"))

    returned = summarizer(document, 10)

    assert len(returned) == 0
def summary(url, length, LANGUAGE):
    language = LANGUAGE.lower()
    e = ''  # captured error message, returned to the caller
    raw_html = image = meta = text = ''  # defaults in case the download fails
    article = Article(url)
    try:
        article.download()
        print('  successfully d/l')
        article.parse()
        raw_html = article.html
        image = article.top_image
        meta = article.meta_description
        text = article.text
    except Exception as err:
        # `except ... as e` would unbind `e` after the block in Python 3,
        # breaking the final return, so capture the message explicitly
        e = str(err)
        print(e)
    if not text:
        print('  using Readability')
        raw_text = Readability(raw_html, url)
        text = raw_text.content
        article.download(html=text)
        article.parse()
        text = article.text
    if not meta:
        meta = article.title
    meta = unescape(unescape(meta))
    meta = normalize('NFKD', meta)
    meta = meta.strip()
    image = image.replace('(', '\\(')
    image = image.replace(')', '\\)')
    image_des = '\n\n> [{0}]({1})'.format("**^pic**", image) if image else None
    parser = PlaintextParser(text, Tokenizer(language))
    word_count = len(text.split())
    compression = 100
    extra_words = 0
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    short = []
    line = ''
    if word_count >= 600:
        length = length + int(log(word_count / 600))
    for sentence in summarizer(parser.document, length):
        if str(sentence).strip().lower() in meta.lower():
            extra_words = len(str(sentence).split())
            continue
        line = '>• {0}'.format(sentence)
        line = line.replace("`", "\'")
        line = line.replace("#", "\\#")
        short.append(line)
    extract = '\n\n'.join(short)
    extract = (extract + image_des) if image_des else extract
    meta = meta.replace('#', '\\#')
    if len(meta) > 400:
        lpoint = meta.rfind('.', 0, 400)
        if lpoint == -1:
            meta = meta[:meta.rfind(' ', 0, 400)] + '...'
        else:
            meta = meta[:meta.rfind('.', 0, 400)] + '...'
    try:
        compression = int(((extract.count(' ') + extra_words) / word_count) * 100)
    except Exception as numerror:
        print(numerror)
    print('  from {0} words to {1} words ({2}%)'.format(
        word_count, len(extract.split()), compression))
    return (meta, extract, compression, e)
def crawl():
    print(datetime.now())
    cursor = conn.cursor()
    cursor.execute("SET NAMES utf8mb4")
    cursor.execute('select id, name, feedUrl, lang, form from sources')
    sources = cursor.fetchall()
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    for source in sources:
        print(source[0])
        source = {
            'id': source[0],
            'name': source[1],
            'feedUrl': source[2].replace("39.105.127.55", "127.0.0.1"),
            'lang': source[3],
            'form': source[4]
        }
        print(source['name'])
        LANGUAGE = 'chinese'
        if source['lang'] == 2:
            LANGUAGE = 'english'
        items = feedparser.parse(source['feedUrl'])['items']
        for item in items:
            try:
                cursor.execute('select 1 from entries where link = %s limit 1',
                               (item['link'], ))
                results = cursor.fetchall()
                if not results:
                    try:
                        entry = {
                            'title': item['title'],
                            'link': item['link'],
                            'source_id': source['id'],
                            'source_name': source['name'],
                            'time': '',
                            'crawl_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            'photo': '',
                            'lang': 1,
                            'author': '',
                            'description': '',
                            'digest': '',
                            'content': '',
                            'cluster': 0,
                            'sim_count': 0,
                            'simhash': '0',
                            'cate11': '', 'cate12': '', 'cate13': '',
                            'cate21': '', 'cate22': '', 'cate23': '',
                            'tag1': '', 'tag2': '', 'tag3': '', 'tag4': '', 'tag5': '',
                            'video': '',
                            'video_frame': '',
                            'audio': '',
                            'audio_frame': ''
                        }
                        cate1 = ['', '', '']
                        cate2 = ['', '', '']
                        tag = ['', '', '', '', '']

                        # Additional settings for special sources
                        if entry['source_name'] == 'Hacker News':
                            entry['link'] = item['comments']

                        if is_en(entry['title']):
                            entry['lang'] = 2
                        if 'published_parsed' in item:
                            try:
                                entry['time'] = datetime.fromtimestamp(
                                    mktime(item['published_parsed'])) + timedelta(hours=TZ_DELTA)
                            except Exception as e:
                                entry['time'] = entry['crawl_time']
                                print('Exception when published_parsed: {}'.format(e))
                        else:
                            entry['time'] = entry['crawl_time']
                        if 'author' in item:
                            entry['author'] = item['author'][0:20]
                        if 'summary' in item:
                            entry['description'] = item['summary'][0:500]
                        if 'content' in item:
                            entry['content'] = item['content'][0]['value'][0:15000]
                        if entry['content'] == '' and 'summary' in item and len(item['summary']) > 0:
                            entry['content'] = item['summary'][0:15000]
                        for field in item['links']:
                            if field['type'] == 'audio/mpeg':
                                if field['href'].endswith('.mp3'):
                                    entry['audio'] = field['href']
                                if field['href'].endswith('.mp4'):
                                    entry['video'] = field['href']

                        # Only article-type entries get a digest, clustering,
                        # categorization and tags
                        if source['form'] == 1:
                            try:
                                if entry['content'] != '':
                                    entry['photo'] = getImg(entry['content'])
                                    if len(entry['photo']) > 255:
                                        entry['photo'] = ''
                                    parser = HtmlParser.from_string(
                                        entry['content'], "", Tokenizer(LANGUAGE))
                                else:
                                    parser = HtmlParser.from_url(
                                        entry['link'], Tokenizer(LANGUAGE))
                                stemmer = Stemmer(LANGUAGE)
                                summarizer = Summarizer(stemmer)
                                summarizer.stop_words = get_stop_words(LANGUAGE)
                                for sentence in summarizer(parser.document, SENTENCES_COUNT):
                                    entry['digest'] += str(sentence)
                                    if len(entry['digest']) >= 500:
                                        break
                                entry['digest'] = entry['digest'][0:500]
                            except Exception as e:
                                print('Exception when getting digest: {}'.format(e))

                            features = get_features(entry['title'], entry['content'])
                            try:
                                entry['simhash'] = str(Simhash(features).value)
                                nears = index.get_near_dups(Simhash(features))
                                if len(nears) > 0:
                                    entry['sim_count'] = len(nears)
                                    cursor.execute(
                                        'select cluster from entries where id = %s',
                                        (int(nears[0]), ))
                                    near_cluster = cursor.fetchone()[0]
                                    entry['cluster'] = near_cluster
                                else:
                                    global last_cluster_num
                                    entry['cluster'] = last_cluster_num
                                    last_cluster_num += 1
                            except Exception as e:
                                print('Exception when clustering: {}'.format(e))

                            try:
                                content2 = BeautifulSoup(
                                    entry['content'], "lxml").text.encode(
                                        'gbk', 'ignore').decode('gbk')[0:AIP_MAX_LEN_CONTENT]
                                if len(content2) == 0:
                                    if len(entry['digest']) > 0:
                                        content2 = entry['digest']
                                title2 = entry['title'][0:AIP_MAX_LEN_TITLE]
                                keywords = client.keyword(title2, content2)
                                topics = client.topic(title2, content2)
                                # renamed from `item` in the original, which
                                # shadowed the feed item being processed
                                i = 0
                                for topic_item in topics['item']['lv1_tag_list']:
                                    cate1[i] = topic_item['tag']
                                    i += 1
                                    if i > 2:
                                        break
                                i = 0
                                for topic_item in topics['item']['lv2_tag_list']:
                                    cate2[i] = topic_item['tag']
                                    i += 1
                                    if i > 2:
                                        break
                                i = 0
                                for kw_item in keywords['items']:
                                    tag[i] = kw_item['tag']
                                    i += 1
                                    if i > 4:
                                        break
                                entry['cate11'], entry['cate12'], entry['cate13'] = cate1
                                entry['cate21'], entry['cate22'], entry['cate23'] = cate2
                                entry['tag1'], entry['tag2'], entry['tag3'], entry['tag4'], entry['tag5'] = tag
                            except Exception as e:
                                print('Exception when categorizing and tagging: {}'.format(e))
                        elif source['form'] == 2:
                            entry['photo'] = getWeiboImg(entry['content'])
                            entry['digest'] = filterWeiboTags(entry['content'])
                            if len(entry['digest']) > 500:
                                entry['digest'] = entry['digest'][0:500]
                        elif source['form'] == 4:
                            if entry['link'].startswith('https://www.bilibili.com/video'):
                                entry['video_frame'] = ('http://player.bilibili.com/player.html?aid='
                                                        + entry['link'][33:])
                        try:
                            cursor.execute(add_entry, entry)
                            conn.commit()
                            # note: `features` is only assigned for form == 1;
                            # for other forms this raises and is caught below
                            index.add(str(cursor.lastrowid), Simhash(features))
                        except Exception as e:
                            print('Exception when add entry: {}'.format(e))
                    except Exception as e:
                        print("Unexpected Error: {}".format(e))
            except Exception as e:
                print("Unexpected Error: {}".format(e))
    elapsed = time.perf_counter() - start
    print('time used: ' + str(elapsed))
    # Close cursor and connection
    cursor.close()
def main(argv):
    # Read parameters: input file and output directory
    try:
        opts, args = getopt.getopt(argv, "i:o:", ["inputFile=", "outputDirectory="])
    except getopt.GetoptError:
        print('main.py -i <inputFile> -o <outputDirectory>')
        sys.exit(2)
    if len(opts) != 2:
        print('main.py -i <inputFile> -o <outputDirectory>')
        sys.exit(2)
    PDF_SummaryDir = ''
    sourcePDFFile = ''
    for opt, arg in opts:
        if opt == '-h':
            print('main.py -i <inputFile> -o <outputDirectory>')
            sys.exit()
        elif opt in ("-i", "--inputFile"):
            sourcePDFFile = arg
            if os.path.exists(sourcePDFFile):
                print('[+] PDF file found')
        elif opt in ("-o", "--outputDirectory"):
            PDF_SummaryDir = arg
            # Create the PDF_summary directory if it does not exist
            if not os.path.exists(PDF_SummaryDir):
                os.makedirs(PDF_SummaryDir)
                print('[+] Directory created')

    # Set parameters
    languages = ['spanish', 'english']
    print('Select language')
    LANGUAGE = languages[cutie.select(languages)]
    print('[+] Language selected')
    SENTENCES_COUNT = 30
    algoritmos = ['Luhn', 'Lsa', 'LexRank', 'TextRank', 'SumBasic', 'KLsum']
    print('Select algorithm')
    chooseAlgo = algoritmos[cutie.select(algoritmos)]

    # Create directories for output files
    outputPDFDir = os.path.dirname(PDF_SummaryDir + '/pdf/pdf_split_files/')
    if not os.path.exists(outputPDFDir):
        os.makedirs(PDF_SummaryDir + '/pdf/pdf_split_files/')
    outputTXTDir = os.path.dirname(PDF_SummaryDir + '/Text_Files/')
    if not os.path.exists(outputTXTDir):
        os.makedirs(PDF_SummaryDir + '/Text_Files/')
    outputSummaryDir = os.path.dirname(PDF_SummaryDir + '/Summary/')
    if not os.path.exists(outputSummaryDir):
        os.makedirs(PDF_SummaryDir + '/Summary/')

    # Name prefix for split files
    outputNamePrefix = 'Split_Chapter_'
    timeSuffixSummary = str(time.strftime("%d-%m-%Y_%H.%M.%S"))
    targetPDFFile = 'temppdfsplitfile.pdf'  # Temporary file

    # Append slash to the output dirs if necessary
    if not outputPDFDir.endswith('/'):
        outputPDFDir = outputPDFDir + '/'
    if not outputTXTDir.endswith('/'):
        outputTXTDir = outputTXTDir + '/'
    if not outputSummaryDir.endswith('/'):
        outputSummaryDir = outputSummaryDir + '/'

    # Wait until the source PDF is ready for splitting
    while not os.path.exists(sourcePDFFile):
        print('Source PDF not found, sleeping...')
        time.sleep(10)

    if os.path.exists(sourcePDFFile):
        # Copy file to local working directory and process it
        shutil.copy(sourcePDFFile, targetPDFFile)
        # Open file in read-binary mode
        pdfFileObj2 = open(targetPDFFile, 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj2)
        pdfFileObj = BookmarkToPageMap(pdfFileObj2)
        # Get total pages
        numberOfPages = pdfReader.numPages
        i = 0
        newPageNum = 0
        prevPageNum = 0
        newPageName = ''
        prevPageName = ''
        for p, t in sorted([(v, k) for k, v in pdfFileObj.getDestinationPageNumbers().items()]):
            template = '%-5s %s'
            # To check page number and chapter title, uncomment:
            # print(template % ('Page', 'Title'))
            # print(template % (p + 1, t))
            newPageNum = p + 1
            newPageName = t
            if prevPageNum == 0 and prevPageName == '':
                # First page
                prevPageNum = newPageNum
                prevPageName = newPageName
            else:
                # Next page
                pdfWriter = PyPDF2.PdfFileWriter()
                page_idx = 0
                for i in range(prevPageNum, newPageNum):
                    pdfPage = pdfReader.getPage(i - 1)
                    pdfWriter.insertPage(pdfPage, page_idx)
                    page_idx += 1
                # Build names of the split files
                pdfFileName = str(outputNamePrefix + prevPageName + '.pdf').replace(':', '_').replace('*', '_')
                txtFileName = str(outputNamePrefix + prevPageName + '.txt').replace(':', '_').replace('*', '_')
                # Write each chapter to a .pdf file
                pdfOutputFile = open(outputPDFDir + pdfFileName, 'wb')
                pdfWriter.write(pdfOutputFile)
                pdfOutputFile.close()
                # Convert each chapter and write it to a .txt file
                txtOutputFile = open(outputTXTDir + txtFileName, 'w')
                txtOutputFile.write(convert(outputPDFDir + pdfFileName))
                txtOutputFile.close()
                # Create a summary from the plain-text file
                parser = PlaintextParser.from_file(outputTXTDir + txtFileName,
                                                   Tokenizer(LANGUAGE))
                stemmer = Stemmer(LANGUAGE)
                # Select the summarization algorithm
                if chooseAlgo == 'Lsa':
                    summarizer = Lsa(stemmer)
                elif chooseAlgo == 'LexRank':
                    summarizer = LexRank(stemmer)
                elif chooseAlgo == 'TextRank':
                    summarizer = TextRank(stemmer)
                elif chooseAlgo == 'Luhn':
                    summarizer = Luhn(stemmer)
                elif chooseAlgo == 'SumBasic':
                    summarizer = SumBasic(stemmer)
                elif chooseAlgo == 'KLsum':
                    summarizer = KLsum(stemmer)
                else:
                    print('Wrong algorithm selected.')
                    sys.exit(0)
                summarizer.stop_words = get_stop_words(LANGUAGE)
                # Open in append mode so each summary is added at the bottom
                summaryOutputFile = open(
                    outputSummaryDir + chooseAlgo + '_Summary_File' + timeSuffixSummary + '.txt', 'a')
                for sentence in summarizer(parser.document, SENTENCES_COUNT):
                    summaryOutputFile.write(str(sentence))
                # Separation between chapters
                summaryOutputFile.write(str('\n\n' + 'Title : ' + t + '\n' + '\t'))
                summaryOutputFile.close()
                i = prevPageNum
                prevPageNum = newPageNum
                prevPageName = newPageName
        # Split the last chapter (note: in the original this writer is built
        # but never written to disk)
        pdfWriter = PyPDF2.PdfFileWriter()
        page_idx = 0
        for i in range(prevPageNum, numberOfPages + 1):
            pdfPage = pdfReader.getPage(i - 1)
            pdfWriter.insertPage(pdfPage, page_idx)
            page_idx += 1
        pdfFileObj2.close()
        print('[+] File created: ' + outputSummaryDir + 'SummaryFile.txt')
        # Delete temp file
        os.unlink(targetPDFFile)
def analyze(parser):
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
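# Hypothetical driver for analyze(); LANGUAGE and SENTENCES_COUNT are assumed
# module-level constants here, as they are in the neighboring snippets.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

LANGUAGE = "english"
SENTENCES_COUNT = 10
with open("document.txt") as f:
    parser = PlaintextParser.from_string(f.read(), Tokenizer(LANGUAGE))
analyze(parser)  # prints the top SENTENCES_COUNT sentences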
# instantiate Slack client
slack_client = SlackClient(os.environ.get('SLACK_BOT_TOKEN'))
# starterbot's user ID in Slack: value is assigned after the bot starts up
starterbot_id = None

# constants
RTM_READ_DELAY = 1  # 1 second delay between reading from RTM
EXAMPLE_COMMAND = "do"
MENTION_REGEX = "^<@(|[WU].+?)>(.*)"
ARXIV_REGEX = r'(https?://[^\s]+[0-9]+)'
USER_HANDLE = 'danfei'  # your username on slack

# for sumy
lang = 'english'
tknz = Tokenizer(lang)
stemmer = Stemmer(lang)
summarizer = Summarizer(stemmer)
# the original set `parser.stop_word`, an attribute sumy never reads;
# stop words belong on the summarizer
summarizer.stop_words = get_stop_words(lang)


def summarize(string, num_sentence=3):
    """Summarize a string with sumy."""
    parser = PlaintextParser(string, tknz)
    summ_string = ''
    for sentence in summarizer(parser.document, num_sentence):
        summ_string += str(sentence) + ' '
    return summ_string
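# Quick usage sketch for summarize(); the input text is an invented placeholder.
text = ("Extractive summarizers score sentences and keep the best ones. "
        "LSA-based methods build a term-sentence matrix. "
        "Its SVD exposes the dominant topics. "
        "Sentences aligned with strong topics are selected.")
print(summarize(text, num_sentence=2))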
def process_data(text):
    text_data = unidecode.unidecode(text)
    clean_list, pure_list = prepare_for_regex(text_data)

    data_to_summarize = []
    for clean, pure in zip(clean_list, pure_list):
        if re.findall(clause, clean):
            data_to_summarize.append(pure)

    text_data = " ".join(data_to_summarize)
    parser = PlaintextParser(text_data, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = TextRankSummarizer(stemmer)
    summary = summarizer(parser.document, SENTENCES_COUNT)

    sentences = []
    for sentence in summary:
        # skip sentences containing markup-like punctuation
        skip = False
        for punct in ["[", "]", "{", "}", "=", "+", "_", "|", "<", ">", "^"]:
            if punct in str(sentence):
                skip = True
        if skip:
            continue
        if str(sentence)[-1] == "." and len(str(sentence)) < 500:
            # strip a leading digit if present
            try:
                int(str(sentence)[0])
                sentence = str(sentence)[1:].strip()
            except ValueError:
                sentence = str(sentence).strip()
            sent = nlp(sentence)
            sentence = ""
            entities = []
            for token in sent:
                if sentence and token.text.strip() not in string.punctuation:
                    sentence += " "
                if token.text.lower() in VERBS:
                    sentence += ('<mark class="entity" style="background: #ffffb3; '
                                 'padding: 0.2em 0.2em; line-height: 1; border-radius: 0.35em; '
                                 'box-decoration-break: clone; -webkit-box-decoration-break: '
                                 'clone">{}</mark>').format(token.text)
                    entities.append(token.text)
                else:
                    sentence += token.text
            sentence = sentence[:-1] + ". "
            for ent in sent.ents:
                if ent.text == "IP":
                    continue
                # parentheses added to make Python's and/or precedence explicit
                if (ent.text not in entities and ent.label_ == "ORG") or ent.text.lower() == "stripe":
                    sentence = sentence.replace(
                        ent.text,
                        ('<mark class="entity" style="background: #ccffcc; padding: 0.2em 0.2em; '
                         'line-height: 1; border-radius: 0.35em; box-decoration-break: clone; '
                         '-webkit-box-decoration-break: clone">{}</mark> ').format(ent.text))
                    entities.append(ent.text)
            for term in KEYWORDS:
                if term in entities:
                    continue
                case_insensitive = re.compile(re.escape(term), re.IGNORECASE)
                sentence = case_insensitive.sub(
                    ('<mark class="entity" style="background: #b3d9ff; padding: 0.2em 0.2em; '
                     'line-height: 1; border-radius: 0.35em; box-decoration-break: clone; '
                     '-webkit-box-decoration-break: clone">{}</mark>').format(term),
                    sentence)
            sentences.append({
                "text": sentence,
                "rating": get_sentiment(sentence)
            })

    readability_score, complexity_score = get_readability(text_data)
    return jsonify({
        "summary_sentences": sentences,
        "readability_score": readability_score,
        "complexity_score": complexity_score
    })
def tool(request, tool_name):
    tool = get_object_or_404(models.Tool, url_endpoint__iexact=tool_name)
    context = {'tool': tool}
    category = tool.category.replace(" ", "_").lower()
    print(tool.template_name, " template ", sep=" ")
    print('tools/{0}/{1}'.format(category, tool.template_name))

    if request.method == 'POST' and tool_name == "resume_builder":
        pdf = FPDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=5)  # was `auto=bool` (the type object itself)

        def write_cell(value, style='', h=7, align='L'):
            """Write one stripped line; deduplicates the repeated multi_cell calls."""
            pdf.set_font('Times', style, 15)
            pdf.multi_cell(w=0, h=h, txt=value.strip().rstrip("\n\r"), align=align)

        def section_header(title):
            pdf.multi_cell(w=0, h=9, txt=" ", align='C')
            pdf.set_font('Arial', 'B', 20)
            pdf.multi_cell(w=0, h=10, txt=title, align='L')

        def field_name(base, i):
            """POST field names: the first entry is unsuffixed, later ones get an index."""
            return base if i == 0 else base + str(i)

        # Header: name and contact details
        name = request.POST.get('asliname')
        pdf.set_top_margin(margin=5)
        pdf.set_font('Arial', 'B', 35)
        pdf.multi_cell(w=0, h=13, txt=name.strip().rstrip("\n\r"), align='C')
        pdf.set_font('Times', '', 16)
        for key, h in (('address', 8), ('email', 8), ('phone', 8), ('about', 10)):
            value = request.POST.get(key)
            pdf.multi_cell(w=0, h=h, txt=value.strip().rstrip("\n\r"), align='C')

        section_header("Profile")
        profile = int(request.POST.get('totalprofile'))
        for i in range(profile):
            write_cell(request.POST.get(field_name('temp', i)))

        section_header("Work Experience")
        work = int(request.POST.get('totalwork'))
        for i in range(work):
            write_cell(request.POST.get(field_name('companyplate2', i)), 'B')
            write_cell(request.POST.get(field_name('positionplate2', i)), 'B')
            write_cell(request.POST.get(field_name('durationplate2', i)))
            write_cell(request.POST.get(field_name('workdoneplate2', i)))
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')

        section_header("Education")
        edu = int(request.POST.get('totaledu'))
        for i in range(edu):
            write_cell(request.POST.get(field_name('institutiontemplate3', i)), 'B')
            write_cell(request.POST.get(field_name('duration2template3', i)), 'B')
            write_cell(request.POST.get(field_name('scoretemplate3', i)))
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')

        section_header("Projects")
        proj = int(request.POST.get('totalproj'))
        for i in range(proj):
            write_cell(request.POST.get(field_name('projectlate2', i)), 'B')
            write_cell(request.POST.get(field_name('techlate2', i)), 'B')
            write_cell(request.POST.get(field_name('projectdurationlate2', i)))
            write_cell(request.POST.get(field_name('projectdonelate2', i)))
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')

        section_header("Skills")
        skill = int(request.POST.get('totalskill'))
        diffi1 = request.POST.get("kill")
        print(skill)
        if diffi1 in ('Beginner', 'Intermediate', 'Advance'):
            write_cell(request.POST.get("skillkill"), 'B')
            write_cell(diffi1)
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')
        for i in range(1, skill):
            write_cell(request.POST.get("skillkill" + str(i)), 'B')
            write_cell(request.POST.get("kill" + str(i)))
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')

        section_header("Awards")
        award = int(request.POST.get('totalaward'))
        for i in range(award):
            write_cell(request.POST.get(field_name('achievementward', i)), 'B')
            write_cell(request.POST.get(field_name('awarddoneward', i)))
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')

        section_header("Interests")
        inter = int(request.POST.get('totalinter'))
        write_cell(request.POST.get("inter"))
        pdf.multi_cell(w=0, h=3, txt=" ", align='C')
        for i in range(1, inter):
            # note: later interest fields are posted as "rest1", "rest2", ...
            write_cell(request.POST.get("rest" + str(i)))
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')

        dirspot = os.getcwd()
        print(dirspot + " DIRECTORY DEKHO ")
        pdf.output(dirspot + '/resume.pdf', 'F')
        filename = dirspot + '/resume.pdf'
        return FileResponse(open(filename, 'rb'), content_type='application/pdf')

    if request.method == "POST" and tool_name == "text_summary":
        print(" POST DEKHO ", tool_name, sep=" ")
        inp = request.POST.get('input')
        aslinp = inp
        lang = request.POST.get('Languages')
        algo = request.POST.get('Algorithm')
        percen = request.POST.get('percentage')
        parser = PlaintextParser.from_string(inp, Tokenizer(lang))
        stemmer = Stemmer(lang)
        summarizers = {
            "Edmundson": Summarizer0,
            "Latent Semantic Analysis": Summarizer1,
            "LexRank": Summarizer2,
            "TextRank": Summarizer3,
            "Luhn": Summarizer4,
            "SumBasic": Summarizer5,
            "KL-Sum": Summarizer6,
        }
        summarizer = summarizers[algo](stemmer)
        summarizer.stop_words = get_stop_words(lang)
        summary = []
        # sumy accepts percentage strings such as "20%" for the item count
        for sentence in summarizer(parser.document, percen):
            summary.append(sentence)
        result = ''.join(str(v) for v in summary)
        print(result)
        context = {'tool': tool, 'summary': result, 'asliinp': aslinp}
        return render(request, 'tools/text_tools/text_summary.html', context)

    return render(request, 'tools/{0}/{1}'.format(category, tool.template_name), context)
from sumy.parsers.html import HtmlParser  # missing in the original snippet
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import hebrew_tokenizer as ht
import requests
import string

# for removing punctuation from text
translator = str.maketrans("", "", string.punctuation)

# Increase this variable for a longer summary, reduce it for a shorter one
SENTENCES_COUNT = 3

if __name__ == "__main__":
    # URL of a Hebrew-language page
    url = "https://he.wikipedia.org/wiki/%D7%93%D7%95%D7%A0%D7%9C%D7%93_%D7%98%D7%A8%D7%90%D7%9E%D7%A4"
    parser = HtmlParser.from_url(url, None)
    stemmer = Stemmer("Hebrew")
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words("Hebrew")
    # The encoding must be utf-8 for Hebrew text
    file = open("output.txt", "w", encoding='utf-8')
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # print the text from the Sentence object
        print(sentence._text)
        # write the result to the file as well
        file.write(sentence._text)
    file.close()
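# The script above imports hebrew_tokenizer but never calls it; a minimal
# sketch of that library's tokenize() generator (per its README it yields
# (group, token, token_num, (start, end)) tuples), on placeholder text:
import hebrew_tokenizer as ht

sample = "שלום עולם"  # placeholder Hebrew text
for grp, token, token_num, (start, end) in ht.tokenize(sample):
    print(grp, token)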
def text_summary():
    def set_stopwords(stopwords):
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(doc, candidate_pos, lower):
        """Keep only words whose POS tag is in candidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences

    def get_vocab(sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab

    def get_token_pairs(window_size, sentences):
        """Build token pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i + 1, i + window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(a):
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
        # Get symmetric matrix
        g = symmetrize(g)
        # Normalize matrix by column, ignoring zero elements in the norm
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm != 0)
        return g_norm

    def analyze(text, candidate_pos=['NOUN', 'PROPN', 'VERB'],
                window_size=4, lower=False, stopwords=list(), number=10):
        """Main function to analyze text"""
        set_stopwords(stopwords)
        # Parse text with spaCy
        doc = nlp(text)
        # Filter sentences -> list of lists of words
        sentences = sentence_segment(doc, candidate_pos, lower)
        vocab = get_vocab(sentences)
        token_pairs = get_token_pairs(window_size, sentences)
        g = get_matrix(vocab, token_pairs)
        # Initialize the PageRank weights
        pr = np.array([1] * len(vocab))
        previous_pr = 0
        for epoch in range(steps):
            pr = (1 - d) + d * np.dot(g, pr)
            if abs(previous_pr - sum(pr)) < min_diff:
                break
            else:
                previous_pr = sum(pr)
        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        node_weight = OrderedDict(
            sorted(node_weight.items(), key=lambda t: t[1], reverse=True))
        keyword = []
        for i, (key, value) in enumerate(node_weight.items()):
            keyword.append(key)
            if i > number:
                break
        return keyword

    def command_detected(sentence):
        # Detects whether a given sentence is a command or action item
        tagged_sentence = pos_tag(word_tokenize(sentence))
        first_word = tagged_sentence[0]
        pos_first = first_word[1]
        first_word = first_word[0].lower()
        for word in prohibited_command_words:
            if word in sentence:
                return False
        for word in command_words:
            if word in sentence:
                return True
        # First word is a modal or other verb that is not a gerund
        if (pos_first == "VB" or pos_first == "VBZ" or pos_first == "VBP") and first_word[-3:] != "ing":
            return True
        return False

    def retrieve_action_items():
        # Returns a list of the sentences containing action items
        action_items = []
        for sentence in tokenized_transcript:
            if command_detected(str(sentence)) is True:
                action_items += [str(sentence)]
        return action_items

    text = request.json
    text = text['data'].replace('Speaker ', '')
    source = re.sub(r'\d\s+\d{1,2}\:\d{2}', '', text)
    source = re.sub(r'\s+', ' ', source)
    Keywords = analyze(source, candidate_pos=['NOUN', 'PROPN', 'VERB'],
                       window_size=4, lower=False)
    tokenized_transcript = sent_tokenize(source)
    LANGUAGE = "english"  # sumy expects lowercase language names
    parser = PlaintextParser.from_string(source, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = summarizer(parser.document, len(tokenized_transcript) * 0.05)
    transcript_summary = []
    for sentence in summary:
        transcript_summary.append(str(sentence))
    command_words = [
        "can you", "would you", "can we", "you should", "we should",
        "we need to", "you need to", "ensure", "make sure", "make it",
        "we want to", "we must", "you must", "you have to",
        "we have to",  # a comma was missing here, which silently fused the
        "homework"     # next two items into "we have tohomework"
    ]
    prohibited_command_words = ["Let me", "?"]
    Action_item = retrieve_action_items()
    result = {
        "keywords :": Keywords,
        'Summary :': transcript_summary,
        'Action Items :': Action_item
    }
    return jsonify(result)
def summary(article_url):
    url = article_url
    # url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/potato"
    # url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/cabbage"
    # url = "http://www.encyclopedia.com/medicine/diseases-and-conditions/pathology/accident"
    # url = "http://www.encyclopedia.com/earth-and-environment/atmosphere-and-weather/atmospheric-and-space-sciences-atmosphere/air"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # create a list of reference sentences to calculate ROUGE_N scores
    ref_sentences = []
    trim_ref_sentences = []
    for paragraph in parser._article.main_text:
        for sections in paragraph:
            for sentences in sections:
                try:
                    # trim off very short, likely few-word "sentences"
                    if len(sentences) > 35:
                        ref_sentences.append(sentences)
                except TypeError:
                    # catch type errors caused by annotated text, i.e. h1, b, etc.
                    print("typeError")
                    continue
    trim_ref_sentences.extend(
        Sentence(s, Tokenizer(LANGUAGE)) for s in ref_sentences)

    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    # define summarizers for the summarizing methods being used
    summarizer_Lsa = Lsa(stemmer)
    summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
    summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

    summarizer_LexRank = LexRank()
    summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

    summarizer_Edmundson = Edmundson(stemmer)
    summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
    summarizer_Edmundson.bonus_words = parser.significant_words
    summarizer_Edmundson.stigma_words = parser.stigma_words
    summary_Edmundson = summarizer_Edmundson(parser.document, SENTENCES_COUNT)

    # print summaries and compute ROUGE_N scores for each
    summary_Lsa_trim = []
    for sentence in summary_Lsa:
        if len(sentence._text) > 20:
            print(sentence)
            summary_Lsa_trim.append(sentence)
    calc_value(summary_Lsa_trim, trim_ref_sentences)
    print('\n')

    summary_LexRank_trim = []
    for sentence in summary_LexRank:
        if len(sentence._text) > 20:
            print(sentence)
            summary_LexRank_trim.append(sentence)
    calc_value(summary_LexRank_trim, trim_ref_sentences)
    print('\n')

    summary_Edmundson_trim = []
    for sentence in summary_Edmundson:
        if len(sentence._text) > 20:
            print(sentence)
            summary_Edmundson_trim.append(sentence)
    calc_value(summary_Edmundson_trim, trim_ref_sentences)

    # index of the best model: 0=Edmundson, 1=Lsa, 2=LexRank
    models = {0: "Edmundson Model", 1: "Lsa Model", 2: "LexRank Model"}
    best_summary = max_r_value(summary_Lsa_trim, summary_LexRank_trim,
                               summary_Edmundson_trim, trim_ref_sentences)
    print(models.get(best_summary)
          + ' is the best model according to an average of the Rouge_3, 2 and 1 tests')

    # return the summary of the best model
    if best_summary == 0:
        return summary_Edmundson_trim
    elif best_summary == 1:
        return summary_Lsa_trim
    elif best_summary == 2:
        return summary_LexRank_trim
def test_empty_document(self):
    document = build_document()
    summarizer = TextRankSummarizer(Stemmer("english"))

    returned = summarizer(document, 10)

    self.assertEqual(len(returned), 0)
def test_english_stemmer(self):
    english_stemmer = Stemmer('english')
    self.assertEqual("beauti", english_stemmer("beautiful"))
with open(reference_filename) as fref:
    refs = fref.read()
extractive_references = ' '.join(sent_tokenize(refs)[0:5])
abstractive_references = ' '.join(sent_tokenize(refs)[-4:])
ex_reference_document = PlaintextParser.from_string(
    extractive_references, Tokenizer('english'))
abs_reference_document = PlaintextParser.from_string(
    abstractive_references, Tokenizer('english'))
ex_reference_sentences = ex_reference_document.document.sentences
abs_reference_sentences = abs_reference_document.document.sentences

# Read input file to be summarized
comm_parser = PlaintextParser.from_file(input_filename, Tokenizer('english'))
stemmer = Stemmer('english')

# Get list of sentences in the original commentary
orig_text = comm_parser.document.sentences

# Open output file for writing
fout = open(output_filename, 'w')

# Make baseline summary
baseline_summary = baseline(input_filename)
fout.write("BASELINE: \n")
fout.write(baseline_summary + "\n\n")
print("Summarizing using Algorithm: Baseline \n")
base_summary_sentences = PlaintextParser.from_string(
    baseline_summary, Tokenizer('english')).document.sentences
def test_german_stemmer(self):
    german_stemmer = Stemmer('german')
    self.assertEqual("sterb", german_stemmer("sterben"))
            fo.write(' negative_score: ' + str(negative_score))
            fo.write('\n')
            featureObs[featCount].summaryOp.append(line)
            if positive_score >= negative_score:
                fw.write(line.rstrip() + ' -> Positive')
                featureObs[featCount].positiveOp.append(line)
            else:
                fw.write(line.rstrip() + ' -> Negative')
                featureObs[featCount].negativeOp.append(line)
            fw.write('\n')
            featCount += 1
fp.seek(0)

stemmer = Stemmer("english")
summarizer = Summarizer(stemmer)

with open('Feature_Review.txt', 'w') as fw:
    for i in range(100):
        fw.write('\n\n\n*****************************\n')
        fw.write('FEATURE: ' + featureObs[i].featureName +
                 ' POSITIVE OPINIONS: ' + str(len(featureObs[i].positiveOp)) +
                 ' NEGATIVE OPINIONS: ' + str(len(featureObs[i].negativeOp)))
        fw.write('\n*****************************\n')
        # fw.write('\n*******SUMMARY REVIEW*******\n')
        # parser = PlaintextParser(''.join(featureObs[i].summaryOp), Tokenizer("english"))
        # for sentence in summarizer(parser.document, 50):
        #     fw.write(str(sentence))
        fw.write('\n*******POSITIVE REVIEWS*******\n')
def test_czech_stemmer(self):
    czech_stemmer = Stemmer('czech')
    self.assertEqual("pěkn", czech_stemmer("pěkný"))
def gaz(type_df, time, cut, many):
    nlp = spacy.load('en')
    if cut == "True":
        type_df = type_df[type_df["Review Date"] > time]
    else:
        type_df = type_df[type_df["Review Date"] < time]

    sample_review = ""
    for i in type_df["review"]:
        sample_review = sample_review + " " + str(i)
    sample_review = sample_review.replace("\\", "")

    # Summaries
    import sumy
    from sumy.summarizers.lex_rank import LexRankSummarizer
    from sumy.summarizers.text_rank import TextRankSummarizer
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    lexi = LexRankSummarizer(Stemmer("english"))
    parser = PlaintextParser.from_string(sample_review, Tokenizer("english"))
    texi = TextRankSummarizer(Stemmer("english"))  # was constructed twice in the original

    rentence = "dddd"
    for sentence in texi(parser.document, 10):
        # this does indeed summarise the document
        if (str(rentence).split()[len(str(rentence).split()) - 1][-1] == ".") and (len(rentence) > 2):
            rentence = rentence + " " + str(sentence)
        elif len(rentence) < 3:
            rentence = rentence + " " + str(sentence)
        else:
            rentence = rentence + ". " + str(sentence)

    stop_words = set(stopwords.words('english'))
    # remove this update if you need punctuation
    stop_words.update(['.', ',', '"', "'", '?', '!', '! !', ':', ';',
                       '(', ')', '[', ']', '{', '}'])
    list_of_words = [i.lower() for i in wordpunct_tokenize(sample_review)
                     if i.lower() not in stop_words]
    final = ' '.join(list_of_words)

    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    list_of_words = tokenizer.tokenize(final)
    final = ' '.join(list_of_words)
    parsed_review = nlp(final)

    token_text = [token.orth_ for token in parsed_review]
    token_pos = [token.pos_ for token in parsed_review]
    df = pd.DataFrame({'token_text': token_text, 'part_of_speech': token_pos})

    # n-grams
    import nltk
    from nltk import word_tokenize
    from nltk.util import ngrams
    from collections import Counter
    token = nltk.word_tokenize(str(parsed_review))
    grams = ngrams(token, many)
    dra = Counter(grams)
    t = pd.DataFrame()
    f = pd.DataFrame(list(dra.keys()))
    if many == 2:
        f[0] = f[0] + " " + f[1]
    if many == 3:
        f[0] = f[0] + " " + f[1] + " " + f[2]
    f = f[0]
    t["name"] = f
    t["count"] = list(dra.values())
    df = df.drop_duplicates()
    r = pd.merge(t, df, left_on=["name"], right_on=["token_text"],
                 how="left", right_index=False)
    r = r.drop("token_text", axis=1)
    r.columns = ["name", "count", "pos"]
    scaler = MinMaxScaler()
    r["norm"] = scaler.fit_transform(r["count"].values.reshape(-1, 1))
    if many == 1:
        dfs = r[r["pos"] == "NOUN"].sort_values("count", ascending=False)
    else:
        dfs = r.sort_values("count", ascending=False)
    return dfs, rentence
def test_french_stemmer(self):
    french_stemmer = Stemmer('french')  # was Stemmer('czech'), contradicting the test name
    self.assertEqual("jol", french_stemmer("jolies"))
# (reconstructed head of a truncated line: the trailing `))` implies list(map(...)))
in_files = list(map(lambda x: os.path.join("../data/reviews", x), in_files))
test_input = "hahahahahahahahahaha this is the most funny film I have ever seen"

# In[10]:

file_data = None
with open(in_file, 'r') as f:
    file_data = f.read()

# In[11]:

parser = PlaintextParser.from_file(in_file, Tokenizer(LANGUAGE))
summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words('slovak')
helper = _summarizer.AbstractSummarizer()

# In[36]:

explanator = anchor_text.AnchorText(nlp, ['negative', 'positive'],
                                    use_unk_distribution=True)

# In[13]:

# define a decorator to log execution time
# inspired by https://medium.com/pythonhive/python-decorator-to-measure-the-execution-time-of-methods-fa04cb6bb36d
def test_slovak_stemmer(self):
    expected = Stemmer("czech")
    actual = Stemmer("slovak")

    self.assertEqual(type(actual), type(expected))
    self.assertEqual(expected.__dict__, actual.__dict__)
def stem(word, LANGUAGE="portuguese"):
    stemmer = Stemmer(LANGUAGE)
    return stemmer(to_unicode(word).lower())
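# Usage sketch for stem(); assumes `Stemmer` and `to_unicode` are imported as
# in this module. Outputs shown in the comments are illustrative Snowball
# results, not verified values.
print(stem("gatos"))                        # e.g. "gat" (Portuguese)
print(stem("running", LANGUAGE="english"))  # e.g. "run"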
def data_pre_train_mongo(data_path='data/data.json', train_path='data/train_db.txt'):
    """Pre-process articles into pre-training text.

    from=0   # starting article id
    limit=10 # number of articles returned
    >>> data_pre_train(from=0, limit=10)

    Marker tokens:
    [unused5] marks keywords
    [unused6] marks the title
    [unused7] marks the preceding title
    [unused8] marks the body text
    """
    LANGUAGE = "chinese"
    SENTENCES_COUNT = 10
    article_max_len = 500
    ttext = tkitText.Text()
    jieba.load_userdict('dict.txt')
    jieba.analyse.set_stop_words('stopwords.txt')
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # ie = tkitNlp.TripleIE(model_path="/mnt/data/dev/model/ltp/ltp_data_v3.4.0")
    f1 = open(train_path, 'w')
    tt = tkitText.Text()
    # TF-IDF keyword extraction interface
    tfidf = analyse.extract_tags
    # TextRank keyword extraction interface
    textrank = analyse.textrank
    # MongoDB connection (kept from the original, although the loop below
    # reads from the JSON file instead)
    client = pymongo.MongoClient("localhost", 27017)
    DB_kg_scrapy = client.kg_scrapy
    print(DB_kg_scrapy.name)  # was `print(DB.name)`, an undefined name
    q = {}
    tclass = classify(model_name_or_path='tkitfiles/check_pet', num_labels=10, device='cuda')
    Ner = get_ner()
    i = 0
    # for item in DB_kg_scrapy.kg_content.find(q):
    tjson = tkitFile.Json(file_path=data_path)
    for item in tqdm(tjson.auto_load()):
        i = i + 1
        if i % 10000 == 0:
            print(i)
        # shorter articles get shorter summaries
        if len(item['content']) > 500:
            SENTENCES_COUNT = 5
        else:
            SENTENCES_COUNT = 3
        parser = PlaintextParser.from_string(item['content'], Tokenizer(LANGUAGE))
        l = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            l.append(str(sentence))
        keywords = textrank(item['title'] + '\n' + item['content'], topK=10, withWeight=False)
        keyphrases = tt.get_keyphrases(item['title'] + '\n' + item['content'])
        # classify the article; only label 1 is kept for training
        p = tclass.pre(item['content'])
        # (NER-based word extraction was disabled in the original)
        keywords = list(set(keywords + keyphrases))
        # note: the [CONTNET] marker spelling is kept as in the original data format
        content = (" [KW] " + ",".join(keywords) + " [/KW] [TT] " + item['title']
                   + " [/TT] [SM] " + "".join(l) + " [/SM] [CONTNET] "
                   + item['content'] + " [/CONTNET] [PT] " + item['title']
                   + " [/PT] [END]")
        content = content.replace("\n\n\n", "\n\n")
        content = content.replace("\n", " [SEP] ")
        if p == 1:
            f1.write(content)
            f1.write("\n")
    f1.close()  # assumption: the original snippet is cut off before closing the file