Example #1
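A plausible import header for this example (a sketch: using LsaSummarizer as the Summarizer alias is an assumption, and tkitText/tkitNlp plus the sentence_seg helper are project-specific):

import gc
import json

from jieba import analyse                      # provides analyse.extract_tags / analyse.textrank
from tqdm import tqdm

from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

import tkitText
import tkitNlp
# sentence_seg() is a project-local helper and is not shown in this listing.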
def data_pre_train(data_path='data/data.json', train_path='data/train.txt'):
    """
    from=0  #文章开始id
    limit=10 # 返回文章数目
    >>>data_pre_train(from=0, limit=10)
    [unused5] 标记关键词
      [unused6]  标记标题
    [unused7]  标记前文标题
       [unused8]  标记正文
    """
    LANGUAGE = "chinese"
    SENTENCES_COUNT = 10
    article_max_len=500
    # tjson=tkit.Json(file_path=data_path)
    # data=tjson.auto_load()
    # print(len(data))
    ttext=tkitText.Text()
    # extractor = tkit.TripleExtractor()
    # if len(data)>tfrom+limit:
    #     data=data[tfrom:tfrom+limit]
    # elif len(data)<tfrom:
    #     print("data is too short; something is wrong")
    #     return []
    # else:
    #     data=data[tfrom:]
    # for item in tjson.auto_load():
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    ie=tkitNlp.TripleIE(model_path="/mnt/data/dev/model/ltp/ltp_data_v3.4.0")
    f1 = open(train_path,'w')
    articles=[]
    # TF-IDF keyword-extraction interface
    tfidf = analyse.extract_tags
    # TextRank keyword-extraction interface
    textrank = analyse.textrank
    with open(data_path, 'r', encoding = 'utf-8') as data:
        for art_i,it in tqdm(enumerate(data)):
            item=json.loads(it[:-1])
            # if art_i%10==0:
            #     print('arti', art_i)
            segs_pre=[]
            segs_end=[]
            # # segs_pre.append(' [KW] '+item['keywords']+' [SEP] ')
            # # l=ttext.summary( item['content'],num=10)
            # # extractor = tkit.TripleExtractor()
            # # svos = extractor.triples_main(item['content'])

            # # extractor.clear()
            # # print('svos', svos)
            # parser = PlaintextParser.from_string(item['content'], Tokenizer(LANGUAGE))
            # l=[]
            # for sentence in summarizer(parser.document, SENTENCES_COUNT):
            #     l.append(str(sentence))
            # # del sentence
            s = []
            # # From here on: extract keywords, key sentences and related info
            # try:
            #     for it in ie.get(item['title']+'\n'+item['content']):
            #         # print(it)
            #         if it==None:
            #             pass
            #         else:
            #             s.append(''.join(list(it)))
            #     # print(s)
            # except:
            #     pass
            # # s=get_seq(item['title']+'\n'+item['content'])
            # # Keyword extraction with the TextRank algorithm
            keywords = textrank(item['title']+'\n'+item['content'], topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) 
            # # Print the extracted keywords
            # # print(keywords)
            # # for keyword in keywords:
            # #     print (keyword + "/",)
            # # Keyword extraction with the TF-IDF algorithm
            # # keywords = tfidf(item['title']+'\n'+item['content'], topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
            # # print(keywords)
            # # Print the extracted keywords
            # # for keyword in keywords:
            # #     print( keyword + "/",)
            # # keywords1 =ttext.get_keywords(item['title']+'\n'+item['content'])
            # # new_keywords=[]
            # # for keyword in keywords1:
            # #     new_keywords.append(keyword['word'])        
            # # keywords =ttext.get_keyphrases(item['title']+'\n'+item['content'])
            # # kws=keywords+new_keywords
            # # # s.append(','.join(kws))
            # s=[','.join(keywords)]+s
            segs_pre.append(' [KW] '+','.join(keywords)+' [/KW] ')
            # del s
            # # svos = extractor.triples_main('。'.join(l))
            # # content masked out here
            try:
                segs_pre.append(' [TT] '+item['title']+" [/TT] ")
                segs_end.append(' [PT] '+item['title']+" [/PT] ")
            except:
                pass
            segs=sentence_seg(" [CLS] "+item['content']+" [END] ")
            article="".join(segs_pre+segs+segs_end)
            
            one=[]
            for i in range(len(article)//article_max_len+1):
                # slice the article into chunks of article_max_len characters
                one.append(article[i*article_max_len:(i+1)*article_max_len]+"")
            articles.append("\n".join(one)+"")
            if art_i%100==0:
                print('arti', art_i)
                # f1.write("\n\n".join(articles)+"\n\n")
                f1.write("\n\n".join(articles)+"")
                articles=[]
            # del articles
            del segs
        f1.write("\n\n".join(articles)+"")
        f1.close()
        gc.collect()
        del stemmer
        del summarizer
        del ie


        gc.collect()
        return
Example #2
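A plausible import header for this Django view (a sketch: LsaSummarizer as Summarizer and the import paths of the app-specific imageDB, textDB and Scraper are assumptions):

import json

from bs4 import BeautifulSoup
from django.http import HttpResponse
from django.utils import timezone
from selenium import webdriver
from urllib.parse import urljoin, urlparse
from urllib.request import Request, urlopen

from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.utils import get_stop_words

from .models import imageDB, textDB   # app-specific models; path is an assumption
from .scraper import Scraper          # project helper; path is an assumption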
def scrape(request):
    if request.method == 'POST':

        y = json.loads(request.body)
        url = y.get("url", None)
        print(url)

        driver = webdriver.PhantomJS(
            executable_path='../phantomjs/bin/phantomjs')
        driver.get(url)
        el = driver.find_element_by_tag_name("body")
        textContent = el.text
        driver.close()

        imageSourceUrls = imageDB.objects.values_list('sourceUrl', flat=True)
        imageSourceUrls = list(imageSourceUrls)

        textSourceUrls = textDB.objects.values_list('sourceUrl', flat=True)
        textSourceUrls = list(textSourceUrls)

        summary = textContent
        if url not in textSourceUrls or url not in imageSourceUrls:
            LANGUAGE = "english"
            SENTENCES_COUNT = 10

            # parser = PlaintextParser.from_string(textContent,Tokenizer("english"))
            # summarizer = LuhnSummarizer()
            # summary = ''

            # for sentence in summarizer(parser.document, SENTENCES_COUNT):
            # 	summary = summary + str(sentence)

            # print("Summary ",summary)

            parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)

            summaryText = ""
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                summaryText = summaryText + str(sentence)

            r = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            x = urlopen(r)
            codebase = BeautifulSoup(x, 'html.parser')
            title = codebase.title.string
            if not title:
                domain = urlparse(url)
                title = domain.hostname
            print(title)
            iconLink = codebase.find("link", rel="shortcut icon")
            if not iconLink:
                iconLink = ' '
            else:
                iconLink = urljoin(url, iconLink.get('href'))

            textDB.objects.create(summaryText=summaryText,
                                  summary=summary,
                                  dateTime=timezone.now(),
                                  sourceUrl=url,
                                  title=title,
                                  icon=iconLink)

            scraper = Scraper()
            scraper.scrape(url)
        else:
            textDB.objects.filter(sourceUrl=url).delete()
            imageDB.objects.filter(sourceUrl=url).delete()
            print("DELETED")
            LANGUAGE = "english"
            SENTENCES_COUNT = 10

            parser = PlaintextParser.from_string(textContent,
                                                 Tokenizer("english"))
            summarizer = LuhnSummarizer()

            summary = ''

            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                summary = summary + str(sentence)

            parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)

            summaryText = ""
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                summaryText = summaryText + str(sentence)

            r = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            x = urlopen(r)
            codebase = BeautifulSoup(x, 'html.parser')
            title = codebase.title.string
            iconLink = codebase.find("link", rel="shortcut icon")
            if not iconLink:
                iconLink = ' '
            else:
                iconLink = urljoin(url, iconLink.get('href'))

            textDB.objects.create(summaryText=summaryText,
                                  summary=summary,
                                  dateTime=timezone.now(),
                                  sourceUrl=url,
                                  title=title,
                                  icon=iconLink)

            scraper = Scraper()
            scraper.scrape(url)

        return HttpResponse("Successful")
Example #3
    def __summarize(self, parser):
        summarizer = LsaSummarizer(Stemmer(self.__language))
        summarizer.stop_words = get_stop_words(self.__language)
        final_sentences = summarizer(parser.document, self.__sentences_count)
        return self.__join_sentences(final_sentences)
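For context, a minimal sketch of the kind of class this private method could belong to; the attribute names come from the snippet, while the class name, the constructor defaults, the public wrapper and the space-join in __join_sentences are assumptions:

from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words


class TextSummarizer:
    def __init__(self, language='english', sentences_count=5):
        self.__language = language
        self.__sentences_count = sentences_count

    def summarize_text(self, text):
        parser = PlaintextParser.from_string(text, Tokenizer(self.__language))
        return self.__summarize(parser)

    def __summarize(self, parser):
        summarizer = LsaSummarizer(Stemmer(self.__language))
        summarizer.stop_words = get_stop_words(self.__language)
        final_sentences = summarizer(parser.document, self.__sentences_count)
        return self.__join_sentences(final_sentences)

    def __join_sentences(self, sentences):
        # assumption: sentences are simply joined with spaces
        return ' '.join(str(s) for s in sentences)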
Example #4
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 10
#nltk.download()
if __name__ == "__main__":
    url = "http://money.cnn.com/2015/12/01/investing/premarket-stocks-trading/index.html?iid=hp-stack-dom"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
Example #5
def crawl():
    print(datetime.now())
    cursor = conn.cursor()
    cursor.execute("SET NAMES utf8mb4")
    cursor.execute(
        'select id, name, feedUrl, lang, form, content_rss from sources where mod(id, 30)=mod(%s, 30)',
        (datetime.now().minute, ))
    sources = cursor.fetchall()
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    for source in sources:
        # if source['id']%30 == datetime.now().minute%30:
        print(source[0])
        source = {
            'id': source[0],
            'name': source[1],
            'feedUrl': source[2].replace("188.131.178.76", "127.0.0.1"),
            'lang': source[3],
            'form': source[4],
            'content_rss': source[5]
        }
        print(source['name'])
        LANGUAGE = 'chinese'
        if source['lang'] == 2:
            LANGUAGE = 'english'
        items = feedparser.parse(source['feedUrl'])['items']
        for item in items:
            cursor.execute('select 1 from entries where link = %s limit 1',
                           (item['link'], ))
            results = cursor.fetchall()
            if (not results) or (len(results) == 0):
                entry = {
                    'title':
                    item['title'],
                    'link':
                    item['link'],
                    'source_id':
                    source['id'],
                    'source_name':
                    source['name'],
                    'time':
                    datetime.fromtimestamp(mktime(item['published_parsed'])) +
                    timedelta(hours=TZ_DELTA),
                    'crawl_time':
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'photo':
                    '',
                    'lang':
                    source['lang'],
                    'author':
                    item['author'],
                    'description':
                    '',
                    'digest':
                    '',
                    'content':
                    ''
                }
                if 'content' in item:
                    entry['content'] = item['content'][0]['value']
                if entry['content'] == '':
                    entry['content'] = item['summary']
                if entry['content'] != '':
                    entry['photo'] = getImg(entry['content'])
                if source['form'] == 1:
                    if source['content_rss'] == 1 and entry['content'] != '':
                        parser = HtmlParser.from_string(
                            entry['content'], "", Tokenizer(LANGUAGE))
                        stemmer = Stemmer(LANGUAGE)
                        summarizer = Summarizer(stemmer)
                        summarizer.stop_words = get_stop_words(LANGUAGE)
                        for sentence in summarizer(parser.document,
                                                   SENTENCES_COUNT):
                            entry['digest'] += str(sentence)
                            if len(entry['digest']) >= 500:
                                break

                    else:
                        parser = HtmlParser.from_url(entry['link'],
                                                     Tokenizer(LANGUAGE))
                        stemmer = Stemmer(LANGUAGE)
                        summarizer = Summarizer(stemmer)
                        summarizer.stop_words = get_stop_words(LANGUAGE)
                        for sentence in summarizer(parser.document,
                                                   SENTENCES_COUNT):
                            entry['digest'] += str(sentence)
                            if len(entry['digest']) >= 500:
                                break
                    entry['digest'] = entry['digest'][0:500]
                cursor.execute(add_entry, entry)
                conn.commit()
        # print(d['feed']['title'])
    elapsed = time.perf_counter() - start
    print('time used: ' + str(elapsed))

    # Close the Cursor and the Connection:
    cursor.close()
Example #6
import pickle
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


summarizer = LsaSummarizer(Stemmer('english'))
summarizer.stop_words = get_stop_words('english')

def lsa_summarizer(parser, sent_count):
    lsa_s = summarizer(parser.document, sent_count)
    summary_str = ""
    summary = []
    for sent in lsa_s:
        summary_str += str(sent)
    summary.append(summary_str)
    len_summary = 0
    for sent in summary:
        len_summary += len(sent.split())
    return summary, len_summary

def lsa_summaries(filepath, word_count):
    with open(filepath, 'rb') as filehandle:
        texts_str = pickle.load(filehandle)
    lsa_summary = []
    for t in texts_str:
        parser = PlaintextParser(t, Tokenizer('english'))
        for i in range(len(t.split('.'))):
            summary, len_summary = lsa_summarizer(parser, i)
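A minimal call of lsa_summarizer above, reusing the imports already shown (the text and the sentence count are illustrative):

text = ("Sumy builds extractive summaries. It scores sentences and keeps the best ones. "
        "LSA is one of the algorithms it offers.")
parser = PlaintextParser(text, Tokenizer('english'))
summary, len_summary = lsa_summarizer(parser, 2)   # request a 2-sentence summary
print(summary[0], '({} words)'.format(len_summary))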
Example #7
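The imports this constructor appears to rely on (a sketch: the datamuse and pycorpora import paths are educated guesses, and CleaningProcessor, UNIVERSAL_TO_DATAMUSE and the class-level separator attribute are project-specific and not shown):

from collections import OrderedDict
from os import path
from typing import Dict, List, Optional

import nltk
import pycorpora
import spacy
import yaml
from datamuse import Datamuse
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize

from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words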
    def __init__(self, input_texts: str):
        nltk.download('punkt')

        self.nlp = spacy.load('en_core_web_lg')

        self.summarizer = LsaSummarizer(Stemmer('english'))
        self.summarizer.stop_words = get_stop_words('english')

        self.cleaner = CleaningProcessor()

        self.synonyms: Dict[str, Optional[List[str]]] = {}
        if path.isfile('src/syns.yaml'):
            with open('src/syns.yaml', 'r') as f:
                self.synonyms = yaml.safe_load(f)

        if self.synonyms is None:
            self.synonyms = {}

        self.patterns: Dict[str, str] = OrderedDict()
        self.rev_patterns: Dict[str, str] = OrderedDict()

        with open('src/spreadr_shreddr/data.yaml', 'r') as f:
            data = yaml.safe_load(f)

        self.patterns.update(data['shorten'])
        self.patterns.update(data['expand'])

        data['filler'].extend(
            pycorpora.get_file('humans', 'prefixes')['prefixes'])

        self.patterns.update({k: '' for k in data['filler']})

        for obj in pycorpora.get_file('words', 'compounds')['compounds']:
            key = '{} {}'.format(obj['firstWord'], obj['secondWord'])
            if key not in self.patterns:
                self.patterns[key] = obj['compoundWord']

        self.patterns.update(
            {k.capitalize(): v.capitalize()
             for k, v in self.patterns.items()})

        self.brits = data['brit_am']
        self.murcans = {v: k for k, v in self.brits.items()}

        changed = False
        api = Datamuse()
        for text in input_texts:
            text >>= self.cleaner

            for sent in sent_tokenize(text):
                for index, word in enumerate(self.nlp(sent)):
                    orth = word.orth_.lower()
                    key = self.separator.join((orth, word.tag_))

                    if key not in self.synonyms:
                        changed = True
                        syns: List[str] = []

                        if (word.pos_ in UNIVERSAL_TO_DATAMUSE
                                and len(wn.synsets(orth)) <= 1):
                            res = api.words(ml=orth)

                            if len(res) > 0:
                                syns = self._get_synonyms(
                                    ' '.join(sent), (index, word), res)

                        if len(syns) > 1:
                            self.synonyms[key] = syns
                        else:
                            self.synonyms[key] = None

                    if changed:
                        changed = False
                        with open('src/syns.yaml', 'a') as f:
                            f.write(yaml.dump({key: self.synonyms[key]}))
Example #8
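A sketch of what the test needs to run; the real build_document helper lives in sumy's own test utilities, so this empty-document stand-in built on sumy.models.dom is an assumption that only covers the case exercised here:

from sumy.models.dom import ObjectDocumentModel
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.text_rank import TextRankSummarizer


def build_document():
    # stand-in for sumy's test helper: with no arguments it yields an empty document
    return ObjectDocumentModel([])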
def test_empty_document():
    document = build_document()
    summarizer = TextRankSummarizer(Stemmer("english"))

    returned = summarizer(document, 10)
    assert len(returned) == 0
Example #9
def summary(url, length, LANGUAGE):
    language = LANGUAGE.lower()
    e = str() #capture error

    article = Article(url)
    try:    
        article.download()
        print ('  successfully d/l')
        article.parse()
        raw_html = article.html
        image = article.top_image
        meta = article.meta_description
        text = article.text
    except Exception as e:
        print(e)
 
    if not text:
        print ('  using Readability')
        raw_text = Readability(raw_html, url)
        text = raw_text.content
        article.download(html=text)
        article.parse()
        text = article.text
    if not meta:
        meta = article.title
    meta = unescape(unescape(meta))
    meta = normalize('NFKD', meta)
    meta = meta.strip()
    image = image.replace('(', '\\(')
    image = image.replace(')', '\\)')
    image_des = '\n\n> [{0}]({1})'.format("**^pic**", image) if image else None  
   
    parser = PlaintextParser(text, Tokenizer(language)) 
    word_count = len(text.split())
    compression = 100
    extra_words = 0
            
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)        
    short = []
    line = str()
    
    if word_count >= 600:
        length = length + int(log(word_count/600))
    for sentence in summarizer(parser.document, length):
        if str(sentence).strip().lower() in meta.lower():
            extra_words = len(str(sentence).split())
            continue
        line = '>• {0}'.format(sentence)
        line = line.replace("`", "\'")
        line = line.replace("#", "\#")
        short.append(line)
       
    extract = '\n\n'.join(short)
    extract = extract + image_des if image_des else extract
    meta = meta.replace('#', '\\#')
    if len(meta) > 400:
        lpoint = meta.rfind('.', 0, 400)
        if lpoint == -1:
            meta = meta[:(meta.rfind(' ', 0, 400))] + '...'
        else:
            meta = meta[:(meta.rfind('.', 0, 400))] + '...'
              
    try:
        compression = int(((extract.count(' ')+extra_words)/word_count)*100)
    except Exception as numerror:
        print(numerror)
    print('  from {0} words to {1} words ({2}%)'.format(word_count, len(extract.split()), compression))
    return (meta, extract, compression, e)
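A minimal call of summary() above (the URL, the sentence count and the language are illustrative; newspaper's Article, sumy and the Readability fallback are assumed to be imported as the function expects):

meta, extract, compression, err = summary('https://example.com/some-long-article', 5, 'english')
print(meta)
print(extract)
print('compression: {}%'.format(compression))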
Example #10
def crawl():
    print(datetime.now())
    cursor = conn.cursor()
    cursor.execute("SET NAMES utf8mb4")
    cursor.execute('select id, name, feedUrl, lang, form from sources')
    sources = cursor.fetchall()
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    for source in sources:
        # if source['id']%30 == datetime.now().minute%30:
        print(source[0])
        source = {
            'id': source[0],
            'name': source[1],
            'feedUrl': source[2].replace("39.105.127.55", "127.0.0.1"),
            'lang': source[3],
            'form': source[4]
        }
        print(source['name'])
        LANGUAGE = 'chinese'
        if source['lang'] == 2:
            LANGUAGE = 'english'
        items = feedparser.parse(source['feedUrl'])['items']
        for item in items:
            try:
                cursor.execute('select 1 from entries where link = %s limit 1',
                               (item['link'], ))
                results = cursor.fetchall()
                if (not results) or (len(results) == 0):
                    try:
                        entry = {
                            'title':
                            item['title'],
                            'link':
                            item['link'],
                            'source_id':
                            source['id'],
                            'source_name':
                            source['name'],
                            'time':
                            '',
                            'crawl_time':
                            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            'photo':
                            '',
                            'lang':
                            1,
                            'author':
                            '',
                            'description':
                            '',
                            'digest':
                            '',
                            'content':
                            '',
                            'cluster':
                            0,
                            'sim_count':
                            0,
                            'simhash':
                            '0',
                            'cate11':
                            '',
                            'cate12':
                            '',
                            'cate13':
                            '',
                            'cate21':
                            '',
                            'cate22':
                            '',
                            'cate23':
                            '',
                            'tag1':
                            '',
                            'tag2':
                            '',
                            'tag3':
                            '',
                            'tag4':
                            '',
                            'tag5':
                            '',
                            'video':
                            '',
                            'video_frame':
                            '',
                            'audio':
                            '',
                            'audio_frame':
                            ''
                        }
                        cate1 = ['', '', '']
                        cate2 = ['', '', '']
                        tag = ['', '', '', '', '']
                        ############ Additional settings for special sources ##############
                        if entry['source_name'] == 'Hacker News':
                            entry['link'] = item['comments']
                        ###########################

                        if is_en(entry['title']):
                            entry['lang'] = 2
                        if 'published_parsed' in item:
                            try:
                                entry['time'] = datetime.fromtimestamp(
                                    mktime(
                                        item['published_parsed'])) + timedelta(
                                            hours=TZ_DELTA)
                            except Exception as e:
                                entry['time'] = entry['crawl_time']
                                print('Exception when published_parsed: {}'.
                                      format(e))
                        else:
                            entry['time'] = entry['crawl_time']

                        if 'author' in item:
                            entry['author'] = item['author'][0:20]

                        if 'summary' in item:
                            entry['description'] = item['summary'][0:500]

                        if 'content' in item:
                            entry['content'] = item['content'][0]['value'][
                                0:15000]
                        if entry['content'] == '' and 'summary' in item and len(
                                item['summary']) > 0:
                            entry['content'] = item['summary'][0:15000]
                        for field in item['links']:
                            if field['type'] == 'audio/mpeg':
                                if field['href'].endswith('.mp3'):
                                    entry['audio'] = field['href']
                                if field['href'].endswith('.mp4'):
                                    entry['video'] = field['href']

                        # Only article-type entries get summarization, clustering, categorization and tagging
                        if source['form'] == 1:
                            try:
                                if entry['content'] != '':
                                    entry['photo'] = getImg(entry['content'])
                                    if len(entry['photo']) > 255:
                                        entry['photo'] = ''

                                    parser = HtmlParser.from_string(
                                        entry['content'], "",
                                        Tokenizer(LANGUAGE))
                                    stemmer = Stemmer(LANGUAGE)
                                    summarizer = Summarizer(stemmer)
                                    summarizer.stop_words = get_stop_words(
                                        LANGUAGE)
                                    for sentence in summarizer(
                                            parser.document, SENTENCES_COUNT):
                                        entry['digest'] += str(sentence)
                                        if len(entry['digest']) >= 500:
                                            break
                                else:
                                    parser = HtmlParser.from_url(
                                        entry['link'], Tokenizer(LANGUAGE))
                                    stemmer = Stemmer(LANGUAGE)
                                    summarizer = Summarizer(stemmer)
                                    summarizer.stop_words = get_stop_words(
                                        LANGUAGE)
                                    for sentence in summarizer(
                                            parser.document, SENTENCES_COUNT):
                                        entry['digest'] += str(sentence)
                                        if len(entry['digest']) >= 500:
                                            break
                                entry['digest'] = entry['digest'][0:500]
                            except Exception as e:
                                print(
                                    'Exception when getting digest: {}'.format(
                                        e))

                            features = get_features(entry['title'],
                                                    entry['content'])
                            try:
                                entry['simhash'] = str(Simhash(features).value)
                                nears = index.get_near_dups(Simhash(features))
                                if len(nears) > 0:
                                    entry['sim_count'] = len(nears)
                                    cursor.execute(
                                        'select cluster from entries where id = %s',
                                        (int(nears[0]), ))
                                    near_cluster = cursor.fetchone()[0]
                                    entry['cluster'] = near_cluster
                                else:
                                    global last_cluster_num
                                    entry['cluster'] = last_cluster_num
                                    last_cluster_num += 1
                            except Exception as e:
                                print(
                                    'Exception when clustering: {}'.format(e))

                            try:
                                content2 = BeautifulSoup(
                                    entry['content'], "lxml").text.encode(
                                        'gbk', 'ignore').decode(
                                            'gbk')[0:AIP_MAX_LEN_CONTENT]
                                if len(content2) == 0:
                                    if len(entry['digest']) > 0:
                                        content2 = entry['digest']
                                title2 = entry['title'][0:AIP_MAX_LEN_TITLE]
                                keywords = client.keyword(title2, content2)
                                topics = client.topic(title2, content2)
                                i = 0
                                for item in topics['item']['lv1_tag_list']:
                                    cate1[i] = item['tag']
                                    i += 1
                                    if i > 2:
                                        break
                                i = 0
                                for item in topics['item']['lv2_tag_list']:
                                    cate2[i] = item['tag']
                                    i += 1
                                    if i > 2:
                                        break
                                i = 0
                                for item in keywords['items']:
                                    tag[i] = item['tag']
                                    i += 1
                                    if i > 4:
                                        break
                                entry['cate11'] = cate1[0]
                                entry['cate12'] = cate1[1]
                                entry['cate13'] = cate1[2]
                                entry['cate21'] = cate2[0]
                                entry['cate22'] = cate2[1]
                                entry['cate23'] = cate2[2]
                                entry['tag1'] = tag[0]
                                entry['tag2'] = tag[1]
                                entry['tag3'] = tag[2]
                                entry['tag4'] = tag[3]
                                entry['tag5'] = tag[4]
                            except Exception as e:
                                print(
                                    'Exception when categorizing and tagging: {}'
                                    .format(e))

                        elif source['form'] == 2:
                            entry['photo'] = getWeiboImg(entry['content'])
                            entry['digest'] = filterWeiboTags(entry['content'])
                            if len(entry['digest']) > 500:
                                entry['digest'] = entry['digest'][0:500]

                        elif source['form'] == 4:
                            if entry['link'].startswith(
                                    'https://www.bilibili.com/video'):
                                entry['video_frame'] = 'http://player.bilibili.com/player.html?aid=' + \
                                    entry['link'][33:]

                        try:
                            cursor.execute(add_entry, entry)
                            conn.commit()
                            index.add(str(cursor.lastrowid), Simhash(features))
                        except Exception as e:
                            print('Exception when add entry: {}'.format(e))
                    except Exception as e:
                        print("Unexpected Error: {}".format(e))
            except Exception as e:
                print("Unexpected Error: {}".format(e))
        # print(d['feed']['title'])
    elapsed = time.perf_counter() - start
    print('time used: ' + str(elapsed))

    # Close the Cursor and the Connection:
    cursor.close()
Example #11
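A plausible import header for this script (a sketch: the summarizer aliases mirror how they are used below, while BookmarkToPageMap and the convert() PDF-to-text helper are project-specific and not shown):

import getopt
import os
import shutil
import sys
import time

import cutie
import PyPDF2

from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.kl import KLSummarizer as KLsum
from sumy.summarizers.lex_rank import LexRankSummarizer as LexRank
from sumy.summarizers.lsa import LsaSummarizer as Lsa
from sumy.summarizers.luhn import LuhnSummarizer as Luhn
from sumy.summarizers.sum_basic import SumBasicSummarizer as SumBasic
from sumy.summarizers.text_rank import TextRankSummarizer as TextRank
from sumy.utils import get_stop_words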
def main(argv):
    # Read the input-file and output-directory parameters
    try:
        opts, args = getopt.getopt(argv, "i:o:",
                                   ["inputFile=", "outputDirectory="])
    except getopt.GetoptError:
        print('main.py -i <inputFile> -o <outputDirectory>')
        sys.exit(2)

    if (len(opts) != 2):
        print('main.py -i <inputFile> -o <outputDirectory>')
        sys.exit(2)

    PDF_SummaryDir = ''
    sourcePDFFile = ''

    for opt, arg in opts:
        if opt == '-h':
            print('main.py -i <inputFile> -o <outputDirectory>')
            sys.exit()
        elif opt in ("-i", "--inputFile"):
            sourcePDFFile = arg
            if os.path.exists(sourcePDFFile):
                print('[+] Archivo PDF encontrado')
        elif opt in ("-o", "--outputDirectory"):
            PDF_SummaryDir = arg
            #Check if the directory PDF_summary exists or not
            if not os.path.exists(PDF_SummaryDir):
                os.makedirs(PDF_SummaryDir)
                print('[+] Directorio creado')

    #Set parameters
    languages = ['spanish', 'english']
    print('Seleccionar lenguaje')
    LANGUAGE = languages[cutie.select(languages)]
    print('[+] Lenguaje seleccionado')
    SENTENCES_COUNT = 30

    algoritmos = ['Luhn', 'Lsa', 'LexRank', 'TextRank', 'SumBasic', 'KLsum']
    print('Seleccionar algoritmo')
    chooseAlgo = algoritmos[cutie.select(algoritmos)]

    #create directories for output files
    outputPDFDir = os.path.dirname(PDF_SummaryDir + '/pdf/pdf_split_files/')
    if not os.path.exists(outputPDFDir):
        os.makedirs(PDF_SummaryDir + '/pdf/pdf_split_files/')

    outputTXTDir = os.path.dirname(PDF_SummaryDir + '/Text_Files/')
    if not os.path.exists(outputTXTDir):
        os.makedirs(PDF_SummaryDir + '/Text_Files/')

    outputSummaryDir = os.path.dirname(PDF_SummaryDir + '/Summary/')
    if not os.path.exists(outputSummaryDir):
        os.makedirs(PDF_SummaryDir + '/Summary/')

    #Name prefix for split files
    outputNamePrefix = 'Split_Chapter_'
    timeSuffixSummary = str(time.strftime("%d-%m-%Y_%H.%M.%S"))
    targetPDFFile = 'temppdfsplitfile.pdf'  # Temporary file

    # Append a trailing slash to the pdf output dir if necessary
    if not outputPDFDir.endswith('/'):
        outputPDFDir = outputPDFDir + '/'

    # Append a trailing slash to the txt output dir if necessary
    if not outputTXTDir.endswith('/'):
        outputTXTDir = outputTXTDir + '/'

    # Append a trailing slash to the summary output dir if necessary
    if not outputSummaryDir.endswith('/'):
        outputSummaryDir = outputSummaryDir + '/'

    #Check and Verify if PDF is ready for splitting
    while not os.path.exists(sourcePDFFile):
        print('Source PDF not found, sleeping...')
        #Sleep
        time.sleep(10)

    if os.path.exists(sourcePDFFile):
        #print('Found source PDF file')
        #Copy file to local working directory
        shutil.copy(sourcePDFFile, targetPDFFile)

        #Process file
        #Create object and Open File in Read Binary Mode
        pdfFileObj2 = open(targetPDFFile, 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj2)
        pdfFileObj = BookmarkToPageMap(pdfFileObj2)

        #Get total pages
        numberOfPages = pdfReader.numPages

        i = 0
        newPageNum = 0
        prevPageNum = 0
        newPageName = ''
        prevPageName = ''

        for p, t in sorted([
            (v, k) for k, v in pdfFileObj.getDestinationPageNumbers().items()
        ]):
            template = '%-5s  %s'
            #   To Check Page number and Title of the Chapter Uncomment the following lines
            ##  print (template % ('Page', 'Title'))
            ##  print (template % (p+1,t))

            newPageNum = p + 1
            newPageName = t

            if prevPageNum == 0 and prevPageName == '':
                #  First Page
                prevPageNum = newPageNum
                prevPageName = newPageName
            else:
                # Next Page
                pdfWriter = PyPDF2.PdfFileWriter()
                page_idx = 0
                for i in range(prevPageNum, newPageNum):
                    pdfPage = pdfReader.getPage(i - 1)
                    pdfWriter.insertPage(pdfPage, page_idx)
                    #   Check : print('Added page to PDF file: ' + prevPageName + ' - Page #: ' + str(i))
                    page_idx += 1

            #   Creating names of split files
                pdfFileName = str(outputNamePrefix + prevPageName +
                                  '.pdf').replace(':', '_').replace('*', '_')
                txtFileName = str(outputNamePrefix + prevPageName +
                                  '.txt').replace(':', '_').replace('*', '_')

                #   Writing each chapter to the .pdf file
                pdfOutputFile = open(outputPDFDir + pdfFileName, 'wb')
                pdfWriter.write(pdfOutputFile)
                pdfOutputFile.close()

                #   Check : print('Created PDF file: ' + outputPDFDir + pdfFileName)

                #   Calling convert function and writing each chapter to the .txt file
                txtOutputFile = open(outputTXTDir + txtFileName, 'w')
                txtOutputFile.write(convert(outputPDFDir + pdfFileName))
                txtOutputFile.close()
                #   Check :print('Created TXT file: ' + outputTXTDir + txtFileName)

                #   for plain text files create Summary
                parser = PlaintextParser.from_file(outputTXTDir + txtFileName,
                                                   Tokenizer(LANGUAGE))
                stemmer = Stemmer(LANGUAGE)
                #   Using LsaSummarizer to create summary
                ##  Select from different algorithms to create summary by using different algorithms
                if chooseAlgo == 'Lsa':
                    summarizer = Lsa(stemmer)
                elif chooseAlgo == 'LexRank':
                    summarizer = LexRank(stemmer)
                elif chooseAlgo == 'TextRank':
                    summarizer = TextRank(stemmer)
                elif chooseAlgo == 'Luhn':
                    summarizer = Luhn(stemmer)
                elif chooseAlgo == 'SumBasic':
                    summarizer = SumBasic(stemmer)
                elif chooseAlgo == 'KLsum':
                    summarizer = KLsum(stemmer)
                else:
                    print('Wrong Algorithm selected.')
                    sys.exit(0)

                summarizer.stop_words = get_stop_words(LANGUAGE)
                #   Open file in append mode so that summary will be added at the bottom of file
                summaryOutputFile = open(
                    outputSummaryDir + chooseAlgo + '_Summary_File' +
                    timeSuffixSummary + '.txt', 'a')
                for sentence in summarizer(parser.document, SENTENCES_COUNT):
                    #   Check : print (sentence)
                    summaryOutputFile.write(str(sentence))

            #   To create Separation between Chapters
                summaryOutputFile.write(
                    str('\n\n' + 'Title : ' + t + '\n' + '\t'))
                summaryOutputFile.close()

            i = prevPageNum
            prevPageNum = newPageNum
            prevPageName = newPageName

        # Split the last page
        pdfWriter = PyPDF2.PdfFileWriter()
        page_idx = 0
        for i in range(prevPageNum, numberOfPages + 1):
            pdfPage = pdfReader.getPage(i - 1)
            pdfWriter.insertPage(pdfPage, page_idx)
            #   Check : print('Added page to PDF file: ' + prevPageName + ' - Page #: ' + str(i))
            page_idx += 1

        pdfFileObj2.close()
        print('[+] Archivo creado: ' + outputSummaryDir + chooseAlgo +
              '_Summary_File' + timeSuffixSummary + '.txt')

    # Delete temp file
    os.unlink(targetPDFFile)
Example #12
def analyze(parser):
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
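A minimal call of analyze() above, assuming module-level constants and the same sumy imports used in Example #4 (the URL is illustrative):

from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer

LANGUAGE = 'english'
SENTENCES_COUNT = 10

parser = HtmlParser.from_url('https://en.wikipedia.org/wiki/Automatic_summarization',
                             Tokenizer(LANGUAGE))
analyze(parser)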
Example #13
# instantiate Slack client
slack_client = SlackClient(os.environ.get('SLACK_BOT_TOKEN'))
# starterbot's user ID in Slack: value is assigned after the bot starts up
starterbot_id = None

# constants
RTM_READ_DELAY = 1  # 1 second delay between reading from RTM
EXAMPLE_COMMAND = "do"
MENTION_REGEX = "^<@(|[WU].+?)>(.*)"
ARXIV_REGEX = r'(https?://[^\s]+[0-9]+)'
USER_HANDLE = 'danfei'  # your username on slack

# for sumy
lang = 'english'
tknz = Tokenizer(lang)
stemmer = Stemmer(lang)
summarizer = Summarizer(stemmer)


def summarize(string, num_sentence=3):
    """
    Summarize a sentence with sumy
    """
    parser = PlaintextParser(string, tknz)
    summarizer.stop_words = get_stop_words(lang)  # was set on the parser, which has no effect
    summ_string = ''
    for sentence in summarizer(parser.document, num_sentence):
        summ_string += str(sentence) + ' '
    return summ_string
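A quick call of the summarize() helper above (the input text is illustrative):

abstract = ('We study extractive summarization of research papers. '
            'Sentences are scored and the highest-ranked ones are kept. '
            'The result preserves the gist while dropping the detail.')
print(summarize(abstract, num_sentence=1))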

Example #14
def process_data(text):
    text_data = unidecode.unidecode(text)
    clean_list, pure_list = prepare_for_regex(text_data)

    data_to_summarize = []
    for clean, pure in zip(clean_list, pure_list):
        if re.findall(clause, clean):
            data_to_summarize.append(pure)
    text_data = " ".join(data_to_summarize)
    parser = PlaintextParser(text_data, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = TextRankSummarizer(stemmer)

    summary = summarizer(parser.document, SENTENCES_COUNT)
    sentences = []
    for sentence in summary:
        skip = False
        for punct in ["[", "]", "{", "}", "=", "+", "_", "|", "<", ">", "^"]:
            if punct in str(sentence):
                skip = True
        if skip:
            continue
        if str(sentence)[-1] == "." and len(str(sentence)) < 500:
            try:
                int(str(sentence)[0])
                sentence = str(sentence)[1:].strip()
            except:
                sentence = str(sentence).strip()
            sent = nlp(sentence)
            sentence = ""
            entities = []
            for token in sent:
                if sentence and token.text.strip() not in string.punctuation:
                    sentence += " "
                if token.text.lower() in VERBS:
                    sentence += '''<mark class="entity" style="background: #ffffb3; padding: 0.2em 0.2em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone">{}</mark>'''.format(
                        token.text)
                    entities.append(token.text)
                else:
                    sentence += token.text
            sentence = sentence[:-1] + ". "
            for ent in sent.ents:
                if ent.text == "IP":
                    continue
                if ent.text not in entities and ent.label_ == "ORG" or ent.text.lower(
                ) == "stripe":
                    sentence = sentence.replace(
                        ent.text,
                        '''<mark class="entity" style="background: #ccffcc; padding: 0.2em 0.2em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone">{}</mark> '''
                        .format(ent.text))
                    entities.append(ent.text)
            for term in KEYWORDS:
                if term in entities:
                    continue
                case_insensitive = re.compile(re.escape(term), re.IGNORECASE)
                sentence = case_insensitive.sub(
                    '''<mark class="entity" style="background: #b3d9ff; padding: 0.2em 0.2em;; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone">{}</mark>'''
                    .format(term), sentence)
            sentences.append({
                "text": sentence,
                "rating": get_sentiment(sentence)
            })
    readability_score, complexity_score = get_readability(text_data)
    return jsonify({
        "summary_sentences": sentences,
        "readability_score": readability_score,
        "complexity_score": complexity_score
    })
Example #15
def tool(request, tool_name):
    tool = get_object_or_404(models.Tool, url_endpoint__iexact=tool_name)
    context = {'tool': tool}
    category = tool.category.replace(" ", "_").lower()
    print(tool.template_name, "   template   ", sep="    ")
    print('tools/{0}/{1}'.format(category, tool.template_name))
    if (request.method == 'POST' and tool_name == "resume_builder"):
        pdf = FPDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=5)
        name = request.POST.get('asliname')
        pdf.set_top_margin(margin=5)
        pdf.set_font('Arial', 'B', 35)
        pdf.multi_cell(w=0, h=13, txt=name.strip().rstrip("\n\r"), align='C')
        address = request.POST.get('address')
        email = request.POST.get('email')
        phone = request.POST.get('phone')
        about = request.POST.get('about')
        pdf.set_font('Times', '', 16)
        pdf.multi_cell(w=0, h=8, txt=address.strip().rstrip("\n\r"), align='C')
        pdf.multi_cell(w=0, h=8, txt=email.strip().rstrip("\n\r"), align='C')
        pdf.multi_cell(w=0, h=8, txt=phone.strip().rstrip("\n\r"), align='C')
        pdf.multi_cell(w=0, h=10, txt=about.strip().rstrip("\n\r"), align='C')

        #pdf.line(5,55,205,55)
        pdf.multi_cell(w=0, h=9, txt=" ", align='C')

        #All Correctly Retrieving
        pdf.set_font('Arial', 'B', 20)
        pdf.multi_cell(w=0, h=10, txt="Profile", align='L')
        profile = request.POST.get('totalprofile')
        profile = int(profile)  #Total
        pro1 = request.POST.get('temp')
        pdf.set_font('Times', '', 15)
        #if ( type(pro1) == 'str' ):
        pdf.multi_cell(w=0, h=7, txt=pro1.strip().rstrip("\n\r"), align='L')
        #print(pro1," PROFILE ",sep="  ")
        lineheight = 81
        for i in range(1, profile):
            temp = "temp" + str(i)
            rest = request.POST.get(temp)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            #print(rest," PROFILE ",sep="  ")

        #pdf.line(5,lineheight,205,lineheight)
        pdf.multi_cell(w=0, h=9, txt=" ", align='C')

        #All Correctly Retrieving
        pdf.set_font('Arial', 'B', 20)
        pdf.multi_cell(w=0, h=10, txt="Work Experience", align='L')
        work = request.POST.get('totalwork')
        work = int(work)  #Total
        com1 = request.POST.get("companyplate2")
        pos1 = request.POST.get("positionplate2")
        dur1 = request.POST.get("durationplate2")
        wor1 = request.POST.get("workdoneplate2")
        #if ( type(com1) == 'str' ):
        pdf.set_font('Times', 'B', 15)
        pdf.multi_cell(w=0, h=7, txt=com1.strip().rstrip("\n\r"), align='L')
        pdf.set_font('Times', 'B', 15)
        pdf.multi_cell(w=0, h=7, txt=pos1.strip().rstrip("\n\r"), align='L')
        pdf.set_font('Times', '', 15)
        pdf.multi_cell(w=0, h=7, txt=dur1.strip().rstrip("\n\r"), align='L')
        pdf.multi_cell(w=0, h=7, txt=wor1.strip().rstrip("\n\r"), align='L')
        #print(com1,pos1,dur1,wor1,sep="   ")
        lineheight = lineheight + 47
        pdf.multi_cell(w=0, h=3, txt=" ", align='C')
        for i in range(1, work):
            comp = "companyplate2" + str(i)
            rest1 = request.POST.get(comp)
            pos = "positionplate2" + str(i)
            rest2 = request.POST.get(pos)
            dur = "durationplate2" + str(i)
            rest3 = request.POST.get(dur)
            wor = "workdoneplate2" + str(i)
            rest4 = request.POST.get(wor)
            pdf.set_font('Times', 'B', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest1.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.set_font('Times', 'B', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest2.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.set_font('Times', '', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest3.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest4.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')
            #print(rest1,rest2,rest3,rest4,sep="   ")

        #pdf.line(5,lineheight,205,lineheight)
        pdf.multi_cell(w=0, h=9, txt=" ", align='C')

        #All Correctly Retrieving
        pdf.set_font('Arial', 'B', 20)
        pdf.multi_cell(w=0, h=10, txt="Education", align='L')
        edu = request.POST.get('totaledu')
        edu = int(edu)  #Total
        ini1 = request.POST.get("institutiontemplate3")
        score1 = request.POST.get("scoretemplate3")
        edudur1 = request.POST.get("duration2template3")
        #if ( type(ini1) == 'str' ):
        pdf.set_font('Times', 'B', 15)
        pdf.multi_cell(w=0, h=7, txt=ini1.strip().rstrip("\n\r"), align='L')
        pdf.set_font('Times', 'B', 15)
        pdf.multi_cell(w=0, h=7, txt=edudur1.strip().rstrip("\n\r"), align='L')
        pdf.set_font('Times', '', 15)
        pdf.multi_cell(w=0, h=7, txt=score1.strip().rstrip("\n\r"), align='L')
        #print(ini1,score1,edudur1,sep="   ")
        lineheight = lineheight + 40
        pdf.multi_cell(w=0, h=3, txt=" ", align='C')
        for i in range(1, edu):
            ini = "institutiontemplate3" + str(i)
            rest1 = request.POST.get(ini)
            score = "scoretemplate3" + str(i)
            rest2 = request.POST.get(score)
            edudur = "duration2template3" + str(i)
            rest3 = request.POST.get(edudur)
            pdf.set_font('Times', 'B', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest1.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.set_font('Times', 'B', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest3.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.set_font('Times', '', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest2.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')
            #print(rest1,rest2,rest3,sep="   ")

        #pdf.line(5,lineheight,205,lineheight)
        pdf.multi_cell(w=0, h=9, txt=" ", align='C')

        #All Correctly Retrieving
        pdf.set_font('Arial', 'B', 20)
        pdf.multi_cell(w=0, h=10, txt="Projects", align='L')
        proj = request.POST.get('totalproj')
        proj = int(proj)  #Total
        project1 = request.POST.get("projectlate2")
        tech1 = request.POST.get("techlate2")
        projdur1 = request.POST.get("projectdurationlate2")
        projwor1 = request.POST.get("projectdonelate2")
        #if ( proj >= 1 ):
        pdf.set_font('Times', 'B', 15)
        pdf.multi_cell(w=0,
                       h=7,
                       txt=project1.strip().rstrip("\n\r"),
                       align='L')
        pdf.set_font('Times', 'B', 15)
        pdf.multi_cell(w=0, h=7, txt=tech1.strip().rstrip("\n\r"), align='L')
        pdf.set_font('Times', '', 15)
        pdf.multi_cell(w=0,
                       h=7,
                       txt=projdur1.strip().rstrip("\n\r"),
                       align='L')
        pdf.multi_cell(w=0,
                       h=7,
                       txt=projwor1.strip().rstrip("\n\r"),
                       align='L')
        lineheight = lineheight + 47
        pdf.multi_cell(w=0, h=3, txt=" ", align='C')
        #print(project1,tech1,projdur1,projwor1,sep="   ")
        for i in range(1, proj):
            project = "projectlate2" + str(i)
            rest1 = request.POST.get(project)
            tech = "techlate2" + str(i)
            rest2 = request.POST.get(tech)
            projdur = "projectdurationlate2" + str(i)
            rest3 = request.POST.get(projdur)
            projwor = "projectdonelate2" + str(i)
            rest4 = request.POST.get(projwor)
            pdf.set_font('Times', 'B', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest1.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.set_font('Times', 'B', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest2.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.set_font('Times', '', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest3.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest4.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')
            #print(rest1,rest2,rest3,rest4,sep="   ")

        #pdf.line(5,lineheight,205,lineheight)
        pdf.multi_cell(w=0, h=9, txt=" ", align='C')

        #All Correctly Retrieving
        pdf.set_font('Arial', 'B', 20)
        pdf.multi_cell(w=0, h=10, txt="Skills", align='L')
        skill = request.POST.get('totalskill')
        skill = int(skill)  #Total
        skill1 = request.POST.get("skillkill")
        diffi1 = request.POST.get("kill")
        print(skill)
        if (diffi1 == 'Intermediate' or diffi1 == 'Advance'
                or diffi1 == 'Beginner'):
            pdf.set_font('Times', 'B', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=skill1.strip().rstrip("\n\r"),
                           align='L')
            pdf.set_font('Times', '', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=diffi1.strip().rstrip("\n\r"),
                           align='L')
        lineheight = lineheight + 33
        pdf.multi_cell(w=0, h=3, txt=" ", align='C')
        #print(skill1,diffi1,sep="   ")
        for i in range(1, skill):
            skill = "skillkill" + str(i)
            rest1 = request.POST.get(skill)
            diffi = "kill" + str(i)
            rest2 = request.POST.get(diffi)
            pdf.set_font('Times', 'B', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest1.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.set_font('Times', '', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest2.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')
            #print(rest1,rest2,sep="   ")

        #pdf.line(5,lineheight,205,lineheight)
        pdf.multi_cell(w=0, h=9, txt=" ", align='C')

        #All Correctly Retrieving
        pdf.set_font('Arial', 'B', 20)
        pdf.multi_cell(w=0, h=10, txt="Awards", align='L')
        award = request.POST.get('totalaward')
        award = int(award)  #Total
        achieve1 = request.POST.get("achievementward")
        awarddone1 = request.POST.get("awarddoneward")
        #if ( type(achieve1) == 'str' ):
        pdf.set_font('Times', 'B', 15)
        pdf.multi_cell(w=0,
                       h=7,
                       txt=achieve1.strip().rstrip("\n\r"),
                       align='L')
        pdf.set_font('Times', '', 15)
        pdf.multi_cell(w=0,
                       h=7,
                       txt=awarddone1.strip().rstrip("\n\r"),
                       align='L')
        lineheight = lineheight + 33
        pdf.multi_cell(w=0, h=3, txt=" ", align='C')
        #print(achieve1,awarddone1,sep="   ")
        for i in range(1, award):
            achieve = "achievementward" + str(i)
            rest1 = request.POST.get(achieve)
            awarddone = "awarddoneward" + str(i)
            rest2 = request.POST.get(awarddone)
            pdf.set_font('Times', 'B', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest1.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.set_font('Times', '', 15)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest2.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')
            #print(rest1,rest2,sep="   ")

        #pdf.line(5,lineheight,205,lineheight)
        pdf.multi_cell(w=0, h=9, txt=" ", align='C')

        #All Correctly Retrieving
        pdf.set_font('Arial', 'B', 20)
        pdf.multi_cell(w=0, h=10, txt="Interests", align='L')
        inter = request.POST.get('totalinter')
        inter = int(inter)  #Total
        interest1 = request.POST.get("inter")
        pdf.set_font('Times', '', 15)
        #if ( type(interest1) == 'str' ):
        pdf.multi_cell(w=0,
                       h=7,
                       txt=interest1.strip().rstrip("\n\r"),
                       align='L')
        lineheight = lineheight + 26
        pdf.multi_cell(w=0, h=3, txt=" ", align='C')
        #print(interest1,sep="   ")
        for i in range(1, inter):
            interest = "rest" + str(i)
            rest1 = request.POST.get(interest)
            pdf.multi_cell(w=0,
                           h=7,
                           txt=rest1.strip().rstrip("\n\r"),
                           align='L')
            lineheight = lineheight + 7
            pdf.multi_cell(w=0, h=3, txt=" ", align='C')
            #print(rest1,sep="   ")

        dirspot = os.getcwd()
        print(dirspot + " DIRECTORY DEKHO ")
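        # Write the finished resume to resume.pdf in the current working
        # directory and stream it back below; note that os.getcwd() depends on
        # where the Django process was started.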
        pdf.output(dirspot + '/resume.pdf', 'F')
        filename = dirspot + '/resume.pdf'
        #return FileResponse(as_attachment=True, filename=dirspot+'/resume.pdf')
        return FileResponse(open(filename, 'rb'),
                            content_type='application/pdf')
    if (request.method == "POST" and tool_name == "text_summary"):
        print("   POST DEKHO ", tool_name, sep="  ")
        inp = request.POST.get('input')
        aslinp = inp
        lang = request.POST.get('Languages')
        algo = request.POST.get('Algorithm')
        percen = request.POST.get('percentage')
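        # `percen` is passed straight through to the summarizer below; sumy
        # accepts either an integer sentence count or a percentage string such
        # as "20%" there, so the form value is assumed to already be in one of
        # those forms.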
        parser = PlaintextParser.from_string(inp, Tokenizer(lang))
        stemmer = Stemmer(lang)
        if (algo == "Edmundson"):
            summarizer = Summarizer0(stemmer)
            summarizer.stop_words = get_stop_words(lang)
        elif (algo == "Latent Semantic Analysis"):
            summarizer = Summarizer1(stemmer)
            summarizer.stop_words = get_stop_words(lang)
        elif (algo == "LexRank"):
            summarizer = Summarizer2(stemmer)
            summarizer.stop_words = get_stop_words(lang)
        elif (algo == "TextRank"):
            summarizer = Summarizer3(stemmer)
            summarizer.stop_words = get_stop_words(lang)
        elif (algo == "Luhn"):
            summarizer = Summarizer4(stemmer)
            summarizer.stop_words = get_stop_words(lang)
        elif (algo == "SumBasic"):
            summarizer = Summarizer5(stemmer)
            summarizer.stop_words = get_stop_words(lang)
        elif (algo == "KL-Sum"):
            summarizer = Summarizer6(stemmer)
            summarizer.stop_words = get_stop_words(lang)
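        # Summarizer0..Summarizer6 appear to be aliases for sumy's Edmundson,
        # LSA, LexRank, TextRank, Luhn, SumBasic and KL-Sum summarizers
        # imported elsewhere in this module. Two caveats: if `algo` matches
        # none of the names above, `summarizer` is never bound and the call
        # below raises NameError; and sumy's Edmundson summarizer normally
        # needs bonus_words/stigma_words/null_words (see Beispiel #18 further
        # down) rather than stop_words. A minimal dispatch sketch under the
        # same assumptions:
        #
        #     SUMMARIZERS = {
        #         "Edmundson": Summarizer0,
        #         "Latent Semantic Analysis": Summarizer1,
        #         "LexRank": Summarizer2,
        #         "TextRank": Summarizer3,
        #         "Luhn": Summarizer4,
        #         "SumBasic": Summarizer5,
        #         "KL-Sum": Summarizer6,
        #     }
        #     summarizer = SUMMARIZERS.get(algo, Summarizer1)(stemmer)  # fall back to LSA
        #     summarizer.stop_words = get_stop_words(lang)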
        summary = []
        for sentence in summarizer(parser.document, percen):
            summary.append(sentence)
        result = ' '.join(str(v) for v in summary)  # join with spaces so sentences don't run together
        print(result)
        context = {'tool': tool, 'summary': result, 'asliinp': aslinp}
        return render(request, 'tools/text_tools/text_summary.html', context)
    else:
        return render(request, 'tools/{0}/{1}'.format(category,
                                                      tool.template_name),
                      context)
Beispiel #16
0
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import hebrew_tokenizer as ht
import requests
import string
# for removing punctuations from text
translator = str.maketrans("", "", string.punctuation)

# Increase this value for a longer summary; decrease it for a shorter one
SENTENCES_COUNT = 3

if __name__ == "__main__":
    # url in hebrew language
    url = "https://he.wikipedia.org/wiki/%D7%93%D7%95%D7%A0%D7%9C%D7%93_%D7%98%D7%A8%D7%90%D7%9E%D7%A4"
    parser = HtmlParser.from_url(url, None)

    stemmer = Stemmer("Hebrew")
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words("Hebrew")

    # The output file must be opened with utf-8 encoding for Hebrew text
    file = open("output.txt", "w", encoding='utf-8')
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # printing the text from the sentence object
        print(sentence._text)
        # writing the result into the file as well
        file.write(sentence._text)
    file.close()
Beispiel #17
0
def text_summary():
    def set_stopwords(stopwords):
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Keep only non-stop words with a candidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences

    def get_vocab(sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab

    def get_token_pairs(window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i + 1, i + window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(a):
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1

        # Make the matrix symmetric
        g = symmetrize(g)
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(
            g, norm, where=norm != 0)  # skip zero columns in norm to avoid division by zero
        return g_norm

    def analyze(text,
                candidate_pos=['NOUN', 'PROPN', 'VERB'],
                window_size=4,
                lower=False,
                stopwords=list(),
                number=10):
        """Main function to analyze text"""

        # Set stop words
        set_stopwords(stopwords)
        # Parse the text with spaCy
        doc = nlp(text)
        # Filter sentences
        sentences = sentence_segment(doc, candidate_pos,
                                     lower)  # list of list of words
        # Build vocabulary
        vocab = get_vocab(sentences)
        # Get token_pairs from windows
        token_pairs = get_token_pairs(window_size, sentences)
        # Get normalized matrix
        g = get_matrix(vocab, token_pairs)
        # Initialization of node weights (PageRank values)
        pr = np.array([1] * len(vocab))
        # Iteration
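        # d (damping factor), steps and min_diff are assumed to be module-level
        # constants defined outside this snippet; typical TextRank settings
        # would be d = 0.85, steps = 10 and min_diff = 1e-5.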
        previous_pr = 0
        for epoch in range(steps):
            pr = (1 - d) + d * np.dot(g, pr)
            if abs(previous_pr - sum(pr)) < min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]

        node_weight = OrderedDict(
            sorted(node_weight.items(), key=lambda t: t[1], reverse=True))
        keyword = []
        for i, (key, value) in enumerate(node_weight.items()):
            keyword.append(key)
            #print(key + ' - ' + str(value))
            if i > number:
                break
        return keyword

    def command_detected(sentence):
        # Detects whether a given String sentence is a command or action-item
        tagged_sentence = pos_tag(word_tokenize(sentence))
        first_word = tagged_sentence[0]
        pos_first = first_word[1]
        first_word = first_word[0].lower()
        for word in prohibited_command_words:
            if word in sentence:
                return False
        for word in command_words:
            if word in sentence:
                return True
        # Checks whether the first word is a verb (VB/VBZ/VBP) that is not a gerund
        if (pos_first == "VB" or pos_first == "VBZ"
                or pos_first == "VBP") and first_word[-3:] != "ing":
            return True
        return False

    def retrieve_action_items():
        # Returns a list of the sentences containing action items.
        action_items = []
        for sentence in tokenized_transcript:
            possible_command = command_detected(str(sentence))
            if possible_command is True:
                action_items += [(str(sentence))]
        return action_items

    text = request.json
    text = text['data'].replace('Speaker ', '')
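    # The regex below appears to strip the "<speaker number> <mm:ss>" residue
    # left after removing the "Speaker " prefix, and the second sub collapses
    # runs of whitespace.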
    source = re.sub(r'\d\s+\d{1,2}\:\d{2}', '', text)
    source = re.sub(r'\s+', ' ', source)

    Keywords = analyze(source,
                       candidate_pos=['NOUN', 'PROPN', 'VERB'],
                       window_size=4,
                       lower=False)

    tokenized_transcript = sent_tokenize(source)
    LANGUAGE = "English"
    parser = PlaintextParser.from_string(source, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
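    # The summary length is set to roughly 5% of the transcript's sentence
    # count; sumy truncates the float to a whole number of sentences.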
    summary = summarizer(parser.document, len(tokenized_transcript) * 0.05)
    transcript_summary = []
    for sentence in summary:
        transcript_summary.append(str(sentence))

    command_words = [
        "can you", "would you", "can we", "you should", "we should",
        "we need to", "you need to", "ensure", "make sure", "make it",
        "we want to", "we must", "you must", "you have to", "we have to"
        "homework"
    ]
    prohibited_command_words = ["Let me", "?"]
    Action_item = retrieve_action_items()

    result = {
        "keywords :": Keywords,
        'Summary :': transcript_summary,
        'Action Items :': Action_item
    }
    return jsonify(result)
Beispiel #18
0
def summary(article_url):
    url = article_url
    #url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/potato"
    # url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/cabbage"
    # url = "http://www.encyclopedia.com/medicine/diseases-and-conditions/pathology/accident"
    # url = "http://www.encyclopedia.com/earth-and-environment/atmosphere-and-weather/atmospheric-and-space-sciences-atmosphere/air"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
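    # LANGUAGE and SENTENCES_COUNT are assumed to be module-level constants,
    # and calc_value / max_r_value are assumed to be helpers defined elsewhere
    # that compute and compare the ROUGE-N scores mentioned below.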

    # create a list of reference sentences to calculate ROUGE_N scores
    ref_sentences = []
    trim_ref_sentences = []
    for paragraph in parser._article.main_text:
        for sections in paragraph:
            for sentences in sections:
                try:
                    if len(sentences) > 35:
                        # trim off super short - likely a few word sentences
                        ref_sentences.append(sentences)
                except TypeError:
                    # catch type errors caused by annotated text ie h1, b, etc
                    print("typeError")
                    continue
    trim_ref_sentences.extend(
        Sentence(s, Tokenizer(LANGUAGE)) for s in ref_sentences)

    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    # define summarizers for the summarizing methods being used
    summarizer_Lsa = Lsa(stemmer)
    summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
    summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

    summarizer_LexRank = LexRank()
    summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

    summarizer_Edmundson = Edmundson(stemmer)
    summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
    summarizer_Edmundson.bonus_words = parser.significant_words
    summarizer_Edmundson.stigma_words = parser.stigma_words
    summary_Edmundson = summarizer_Edmundson(parser.document, SENTENCES_COUNT)

    # print summaries
    summary_Lsa_trim = []
    for sentence in summary_Lsa:
        # trim off super short - likely a few word sentences
        if len(sentence._text) > 20:
            print(sentence)
            summary_Lsa_trim.append(sentence)

    # calc rouge_n scores
    calc_value(summary_Lsa_trim, trim_ref_sentences)

    print('\n')
    summary_LexRank_trim = []
    for sentence in summary_LexRank:
        # trim off super short - likely a few word sentences
        if len(sentence._text) > 20:
            print(sentence)
            summary_LexRank_trim.append(sentence)

    # calc rouge_n scores
    calc_value(summary_LexRank_trim, trim_ref_sentences)

    print('\n')
    summary_Edmundson_trim = []
    for sentence in summary_Edmundson:
        # trim off super short - likely a few word sentences
        if len(sentence._text) > 20:
            print(sentence)
            summary_Edmundson_trim.append(sentence)

    # calc rouge_n scores
    calc_value(summary_Edmundson_trim, trim_ref_sentences)

    # returns index of max 0=Ed, 1=Lsa, 2=Lex
    models = {0: "Edmundson Model", 1: "Lsa Model", 2: "LexRank Model"}
    best_summary = max_r_value(summary_Lsa_trim, summary_LexRank_trim,
                               summary_Edmundson_trim, trim_ref_sentences)
    print(
        models.get(best_summary) +
        ' is the best model according to an average of the Rouge_3, 2 and 1 tests'
    )

    #return the summary of the best model
    if (best_summary == 0):
        return summary_Edmundson_trim
    elif (best_summary == 1):
        return summary_Lsa_trim
    elif (best_summary == 2):
        return summary_LexRank_trim
Beispiel #19
0
    def test_empty_document(self):
        document = build_document()
        summarizer = TextRankSummarizer(Stemmer("english"))

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 0)
Beispiel #20
0
 def test_english_stemmer(self):
     english_stemmer = Stemmer('english')
     self.assertEqual("beauti", english_stemmer("beautiful"))
Beispiel #21
0
    with open(reference_filename) as fref:
        refs = fref.read()
        extractive_references = ' '.join(sent_tokenize(refs)[0:5])
        abstractive_references = ' '.join(sent_tokenize(refs)[-4:])
        ex_reference_document = PlaintextParser.from_string(
            extractive_references, Tokenizer('english'))
        abs_reference_document = PlaintextParser.from_string(
            abstractive_references, Tokenizer('english'))
        ex_reference_sentences = ex_reference_document.document.sentences
        abs_reference_sentences = abs_reference_document.document.sentences

    # Read input file to be summarized
    comm_parser = PlaintextParser.from_file(input_filename,
                                            Tokenizer('english'))
    stemmer = Stemmer('english')

    # Get list of sentences in the original commentary
    orig_text = comm_parser.document.sentences

    # Open output file for writing
    fout = open(output_filename, 'w')

    # Make baseline summary
    baseline_summary = baseline(input_filename)
    fout.write("BASELINE: \n")
    fout.write(baseline_summary + "\n\n")
    print("Summarizing using Algorithm: Baseline \n")

    base_summary_sentences = PlaintextParser.from_string(
        baseline_summary, Tokenizer('english')).document.sentences
Beispiel #22
0
 def test_german_stemmer(self):
     german_stemmer = Stemmer('german')
     self.assertEqual("sterb", german_stemmer("sterben"))
Beispiel #23
0
                            fo.write(' negative_score: ' + str(negative_score))
                            fo.write('\n')

                            featureObs[featCount].summaryOp.append(line)

                            if (positive_score >= negative_score):
                                fw.write(line.rstrip() + ' -> Positive')
                                featureObs[featCount].positiveOp.append(line)
                            else:
                                fw.write(line.rstrip() + ' -> Negative')
                                featureObs[featCount].negativeOp.append(line)
                            fw.write('\n')
                featCount += 1
                fp.seek(0)

stemmer = Stemmer("english")
summarizer = Summarizer(stemmer)

with open('Feature_Review.txt', 'w') as fw:

    for i in range(100):
        fw.write('\n\n\n*****************************\n')
        fw.write('FEATURE: ' + featureObs[i].featureName +
                 ' POSITIVE OPINIONS: ' + str(len(featureObs[i].positiveOp)) +
                 ' NEGATIVE OPINIONS: ' + str(len(featureObs[i].negativeOp)))
        fw.write('\n*****************************\n')
        # fw.write('\n*******SUMMARY REVIEW*******\n')
        # parser = PlaintextParser(''.join(featureObs[i].summaryOp),Tokenizer("english"))
        # for sentence in summarizer(parser.document,50) :
        # 	fw.write(str(sentence))
        fw.write('\n*******POSITIVE REVIEWS*******\n')
Beispiel #24
0
 def test_czech_stemmer(self):
     czech_stemmer = Stemmer('czech')
     self.assertEqual("pěkn", czech_stemmer("pěkný"))
Beispiel #25
0
def gaz(type_df, time, cut, many):
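    # spacy, pandas (pd), nltk's stopwords and wordpunct_tokenize, and
    # sklearn's MinMaxScaler are assumed to be imported at module level; only
    # the sumy imports appear inside this function.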
    nlp = spacy.load('en')

    if cut == "True":
        type_df = type_df[type_df["Review Date"] > time]
    else:
        type_df = type_df[type_df["Review Date"] < time]

    sample_review = ""
    for i in type_df["review"]:
        sample_review = sample_review + " " + str(i)

    # print(sample_review)

    len(sample_review)

    sample_review = sample_review.replace("\\", "")

    #### Summary:

    ### Summaries
    import sumy

    from sumy.summarizers.lex_rank import LexRankSummarizer
    from sumy.summarizers.text_rank import TextRankSummarizer

    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    lexi = LexRankSummarizer(Stemmer("english"))
    texi = TextRankSummarizer(Stemmer("english"))

    parser = PlaintextParser.from_string(sample_review, Tokenizer("english"))

    rentence = "dddd"
    for sentence in texi(parser.document, 10):  # This does indeed summarise the document
        if (str(rentence).split()[len(str(rentence).split()) - 1][-1] == ".") and (len(rentence) > 2):
            rentence = rentence + " " + str(sentence)
        elif len(rentence) < 3:
            rentence = rentence + " " + str(sentence)
        else:
            rentence = rentence + ". " + str(sentence)

    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", '?', '!', '! !', ':', ';', '(', ')', '[', ']', '{',
                       '}'])  # remove it if you need punctuation

    list_of_words = [i.lower() for i in wordpunct_tokenize(sample_review) if i.lower() not in stop_words]

    final = ' '.join(list_of_words)

    from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer(r'\w+')
    list_of_words = tokenizer.tokenize(final)
    final = ' '.join(list_of_words)

    parsed_review = nlp(final)

    # print(parsed_review)

    token_text = [token.orth_ for token in parsed_review]
    token_pos = [token.pos_ for token in parsed_review]

    df = pd.DataFrame({'token_text': token_text, 'part_of_speech': token_pos})

    # Unigrams
    import nltk
    from nltk import word_tokenize
    from nltk.util import ngrams
    from collections import Counter

    token = nltk.word_tokenize(str(parsed_review))
    grams = ngrams(token, many)

    dra = Counter(grams)

    t = pd.DataFrame()

    f = pd.DataFrame(list(dra.keys()))

    if many == 2:
        f[0] = f[0] + " " + f[1]

    if many == 3:
        f[0] = f[0] + " " + f[1] + " " + f[2]

    f = f[0]

    t["name"] = f
    t["count"] = list(dra.values())

    df = df.drop_duplicates()
    r = pd.merge(t, df, left_on=["name"], right_on=["token_text"], how="left", right_index=False)
    r = r.drop("token_text", axis=1)
    r.columns = ["name", "count", "pos"]

    scaler = MinMaxScaler()
    r["norm"] = scaler.fit_transform(r["count"].values.reshape(-1, 1))

    if many == 1:
        dfs = r[r["pos"] == "NOUN"].sort_values("count", ascending=False)
    else:
        dfs = r.sort_values("count", ascending=False)

    return dfs, rentence
Beispiel #26
0
 def test_french_stemmer(self):
     french_stemmer = Stemmer('french')
     self.assertEqual("jol", french_stemmer("jolies"))
Beispiel #27
0
    map(lambda x: os.path.join("../data/reviews", x), in_files))

test_input = "hahahahahahahahahaha this is the most funny film I have ever seen"

# In[10]:

file_data = None

with open(in_file, 'r') as f:
    file_data = f.read()

# In[11]:

parser = PlaintextParser.from_file(in_file, Tokenizer(LANGUAGE))

summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words('slovak')

helper = _summarizer.AbstractSummarizer()

# In[36]:

explanator = anchor_text.AnchorText(nlp, ['negative', 'positive'],
                                    use_unk_distribution=True)

# In[13]:

# define a decorator to log execution time
# inspired by https://medium.com/pythonhive/python-decorator-to-measure-the-execution-time-of-methods-fa04cb6bb36d

Beispiel #28
0
 def test_slovak_stemmer(self):
     expected = Stemmer("czech")
     actual = Stemmer("slovak")
     self.assertEqual(type(actual), type(expected))
     self.assertEqual(expected.__dict__, actual.__dict__)
Beispiel #29
0
def stem(word, LANGUAGE = "portuguese"):
    stemmer = Stemmer(LANGUAGE)
    return stemmer(to_unicode(word).lower())
Beispiel #30
0
def data_pre_train_mongo( data_path='data/data.json',train_path='data/train_db.txt' ):
    """
    from=0   # id of the first article
    limit=10 # number of articles to return
    >>>data_pre_train_mongo(from=0, limit=10)
    [unused5] marks the keywords
      [unused6]  marks the title
    [unused7]  marks the preceding title
       [unused8]  marks the body text
    """
    LANGUAGE = "chinese"
    SENTENCES_COUNT = 10
    article_max_len=500
    ttext=tkitText.Text()

    jieba.load_userdict('dict.txt')
    jieba.analyse.set_stop_words('stopwords.txt')

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # ie=tkitNlp.TripleIE(model_path="/mnt/data/dev/model/ltp/ltp_data_v3.4.0")
    f1 = open(train_path,'w')
    # articles=[]
    tt=tkitText.Text()
    # 引入TF-IDF关键词抽取接口
    tfidf = analyse.extract_tags
    # 引入TextRank关键词抽取接口
    textrank = analyse.textrank
    # Set up the MongoDB connection here
    client = pymongo.MongoClient("localhost", 27017)
    DB_kg_scrapy = client.kg_scrapy
    print(DB_kg_scrapy.name)
    q={}
    # print('q',q)
    tclass = classify(model_name_or_path='tkitfiles/check_pet',num_labels=10,device='cuda')
    Ner=get_ner()
    # nlp=Nlp()
    i=0
    # for item in DB_kg_scrapy.kg_content.find(q):
    tjson=tkitFile.Json(file_path=data_path)
    for item in tqdm(tjson.auto_load()):
        i=i+1
        if i%10000==0:
            print(i)
        # print(item)
        if len(item['content'])>500:
            SENTENCES_COUNT = 5
        else:
            SENTENCES_COUNT = 3
        parser = PlaintextParser.from_string(item['content'], Tokenizer(LANGUAGE))
        l=[]
        words_list=[]
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            l.append(str(sentence))
            # ner_list=Ner.pre(str(sentence))
            # for it in ner_list[0][1]:
            #     words_list.append(it.get("words"))
        # keywords = textrank(item['title']+'\n'+item['content'], topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) 
        keywords = textrank(item['title']+'\n'+item['content'], topK=10, withWeight=False,) 
        keyphrases =tt.get_keyphrases(item['title']+'\n'+item['content'])

        # print("==="*20)
        # print("",item['title'])
        # print(item['content'][:100])
        p=tclass.pre(item['content'])
        # print("预测结果",p)
        # softmax=tclass.softmax()
        # print(softmax)
        # sentences=tt.sentence_segmentation_v1( item['title']+'。'+item['content'])
        # words_list=[]
        # for sentence in sentences:
        #     ner_list=Ner.pre(sentence)
        #     for it in ner_list[0][1]:
        #         words_list.append(it.get("words"))
        # # print(words_list)
        # keywords=keywords+keyphrases+words_list
        keywords=keywords+keyphrases
        keywords=list(set(keywords))
        # print(ner_list)
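        # Assemble one training sample: [KW]...[/KW] keywords, [TT]...[/TT]
        # title, [SM]...[/SM] extractive summary, [CONTNET]...[/CONTNET] body
        # (marker spelling kept as in the original), [PT]...[/PT] preceding
        # title, [END] terminator; newlines are replaced with [SEP] and only
        # samples the classifier labels as class 1 are written out.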
        content=" [KW] "+",".join(keywords)+" [/KW]  [TT] "+ item['title']+" [/TT] [SM] "+"".join(l)+" [/SM] [CONTNET] "+item['content']+" [/CONTNET] [PT] "+ item['title']+" [/PT] [END]"
        content=content.replace("\n\n\n", "\n\n")
        content=content.replace("\n", " [SEP] ")
        # print(content[:100])
        # content_list=cut_text(content,480)
        # for it in content_list:
        #     print("++++"*20)
        #     print(it)
        # f1.write("\n".join(content_list)+"")
        if p==1:
            f1.write(content)
            f1.write("\n")