Example #1
def node_page():
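    # Flask view: look up a scraped article by id in MongoDB, build a sumy summary,
    # score the candidate titles with get_rank(), and render everything into node.html.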
    nid = request.args.get('id')
    KDB = client.kg_scrapy
    items = KDB.kg_content.find_one({'_id': nid})
    if items is None:
        return "没有内容"  # "no content"
    else:

        LANGUAGE = "chinese"
        SENTENCES_COUNT = 10
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        if len(items['content']) > 500:
            SENTENCES_COUNT = 5
        else:
            SENTENCES_COUNT = 3
        parser = PlaintextParser.from_string(items['content'],
                                             Tokenizer(LANGUAGE))
        summary = []

        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary.append(str(sentence))
        titles = []
        titles_p = DB.pre_titles.find({"parent": items['_id']})
        for item in titles_p:
            irank, grade, softmax = get_rank(item['title'])
            # print(irank,grade,softmax)
            # print((items[i]))
            item['rank'] = irank
            item['softmax'] = softmax
            item['grade'] = grade
            titles.append(item)

        return render_template("node.html", **locals())
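Most of the examples on this page follow the same sumy recipe: parse the text, create a stemmer, load stop words, then ask the summarizer for N sentences. A minimal self-contained sketch of that pattern (the LSA summarizer and the sample text are illustrative choices, not taken from any single example):

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 3
text = "Put the long document to be summarized here ..."

parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
summarizer = LsaSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)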
Example #2
def summarize(url=None, LANGUAGE='English', SENTENCES_COUNT=2):
    try:
        # fetching/parsing the URL is what can actually fail, so guard it here
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    except Exception:
        print('\n\n Invalid entry! Please ensure you enter a valid web link \n\n')
        sys.stdout.flush()
        return '\n\n Invalid entry! Please ensure you enter a valid web link \n\n'

    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    result = ''
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        result = result + ' ' + str(sentence)

    print('\n\n' + str(url) + '\n\n' + str(result))
    sys.stdout.flush()
    return result
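A hypothetical call to the summarize() helper above (the URL is only an example; any readable article page should work):

if __name__ == '__main__':
    print(summarize(url='https://en.wikipedia.org/wiki/Automatic_summarization',
                    LANGUAGE='english',
                    SENTENCES_COUNT=3))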
Example #3
    def top_sentences(
        self, sentences: List[Sentence], num_sentences: int = 5, useLemma: bool = True
    ) -> List[Sentence]:

        text_sents = [s.text for s in sentences]
        lemma_sents = [l.lemmatized_text for l in sentences]

        ranked_sentences = []

        # pprint(sentences)

        summarizer = Summarizer()

        if not useLemma:
            doc = "\n".join(text_sents)

            # TODO custom sumy DocumentParser ?
            parser = PlaintextParser.from_string(doc, Tokenizer(self.language))

            summary = summarizer(parser.document, num_sentences)

            for summary_sentence in summary:
                for original in sentences:
                    if str(summary_sentence).strip() == original.text.strip():
                        ranked_sentences.append(original)
                        break
        else:
            # spaCy and sumy use different tokenizers, so they may sentencize slightly
            # differently. The lemmatized versions therefore may not match 1:1; unless
            # we write a sumy Parser implementation backed by the spaCy tokens/sentences,
            # we need a fuzzy match to find the original sentence whose lemma matches
            # the summary sentence.

            doc = "\n".join(lemma_sents)
            parser = PlaintextParser.from_string(doc, Tokenizer(self.language))
            summary = summarizer(parser.document, num_sentences)

            for lemma_sentence in summary:
                for original in sentences:
                    summary_lemma = str(lemma_sentence).strip()
                    original_lemma = original.lemmatized_text.strip()
                    similarity = fuzz.ratio(summary_lemma, original_lemma)
                    # pprint(similarity)
                    if similarity >= 80:
                        ranked_sentences.append(original)
                        break

        if len(ranked_sentences) < num_sentences:
            log.warning(
                f"Only {len(ranked_sentences)} of {num_sentences} sentences ranked. "
                "Text too short, or mismatching sentencizers (sumy/spaCy)."
            )

        return ranked_sentences
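The lemmatized branch above accepts a sentence when fuzz.ratio scores at least 80. A standalone illustration of that check, assuming the fuzzywuzzy (or API-compatible rapidfuzz) fuzz module imported elsewhere in the original module:

from fuzzywuzzy import fuzz

summary_lemma = "fuzzy logic deal with reasoning that be approximate"
original_lemma = "fuzzy logic deal with reasoning that be approximate rather than fixed"

# ratio() returns a similarity score on a 0-100 scale; >= 80 counts as a match above
print(fuzz.ratio(summary_lemma, original_lemma))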
Example #4
def summary(TEXT, LANGUAGE, SENTENCES_COUNT):
    parser = PlaintextParser.from_string(TEXT, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    resume = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        resume += str(sentence)
    return resume
Example #5
def get_lexrank(tweets):
    sens = [Sentence(t, TwokenizeWrapper()) for t in tweets]
    tweet_document = ObjectDocumentModel([Paragraph(sens)])
    LANGUAGE = "english"
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    SENTENCES_COUNT = len(sens)
    lex_ranked = summarizer(tweet_document, SENTENCES_COUNT)
    if len(sens) != len(lex_ranked):
        print('lr error')
    return [lex_ranked[s] for s in sens]
Example #6
    def __init__(self, **kwargs):
        super(SummarizationBaseline, self).__init__(**kwargs)
        self.spot_field = EVENT_SPOT_FIELD

        self.evaluator = SalienceEva(**kwargs)

        lang = 'english'
        stemmer = Stemmer(lang)
        self.summarizer = Summarizer(stemmer)
        self.summarizer.stop_words = get_stop_words(lang)

        self.h_event_id = pickle.load(open(self.event_id_pickle_in))
Example #7
def summarize_text(text,
                   sentences_count=3,
                   language=settings.DEFAULT_LANGUAGE,
                   as_list=False):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    sentences = [
        unicode(sentence)
        for sentence in summarizer(parser.document, sentences_count)
    ]
    return sentences if as_list else '\n'.join(sentences)
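Example #7 targets Python 2 (note the unicode() call). Under Python 3 an equivalent sketch would simply use str(); the settings.DEFAULT_LANGUAGE default is replaced by a plain "english" here only to keep the sketch self-contained:

def summarize_text_py3(text, sentences_count=3, language="english", as_list=False):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    sentences = [str(sentence)
                 for sentence in summarizer(parser.document, sentences_count)]
    return sentences if as_list else '\n'.join(sentences)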
Example #8
    def summarize_from_file(self, file_name):

        parser = PlaintextParser.from_file(file_name, Tokenizer(self.LANGUAGE))
        stemmer = Stemmer(self.LANGUAGE)
        summarizer = Summarizer(stemmer)
        file_1 = open("summarizer_output.txt", "w+")
        file_2 = open("summarizer_output2.txt", "w+")
        for sentence in summarizer(parser.document, self.SENTENCES_COUNT):
            file_2.write(str(sentence))
            file_1.write(str(sentence))
            file_1.write("\n")
        file_1.close()
        file_2.close()
Example #9
def runsumy(method, num, ip_file_path, op_file_path):
    parser = PlaintextParser.from_file(ip_file_path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # f = codecs.open(op_file_path, 'w', 'utf-8')
    s = ""
    for word in summarizer(parser.document, int(num)):
        s += word._text.encode('utf-8').decode('utf-8')
        # print(word._text.encode('utf-8'), file=f) # not outputing in the designated file
    return s
Example #10
def summarize_sumy(doc):
    summary = ""
    file_doc = open("temp.txt", "w", encoding='utf-8')
    file_doc.write(doc)
    file_doc.close()

    parser = PlaintextParser.from_file("temp.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary += str(sentence) + ' '

    return summary
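The temporary file in Example #10 is not strictly needed; a sketch of an equivalent variant that parses the string directly (assuming the same module-level LANGUAGE and SENTENCES_COUNT):

def summarize_sumy_from_string(doc):
    parser = PlaintextParser.from_string(doc, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return ' '.join(str(sentence)
                    for sentence in summarizer(parser.document, SENTENCES_COUNT))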
Example #11
def summarize_file(file_name):
	#url = "http://www.zsstritezuct.estranky.cz/clanky/predmety/cteni/jak-naucit-dite-spravne-cist.html"
	#parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
	# or for plain text files
	parser = PlaintextParser.from_file(file_name, Tokenizer(LANGUAGE))
	stemmer = Stemmer(LANGUAGE)

	summarizer = Summarizer(stemmer)
	summarizer.stop_words = get_stop_words(LANGUAGE)
	
	sentences = summarizer(parser.document, SENTENCES_COUNT)
	list_sentences = []
	for sentence in sentences:
		list_sentences.append(str(sentence))
	return list_sentences
Example #12
def load():
    LANGUAGE = "chinese"
    SENTENCES_COUNT = 5
    # article_max_len=500
    tt=tkitText.Text()
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    jieba.load_userdict('dict.txt')
    jieba.analyse.set_stop_words('stopwords.txt')
    textrank = jieba.analyse.textrank

    w2v=tkitW2vec.Word2vec()
    w2v.load(model_file=Word2vec_model)
Example #13
def summarize():
    SENTENCES_COUNT = numOfSent.get()
    parser = PlaintextParser.from_file(fileName.cget("text"),
                                       Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    output_path = "C://Users//rakesh chandra//Desktop//ATS//output.txt"
    # write and close the summary before asking the OS to open it
    with open(output_path, 'w') as outputFile:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
            outputFile.write("->  ")
            outputFile.write(str(sentence))
            outputFile.write("\n \n")
    os.startfile(fileName.cget("text"))
    os.startfile(output_path)
Example #14
def summarize(urls, num_sentences):
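    # Pools the parsed documents of several URLs into one collection and summarizes them together.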
    documents = DocumentCollection()
    lang = "english"
    summary = ""

    for url in urls:
        try:
            parser = HtmlParser.from_url(url, Tokenizer(lang))
            documents.add(parser.document)
        except Exception:
            print("HTTP ERROR @ " + url)

    stemmer = Stemmer(lang)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)
    for sentence in summarizer(documents, num_sentences):
        summary += unicode(sentence) + " "
    return summary
Example #15
    def scrape(self, url):
        complete_url = url
        try:
            # get summary
            print "Retrieving page summary of %s... " % url

            parser = HtmlParser.from_url(complete_url, Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)

            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)

            url_summary = ''.join(
                str(sentence)
                for sentence in summarizer(parser.document, SENTENCES_COUNT))

        except Exception, e:
            url_summary = "Could not scrape summary. Reason: %s" % e.message
Example #16
    def summarizeSections(self):
        for section, paragraphs in self.sections.items():
            # Set summary length of section to be proportional to complete
            # length of section
            print(section)
            summaryLength = round(
                self.lengths[section] / self.getTotalLength() * self.summaryLength)
            print(summaryLength)
            doc = '  '.join(paragraphs)
            parser = PlaintextParser.from_string(doc, Tokenizer('english'))
            stemmer = Stemmer('english')

            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words('english')
            summ = summarizer(parser.document, summaryLength)

            self.sections[section] = [str(sentence) for sentence in summ]
        print('done with summary')
Example #17
    def prosess_content(self, url):
        article = Article(url)

        article.download()
        # article.html
        article.parse()

        dbthings = db_things.DBThings()
        parser = Parser()

        if article.authors:
            self.authors = ','.join(map(str, article.authors))
        if article.keywords:
            self.keywords = ','.join(map(str, article.keywords))

        publish_date = articleDateExtractor.extractArticlePublishedDate(
            article.url)
        # time.sleep(5)

        parser = HtmlParser.from_url(url, Tokenizer('english'))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer('english')

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words('english')

        all_sentences = ''

        for sentence in summarizer(parser.document, 10):
            all_sentences += (sentence._text + '\n')

        # TODO: Pay for better license to speed up this process
        # time.sleep(80)
        # classifier = Classifier()
        # category = classifier.classify_news(data=all_sentences)
        category = 'General'

        if publish_date is not None:
            dbthings.insert_extracted(self.authors, str(publish_date),
                                      all_sentences.encode('utf-8', 'ignore'),
                                      article.top_image, self.keywords,
                                      article.url, article.title, category)
        return
Example #18
    def __init__(self, method=None, nltk_directory=None, language=None):
        if (language):
            logger.info("Setting language to " + language)
            LANGUAGE = language
        else:
            LANGUAGE = "english"
        # Set the location of the nltk data directory for tokenizers, etc.
        if nltk_directory:
            nltk.data.path.append(nltk_directory)
            logger.info(nltk.data.path)
        try:
            self.stemmer = Stemmer(LANGUAGE)
        except Exception:
            logger.exception("Error loading nltk stemmer")
            raise Exception("Error loading nltk stemmer")

        self.summarizer = Summarizer(self.stemmer)  # default
        if method:
            if (method == 'luhn'):
                logger.info("Using the Luhn summarizer!")
                self.summarizer = LuhnSummarizer(self.stemmer)
            elif (method == 'edmundson'):
                logger.info("Using the Edmundson summarizer!")
                self.summarizer = EdmundsonSummarizer(self.stemmer)
            elif (method == 'lsa'):
                logger.info("Using the LSA summarizer!")
                self.summarizer = LsaSummarizer(self.stemmer)
            elif (method == 'text_rank'):
                logger.info("Using the Text Rank summarizer!")
                self.summarizer = TextRankSummarizer(self.stemmer)
            elif (method == 'sum_basic'):
                logger.info("Using the Sum Basic summarizer!")
                self.summarizer = SumBasicSummarizer(self.stemmer)
            elif (method == 'kl'):
                logger.info("Using the KL summarizer!")
                self.summarizer = KLSummarizer(self.stemmer)
            elif (method == 'lex_rank'):
                logger.info("Using the LexRank summarizer!")
                self.summarizer = LexRankSummarizer(self.stemmer)
        #print(method)
        self.summarizer.stop_words = get_stop_words(LANGUAGE)
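Inside __init__, the if/elif chain of Example #18 could also be written as a lookup table; a hedged sketch of that alternative using the same sumy summarizer classes (falling back to the default Summarizer for unknown method names):

        method_map = {
            'luhn': LuhnSummarizer,
            'edmundson': EdmundsonSummarizer,
            'lsa': LsaSummarizer,
            'text_rank': TextRankSummarizer,
            'sum_basic': SumBasicSummarizer,
            'kl': KLSummarizer,
            'lex_rank': LexRankSummarizer,
        }
        self.summarizer = method_map.get(method, Summarizer)(self.stemmer)
        self.summarizer.stop_words = get_stop_words(LANGUAGE)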
Example #19
def main():
    LANGUAGE = "english"
    SENTENCES_COUNT = 2
    stop = set(stopwords.words('english'))

    #retrieve each of the articles
    articles = os.listdir("../data/articles")
    count = 0
    for article in articles:
        stdout.write("\rProgress: {:02.0f}%".format(
            float(count) / len(articles) * 100))
        stdout.flush()
        # print 'Reading articles/' + article
        # articleFile = io.open('articles/' + article, 'r')
        parser = PlaintextParser.from_file(
            os.path.abspath(os.path.join("../data/articles", article)),
            Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        summary = ""
        file_name = os.path.splitext(article)[0].split('.')[0]
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary += str(sentence)

        summary_tokens = [
            token.lower().translate(None, punctuation)
            for token in word_tokenize(summary) if token not in punctuation
            and token.lower() not in stop and token != "'s"
        ]

        with open(os.path.join("results", file_name + ".txt"),
                  "w") as keywords_file:
            keywords_file.write('\n'.join(set(summary_tokens)))

        count += 1

    print "\nDone..."
Example #20
def summary(request):
    keyW = None
    results = None
    form = summary_form(request.POST or None)
    if form.is_valid():
        SUMMARIZE = form.cleaned_data['SUMMARIZE']
        LANGUAGE = form.cleaned_data['LANGUAGE']
        SENTENCES_COUNT = form.cleaned_data['SENTENCES_COUNT']
        url = SUMMARIZE

        try:
            article = Article(url)
            article.download()
            article.parse()
            article.nlp()
            keyW = article.keywords
            parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
            # or for plain text files
            # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)

            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)
            results = summarizer(parser.document, SENTENCES_COUNT)
            for sentence in results:
                print(sentence)

        except Exception:
            keyW = 'Invalid Entry'
            results = '-'

    context = {'form': form, 'keywords': keyW, 'results': results}
    template = 'index.html'
    return render(request, template, context)
Example #21
def auto_summarize_comment(request):
    
    from sumy.nlp.stemmers import Stemmer
    #from sumy.utils import get_stop_words
    from sumy.parsers.html import HtmlParser
    from sumy.nlp.tokenizers import Tokenizer
    #from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    #from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
    from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
         
    stemmer = Stemmer("english")
    summarizer = Summarizer(stemmer)
    
    comment_ids = request.POST.getlist('d_ids[]')
    
    sent_list = []
    
    for comment_id in comment_ids:
        comment = Comment.objects.get(id=comment_id)
        text = comment.text
        
        text = re.sub('<br>', ' ', text, flags=re.IGNORECASE)
        
        parser = HtmlParser.from_string(text, '', Tokenizer("english"))
        
        num_sents = request.GET.get('num_sents', None)
        if not num_sents:
            all_sents = parser.tokenize_sentences(text)
            num_sents = floor(float(len(all_sents))/3.0)
        
        sents = summarizer(parser.document, num_sents)
         
        
        for sent in sents:
            sent_list.append(sent._text)
     
    return JsonResponse({"sents": sent_list})
Example #22
def sumy_test(url):
    summary = open("summary_list.txt", "a", encoding='utf-8-sig')
    original_stdout = sys.stdout  # remember the real stdout so it can be restored
    sys.stdout = summary

    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    print("Sumy_FiveLine:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT - 5):
        print(sentence)
    print("\n")

    print("Sumy_TenLine:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        print("")
    print("\n")

    summary.close()
    sys.stdout = original_stdout  # restore stdout, which was pointing at the now-closed file
Example #23
def testSummarize():
    txt = '''
        "As complexity rises , precise statements lose meaning and meaningful statements lose precision . ( Albert Einstein ) .", 
        "Fuzzy logic deals with reasoning that is approximate rather than fixed and exact . This may make the reasoning more meaningful for a human :", 
        "", 
        "", 
        "I 've written a short introduction to fuzzy logic that goes into a bit more details but should be very accessible .", 
        "Fuzzy logic seems to have multiple of applications historically in Automotive Engineering .", 
        "I found an interesting article on the subject from 1997 . This excerpt provides an interesting rationale :", 
        "Here are some papers and patents for automatic transmission control in motor vehicles . One of them is fairly recent :", 
        "Automatic Transmission Shift Schedule Control Using Fuzzy Logic SOURCE : Society of Automotive Engineers , 1993", 
        "Fuzzy Logic in Automatic Transmission Control SOURCE : International Journal of Vehicle Mechanics and Mobility , 2007", 
        "Fuzzy control system for automatic transmission | Patent | 1987", 
        "Transmission control with a fuzzy logic controller | Patent | 1992", 
        "", 
        "Likewise with fuzzy logic anti-lock breaking systems ( ABS ) :", 
        "Antilock-Braking System and Vehicle Speed Estimation using Fuzzy Logic SOURCE : FuzzyTECH , 1996", 
        "Fuzzy Logic Anti-Lock Break System SOURCE : International Journal of Scientific & Engineering Research , 2012", 
        "Fuzzy controller for anti-skid brake systems | Patent | 1993", 
        "", 
        "This method seems to have been extended to aviation :", 
        "A Fuzzy Logic Control Synthesis for an Airplane Antilock-Breaking System SOURCE : Proceedings of the Romanian Academy , 2004", 
        "Landing gear method and apparatus for braking and maneuvering | Patent | 2003", 
        ""
    '''
    texts = []
    for p in txt.split("\n"):
        texts.append("<p>" + p + "</p>")
    txt = " ".join(texts)
    from sumy.parsers.html import HtmlParser
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    #from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words
    #from sumy.summarizers.kl import KLSummarizer as Summarizer
    from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
    from post_rec.Utility.TextPreprocessing import PreprocessPostContent
    from textblob import TextBlob
    LANGUAGE = "english"

    pros = PreprocessPostContent()
    #url = "https://github.com/miso-belica/sumy"
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    texts = pros.getPlainTxt(txt)
    #print(TextBlob(txt).sentences)
    print(len(texts))
    [print("#p=>", p) for p in texts]
    SENTENCES_COUNT = len(texts)

    document = []
    for i in range(len(texts)):
        document.append(texts[i])
        document.append("")
    document = "\n".join(document)
    print(document)
    parser = PlaintextParser.from_string(document, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
Example #24
            requests.get(url)
            doable = True
        except Exception as x:
            print(x)
            doable = False
            t1 = time.time()
            print('Took', t1 - t0, 'seconds')

        if doable and requests.head(url).status_code == 200:
            parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
            if len(parser.document.sentences) != 0:
                # or for plain text files
                # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
                stemmer = Stemmer(LANGUAGE)

                summarizer = Summarizer(stemmer)
                summarizer.stop_words = get_stop_words(LANGUAGE)
                sen = []

                for sentence in summarizer(parser.document, SENTENCES_COUNT):
                    sen.append(str(sentence))

                text = '.'.join(sen)
                text = text.replace(u'\n', '')  # str.replace returns a new string; the original discarded the result
                obj = {}
                obj['ID'] = str(i)
                obj['TextRank'] = text

                #drop_falsey = lambda path, key, value: bool(value)
                #clean = remap(obj, visit=drop_falsey)
Example #25
def data_pre_train_mongo(data_path='data/data.json', train_path='data/train_db.txt'):
    """
    from=0   # starting article id
    limit=10 # number of articles to return
    >>> data_pre_train(from=0, limit=10)
    [unused5] marks keywords
    [unused6] marks the title
    [unused7] marks the preceding title
    [unused8] marks the body text
    """
    LANGUAGE = "chinese"
    SENTENCES_COUNT = 10
    article_max_len=500
    ttext=tkitText.Text()

    jieba.load_userdict('dict.txt')
    jieba.analyse.set_stop_words('stopwords.txt')

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # ie=tkitNlp.TripleIE(model_path="/mnt/data/dev/model/ltp/ltp_data_v3.4.0")
    f1 = open(train_path,'w')
    # articles=[]
    tt=tkitText.Text()
    # TF-IDF keyword-extraction interface
    tfidf = analyse.extract_tags
    # TextRank keyword-extraction interface
    textrank = analyse.textrank
    # Mongo connection is set up here
    client = pymongo.MongoClient("localhost", 27017)
    DB_kg_scrapy = client.kg_scrapy
    print(DB.name)
    q={}
    # print('q',q)
    tclass = classify(model_name_or_path='tkitfiles/check_pet',num_labels=10,device='cuda')
    Ner=get_ner()
    # nlp=Nlp()
    i=0
    # for item in DB_kg_scrapy.kg_content.find(q):
    tjson=tkitFile.Json(file_path=data_path)
    for item in tqdm(tjson.auto_load()):
        i=i+1
        if i%10000==0:
            print(i)
        # print(item)
        if len(item['content'])>500:
            SENTENCES_COUNT = 5
        else:
            SENTENCES_COUNT = 3
        parser = PlaintextParser.from_string(item['content'], Tokenizer(LANGUAGE))
        l=[]
        words_list=[]
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            l.append(str(sentence))
            # ner_list=Ner.pre(str(sentence))
            # for it in ner_list[0][1]:
            #     words_list.append(it.get("words"))
        # keywords = textrank(item['title']+'\n'+item['content'], topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) 
        keywords = textrank(item['title']+'\n'+item['content'], topK=10, withWeight=False,) 
        keyphrases =tt.get_keyphrases(item['title']+'\n'+item['content'])

        # print("==="*20)
        # print("",item['title'])
        # print(item['content'][:100])
        p=tclass.pre(item['content'])
        # print("预测结果",p)
        # softmax=tclass.softmax()
        # print(softmax)
        # sentences=tt.sentence_segmentation_v1( item['title']+'。'+item['content'])
        # words_list=[]
        # for sentence in sentences:
        #     ner_list=Ner.pre(sentence)
        #     for it in ner_list[0][1]:
        #         words_list.append(it.get("words"))
        # # print(words_list)
        # keywords=keywords+keyphrases+words_list
        keywords=keywords+keyphrases
        keywords=list(set(keywords))
        # print(ner_list)
        content=" [KW] "+",".join(keywords)+" [/KW]  [TT] "+ item['title']+" [/TT] [SM] "+"".join(l)+" [/SM] [CONTNET] "+item['content']+" [/CONTNET] [PT] "+ item['title']+" [/PT] [END]"
        content=content.replace("\n\n\n", "\n\n")
        content=content.replace("\n", " [SEP] ")
        # print(content[:100])
        # content_list=cut_text(content,480)
        # for it in content_list:
        #     print("++++"*20)
        #     print(it)
        # f1.write("\n".join(content_list)+"")
        if p==1:
            f1.write(content)
            f1.write("\n")
Example #26
def data_pre_train(data_path='data/data.json', train_path='data/train.txt'):
    """
    from=0   # starting article id
    limit=10 # number of articles to return
    >>> data_pre_train(from=0, limit=10)
    [unused5] marks keywords
    [unused6] marks the title
    [unused7] marks the preceding title
    [unused8] marks the body text
    """
    LANGUAGE = "chinese"
    SENTENCES_COUNT = 10
    article_max_len=500
    # tjson=tkit.Json(file_path=data_path)
    # data=tjson.auto_load()
    # print(len(data))
    ttext=tkitText.Text()
    # extractor = tkit.TripleExtractor()
    # if len(data)>tfrom+limit:
    #     data=data[tfrom:tfrom+limit]
    # elif len(data)<tfrom:
    #     print("数据过短了,存在问t")
    #     return []
    # else:
    #     data=data[tfrom:]
    # for item in tjson.auto_load():
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    ie=tkitNlp.TripleIE(model_path="/mnt/data/dev/model/ltp/ltp_data_v3.4.0")
    f1 = open(train_path,'w')
    articles=[]
    # TF-IDF keyword-extraction interface
    tfidf = analyse.extract_tags
    # TextRank keyword-extraction interface
    textrank = analyse.textrank
    with open(data_path, 'r', encoding = 'utf-8') as data:
        for art_i,it in tqdm(enumerate(data)):
            item=json.loads(it[:-1])
            # if art_i%10==0:
            #     print('arti', art_i)
            segs_pre=[]
            segs_end=[]
            # # segs_pre.append(' [KW] '+item['keywords']+' [SEP] ')
            # # l=ttext.summary( item['content'],num=10)
            # # extractor = tkit.TripleExtractor()
            # # svos = extractor.triples_main(item['content'])

            # # extractor.clear()
            # # print('svos', svos)
            # parser = PlaintextParser.from_string(item['content'], Tokenizer(LANGUAGE))
            # l=[]
            # for sentence in summarizer(parser.document, SENTENCES_COUNT):
            #     l.append(str(sentence))
            # # del sentence
            s = []

            # # processing of keywords, key sentences, and related info starts here
            # try:
            #     for it in ie.get(item['title']+'\n'+item['content']):
            #         # print(it)
            #         if it==None:
            #             pass
            #         else:
            #             s.append(''.join(list(it)))
            #     # print(s)
            # except:
            #     pass
            # # s=get_seq(item['title']+'\n'+item['content'])
            # # 基于TextRank算法进行关键词抽取
            keywords = textrank(item['title']+'\n'+item['content'], topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) 
            # # 输出抽取出的关键词
            # # print(keywords)
            # # for keyword in keywords:
            # #     print (keyword + "/",)
            # # 基于TF-IDF算法进行关键词抽取
            # # keywords = tfidf(item['title']+'\n'+item['content'], topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
            # # print(keywords)
            # # 输出抽取出的关键词
            # # for keyword in keywords:
            # #     print( keyword + "/",)
            # # keywords1 =ttext.get_keywords(item['title']+'\n'+item['content'])
            # # new_keywords=[]
            # # for keyword in keywords1:
            # #     new_keywords.append(keyword['word'])        
            # # keywords =ttext.get_keyphrases(item['title']+'\n'+item['content'])
            # # kws=keywords+new_keywords
            # # # s.append(','.join(kws))
            # s=[','.join(keywords)]+s
            segs_pre.append(' [KW] '+','.join(keywords)+' [/KW] ')
            # del s
            # # svos = extractor.triples_main('。'.join(l))
            # # masked-out content in the original

            try:
                segs_pre.append(' [TT] '+item['title']+" [/TT] ")
                segs_end.append(' [PT] '+item['title']+" [/PT] ")
            except:
                pass
            segs=sentence_seg(" [CLS] "+item['content']+" [END] ")
            article="".join(segs_pre+segs+segs_end)
            
            one=[]
            for i in range(len(article)//article_max_len+1):
                # slice the article into fixed-length chunks
                one.append(article[i*article_max_len:(i+1)*article_max_len]+"")
            articles.append("\n".join(one)+"")
            if art_i%100==0:
                print('arti', art_i)
                # f1.write("\n\n".join(articles)+"\n\n")
                f1.write("\n\n".join(articles)+"")
                articles=[]
            # del articles
            del segs
        f1.write("\n\n".join(articles)+"")
        f1.close()
        gc.collect()
        del stemmer
        del summarizer
        del ie


        gc.collect()
        return
Example #27
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


def ticker_from_name(name):
    df = pd.read_csv('static/companylist.csv').dropna()
    return (df.Symbol[process.extractOne(name, choices=df.Name)[2]])


links = pd.read_csv('URLsWithDomains.csv').URL
url = links[0]
r = requests.get(url, allow_redirects=True)  # to get content after redirection
pdf_url = r.url  # 'https://media.readthedocs.org/pdf/django/latest/django.pdf'
with open('file_name.pdf', 'wb') as f:
    f.write(r.content)
text = ''
# fileReader is never defined in the snippet; a PyPDF2 reader over the file just
# downloaded is assumed here (would also require `import PyPDF2`).
fileReader = PyPDF2.PdfFileReader('file_name.pdf')
for i in range(fileReader.numPages):
    try:
        text = text + fileReader.getPage(i).extractText()
        text = text + "\n"
    except Exception:
        print("no!")
LANGUAGE = 'english'
parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
summarizer = Summarizer()
summary = summarizer(parser.document, 50)
with open('output.txt', 'w') as file:
    for sentence in summary:
        file.writelines(str(sentence))
        file.writelines("\n")
Example #28
    def __init__(self):
        self.stemmer = Stemmer(LANGUAGE)
        self.summarizer = Summarizer(self.stemmer)
        self.summarizer.stop_words = get_stop_words(LANGUAGE)