Example #1
def rake():
    from rake_nltk import Rake, Metric
    #r = Rake()
    r = Rake(ranking_metric=Metric.WORD_FREQUENCY)
    words = ""
    for disc in discs:
        words = words + ". " + disc
    r.extract_keywords_from_text(words)
    print(r.get_ranked_phrases_with_scores())
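Note: rake() reads a module-level discs list that the snippet does not define. A minimal, hypothetical driver (the discs values below are invented):

# Hypothetical usage; `discs` stands in for whatever list of strings the original module defines.
discs = [
    "A concept album about artificial intelligence.",
    "Live recordings from a world tour.",
]
rake()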
Example #2
def sheet(text):
    # the site's address; we append the page path (results go to an xls later)
    doclink = "http://www.scholarpedia.org" + text
    print(doclink)
    wb = Workbook()
    sheet1 = wb.add_sheet('Sheet 1')
    headers1 = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    document = Document()
    link1 = requests.get(doclink, headers=headers1).text
    soup1 = BeautifulSoup(link1, 'lxml')
    # extract info from the p and h1 tags
    selectall2 = soup1.find_all("p")
    selecttitle = str(*soup1.find_all("h1"))  # assumes the page has exactly one h1
    print(selecttitle)
    #print(*selectall,sep='\n')
    stri2 = ""
    i = 0
    for lin in selectall2:
        stri2 += str(lin)
        i = i + 1
    # strip html tags from the text
    clean = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    stri2 = re.sub(clean, '', stri2)
    sheet1.write(0, 0, 'Cuvinte Cheie')
    sheet1.write(0, 1, 'Rake Rank')
    # run the RAKE algorithm
    r = Rake()
    r.extract_keywords_from_text(stri2)
    print("\n".join(r.get_ranked_phrases()))
    print(*r.get_ranked_phrases_with_scores(), sep='\n')
    #rank,key=r.get_ranked_phrases_with_scores()
    rank = r.get_ranked_phrases_with_scores()
    print(rank)
    word = [0 for x in range(len(rank))]
    ranked = [0 for x in range(len(rank))]
    j = 0
    for khh in rank:
        ranked[j], word[j] = khh
        j = j + 1
    # write the results to the xls sheet
    for g in range(len(ranked)):
        sheet1.write(g + 1, 0, word[g])
        sheet1.write(g + 1, 1, ranked[g])
    if path.exists("key.xls"):
        os.remove("key.xls")
    wb.save('key.xls')
Example #3
def summarize(text):
    # sent_tokenize breaks target text into sentences.
    blocks = [nltk.sent_tokenize(paragraph) for paragraph in text.split("\n")]
    sentence_list = []
    for block in blocks:
        sentence_list.extend(block)

    r = Rake(min_length=2)
    r.extract_keywords_from_text(text)
    #temp_keywords = r.get_ranked_phrases_with_scores()[:10]
    #keywords = [x[1] for x in temp_keywords]
    keyphrases = [list(x) for x in r.get_ranked_phrases_with_scores()[:20]]

    for phrase in keyphrases:
        print(phrase)

    print()

    important_sentences = [sentence_list[0]]

    #prev_sentence = ""
    for i in sentence_list:
        is_important = False
        for j in keyphrases:
            for word in j[1:]:
                if (word in i.lower()):
                    is_important = True
                    break
        #if is_important and prev!= None and i not in important_sentences:
        #    important_sentences.append(prev + "\n" + i)
        #prev = i
        if is_important and not (i in important_sentences):
            important_sentences.append(i)

    return "\n".join(important_sentences)
Example #4
class TextAnalyser(object):
    def __init__(self):
        self.threshold = 0.99
        self.__rake = Rake()
        self.__stemmer = LancasterStemmer()
        self.__lemma = WordNetLemmatizer()
        self.__stopwords = ['alt']

    def extract(self, text):
        self.__rake.extract_keywords_from_text(text.strip())
        scores = self.__rake.get_ranked_phrases_with_scores()
        keywords = self.unpack_keywords(scores)
        words = filter(lambda x: x[1] not in self.__stopwords and x[1].isalnum(), keywords)

        filtered_words = map(lambda x: x[1], filter(lambda x: x[0] > self.threshold, words))

        lemms = map(lambda x: self.__lemma.lemmatize(x), filtered_words)
        stems = map(lambda x: self.__stemmer.stem(x), lemms)

        return stems

    @staticmethod
    def unpack_keywords(keywords):
        words = []

        for k in keywords:
            for p in k[1].split(' '):
                words.append((k[0], p))

        return words
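One Python 3 caveat: filter and map are lazy, so extract() hands back a map iterator rather than a list. A small sketch of consuming it:

analyser = TextAnalyser()
# materialize the lazy map/filter chain before printing
stems = list(analyser.extract("Compatibility of systems of linear constraints"))
print(stems)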
Example #5
class Extractor:
    def __init__(self, topic, MIN_LENGTH=1, MAX_LENGTH=4):
        self.proc = Processor(topic)
        self.raker = Rake(min_length=MIN_LENGTH, max_length=MAX_LENGTH)
        self.topic = topic

    def extract_keywords(self):
        self.proc.processor()
        with open(self.proc.btoken_file_path, 'r') as file:
            text = file.read()
        self.raker.extract_keywords_from_text(text)

    def extract_keywords_with_scores(self):
        text_with_scores = self.raker.get_ranked_phrases_with_scores()

        return text_with_scores

    def load_to_data(self):
        pair = list()
        text_score = self.extract_keywords_with_scores()
        for c in text_score:
            temp = [("score", c[0]), ("text", c[1])]
            pair.append(dict(temp))
        # use a context manager so the JSON file is actually flushed and closed
        with open("../DataFile/RAKE/" + self.topic + ".json", 'w+') as RAKE_file:
            json.dump(pair, RAKE_file, indent=4)
Example #6
def GetSignificant(mytext):
    r = Rake()
    #Internal Testing
    r.extract_keywords_from_text(mytext)

    listSig = {}  # dict mapping significant phrases to their scores
    #Get extracted scores and phrases
    extractedScores = r.get_ranked_phrases_with_scores()

    #calculate Average
    scoreAvg = 0
    for score, phrase in extractedScores:
        scoreAvg += score
    scoreAvg = scoreAvg / len(extractedScores)
    #adding this causes problems with all equal scores
    #scoreAvg += scoreAvg/2
    for score, phrase in extractedScores:
        if phrase.count(' ') > 0 and score >= scoreAvg:
            for word in phrase.split():
                syns = wn.synsets(word)
                tmp = syns[0].pos() if syns else None  # guard: some words have no synsets
                #print (word, ":", tmp)
            listSig.update({phrase: score})

        elif score >= scoreAvg:
            syns = wn.synsets(phrase)
            tmp = syns[0].pos() if syns else None  # guard: the phrase may have no synsets
            #print (phrase, ":", tmp)
            listSig.update({phrase: score})
            #swntmp = phrase + "." + tmp + ".01"
            #print (swn.senti_synset(swntmp))
    s = " "
    return s.join(list(listSig))
Example #7
    def test_extract_keywords_from_text(self):
        r = Rake()

        text = '''Compatibility of systems of linear constraints over the set of
        natural numbers. Criteria of compatibility of a system of linear
        Diophantine equations, strict inequations, and nonstrict inequations are
        considered. Upper bounds for components of a minimal set of solutions
        and algorithms of construction of minimal generating sets of solutions
        for all types of systems are given. These criteria and the corresponding
        algorithms for constructing a minimal supporting set of solutions can be
        used in solving all the considered types of systems and systems of mixed
        types.'''

        r.extract_keywords_from_text(text)

        ranked_phrases = [
            'minimal generating sets', 'linear diophantine equations',
            'minimal supporting set', 'minimal set', 'linear constraints',
            'upper bounds', 'strict inequations', 'nonstrict inequations',
            'natural numbers', 'mixed types', 'corresponding algorithms',
            'considered types', 'set', 'types', 'considered', 'algorithms',
            'used', 'systems', 'system', 'solving', 'solutions', 'given',
            'criteria', 'construction', 'constructing', 'components',
            'compatibility'
        ]
        self.assertEqual(r.get_ranked_phrases(), ranked_phrases)
        self.assertEqual(
            [phrase for _, phrase in r.get_ranked_phrases_with_scores()],
            ranked_phrases)
Example #8
def keyphrases(text, mu=2, sig=1.5):
    """
    determines and ranks keyphrases from `text`. the keyphrases are
    weighted such that short keyphrases (2-3 words) are preferred. 
    moveover, keyphrases not adhering the rules defined in
    `filter_by_pos` are abandoned. 
    """
    assert type(text) is list, "the text is not a list"
    r = Rake(punctuations=". , ? ! - : ; \" \' ( )".split(),
             language='english')
    try:
        text[0].index(" ")
    except ValueError:
        pass
    else:
        raise ValueError("expecting a list of strings, not a single string")

    text = " ".join(text)
    r.extract_keywords_from_text(text)

    # the scores are weighted by their length (# tokens)
    # using a normal distribution
    n = scipy.stats.norm(mu, sig)
    scores = r.get_ranked_phrases_with_scores()
    scores = [(s * n.pdf(len(f.split())), f) for s, f in scores]

    scores = sorted(scores, key=lambda x: -x[0])
    return filter_by_pos(scores)
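As a sanity check of the length weighting, the N(mu=2, sig=1.5) density used above peaks at two-word phrases and falls off quickly for longer ones:

import scipy.stats

n = scipy.stats.norm(2, 1.5)
for length in range(1, 6):
    print(length, round(n.pdf(length), 3))
# 1 0.213, 2 0.266, 3 0.213, 4 0.109, 5 0.036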
Example #9
def get_key_ngrams(document,
                   max_grams=3,
                   include_numbers=False,
                   single_letters=False):
    # Uses stopwords for english from NLTK, and all punctuation characters.
    r = Rake()
    # r = Rake(<language>)  # to use a specific language supported by nltk
    # r = Rake(<list of stopwords>, <string of punctuations to ignore>)  # to
    # provide your own stop words and punctuation
    if not include_numbers:
        document = remove_punc_and_nums(document)  # numbers are replaced by NN

    document = document.lower()

    # lemmatize
    document = lemmatize(document)
    r.extract_keywords_from_text(document)

    # To get keyword phrases ranked highest to lowest.
    phrases = r.get_ranked_phrases_with_scores()

    data = {'{}grams'.format(x + 1): [] for x in range(max_grams)}

    for score, phrase in phrases:
        if 'NN' in phrase.upper():
            continue
        splitted = phrase.split()
        if any(map(lambda x: len(x) < 3, splitted)):
            continue
        score = round(score, 2)
        length = len(splitted)
        if length <= max_grams:
            data['{}grams'.format(length)].append((phrase, score))
    return data
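The return value groups scored phrases by token count. A hedged sketch of the shape (the input is invented; remove_punc_and_nums and lemmatize must come from the original module):

result = get_key_ngrams("Deep learning models need large labeled datasets.")
# roughly: {'1grams': [('datasets', 1.0), ...],
#           '2grams': [('deep learning', 4.0), ...],
#           '3grams': []}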
Example #10
    def update_facet_dict(self, sentence):
        """
        Args:
        sentence

        Returns:
        Nothing
        """
        non_facet_noun = self._non_facet_noun
        r = Rake()
        r.extract_keywords_from_text(sentence)
        dat = r.get_ranked_phrases_with_scores()  # extract the (score, key phrase) pairs
        idx = self.calc_num_notes() - 1  # index of the added sentence, aka current sentence
        tmp = []
        for pair in dat:
            score = pair[0]
            phrase = nlp(pair[1])
            tmp.extend([
                (token.lemma_, (idx, score)) for token in phrase
                if token.pos_ == "NOUN" and token.lemma_ not in non_facet_noun
            ])

        # update the dict with {lemma_word: (idx, score)}; score could be used in an advanced version
        self._facet_dict.update(dict(tmp))
Example #11
def get_topics(meeting_id):
    """ generates topics for the meeting with the given id """
    meeting = Meeting.objects.with_id(meeting_id)
    string = ""
    for transcript in meeting.transcript:
        string += transcript.transcription + " "
    print(string)
    r = Rake()  # initializes Rake with English (all punc) as default lang
    r.extract_keywords_from_text(string)

    topic_data = r.get_ranked_phrases_with_scores()
    count = 0
    data = []
    for topic in topic_data:
        if topic[0] < 5 or count == 10:
            break
        else:
            data.append(str(topic[1]))
            count = count + 1

    return_data = " ".join(data).split(" ")
    no_reps = []
    for d in return_data:
        if d not in no_reps:
            no_reps.append(d)
    meeting.topics = no_reps
    meeting.save()
    return redirect(url_for('meetings.edit_meeting', id=meeting_id))
Example #12
def keywordRake(fullText):
    # Rake expects a collection of stop words, not a file name
    with open("stopList.txt") as f:
        stop_words = f.read().split()
    r = Rake(stopwords=stop_words)
    r.extract_keywords_from_text(fullText)
    b = r.get_ranked_phrases()
    c = r.get_ranked_phrases_with_scores()
    print(b)
    print(c)
Example #13
def get_rake_keyphrases_from_text(text, stopwords=None, printset = string.printable):
    if stopwords is None: 
        stopwords = get_word_list_from_file("Stopwords.txt")
    rake_object = Rake(stopwords = stopwords)
    rake_object.extract_keywords_from_text(text)
    rake_keywords = rake_object.get_ranked_phrases_with_scores()
    return rake_keywords
Example #14
def gen_keywords(body, max_keywords=-1):
    r = Rake(min_length=1, max_length=1)
    r.extract_keywords_from_text(body)
    # Returns a rank and a phrase as a list of tuples
    # (rank <float>, phrase <string>)
    phrases = r.get_ranked_phrases_with_scores()
    return [phrase for (rank, phrase) in phrases if rank > 0.8]
Example #15
    def parsed_observation(input_text):
        '''Method to parse Observation from Observation sheet or Nature of check from reference checklist sheet'''
        from rake_nltk import Rake
        import re
        import string
        # Uses stopwords for english from NLTK, and all punctuation characters by
        # default
        r = Rake()
        text = str(input_text)
        #pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
        #text = pattern.sub('', text)
        #input_str = text.lower()
        result1 = re.sub(r'\d+', '', text)
        result2 = result1.translate(str.maketrans("", "", string.punctuation))
        r.extract_keywords_from_text(result2)
        # Extraction given the list of strings where each string is a sentence.
        #r.extract_keywords_from_sentences(<list of sentences>)
        # To get keyword phrases ranked highest to lowest.
        r.get_ranked_phrases()
        # To get keyword phrases ranked highest to lowest with scores.
        keywords_ranked_phrases = r.get_ranked_phrases_with_scores()
        my_list = []
        my_list_final = []
        for (i, j) in keywords_ranked_phrases:
            my_list.append(j)
        pattern = "[0-9,).(!?]*"
        my_list_new = [re.sub(pattern, '', i) for i in my_list]
        while "" in my_list_new:
            my_list_new.remove("")
        for word in my_list_new:
            if len(word) > 1:
                my_list_final.append(word)
            else:
                continue
        return sorted(list(set(my_list_final)), key=len, reverse=True)
Example #16
    def process_text(self):
        # Remove new lines and turn to lower case
        text = re.sub('\n', ' ', self.text).lower()

        # Extract keyphrases using Rake
        # TODO also possible to extract keywords from sentence
        rake = Rake()
        if self.text_type == 'article':
            rake.extract_keywords_from_text(text)
        elif self.text_type == 'social':
            # NOTE: extract_keywords_from_sentences expects a list of sentences,
            # so `text` should be sentence-tokenized before this call
            rake.extract_keywords_from_sentences(text)
        self.all_phrases = rake.get_ranked_phrases_with_scores()
        # word_freq_dist = rake.get_word_frequency_distribution()

        # Tokenize text
        self.article_text_tokenized = word_tokenize(text)

        # Tokenize phrases
        self.all_phrases_tokenized = self.tokenize_phrases()

        # Tag all phrases and remove all but noun words
        self.all_phrases_tagged = pos_tag_phrase_pairs(self.all_phrases)
        self.all_phrases_tagged_nouns = filter_pos(self.all_phrases_tagged,
                                                   "nouns")

        # Convert list of tagged nouns back to a string phrase
        self.string_phrases_nouns = self.tuple_list_to_string_list()
Example #17
def conext(request):
    #import pdb;pdb.set_trace()
    extracted_list = list()
    string_json = {}  # initialized up front so the final JsonResponse never hits a NameError
    csrfContext = RequestContext(request)
    if request.method == 'POST':

        try:
            json_data = json.loads(request.body)
            text = json_data.get('sentence')

        except:
            text = []
        if text:
            r = Rake()
            r.extract_keywords_from_text(text)
            extracted_list = r.get_ranked_phrases_with_scores()
            dataBounding = DataBounding(text, extracted_list)
            response_string = ''
            for item in extracted_list:
                # str(...) is enough in Python 3; encode('utf8') would print a bytes literal
                value = ('<i class="fa fa-dot-circle-o" aria-hidden="true"></i> '
                         + 'Keyword: ' + str(item[1]) + ' | Score: ' + str(item[0]) + '<br>')
                response_string += value
            string_json = {}
            string_json['keywordswithscores'] = response_string

        else:
            pass
            #extracted_list.update({'Error':'Error occured at service response.'})
    return JsonResponse(string_json, safe=False)
Example #18
def get_all_keywords(text, word_boolean):

    if word_boolean:  # word - i.e. length should be exactly 1
        r = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO, max_length=1)
    else:  # phrase, i.e. made up of multiple words
        r = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO, min_length=2)

    total_phrases = []

    for local_text in text:
        r.extract_keywords_from_text(
            local_text)  # use the Rake instance to find the keywords
        local_phrases = r.get_ranked_phrases_with_scores(
        )  # apply these keywords to a local variable
        total_phrases.append(local_phrases)

    all_keywords = []

    # get_ranked_phrases_with_scores gives a list of tuples - tuple[0] is score, 1 is word
    for phrase_list in total_phrases:
        for pair in phrase_list:
            all_keywords.append(pair[1])

    # clean some of the keywords - don't want a single number or small/uninformative words
    # amount goes from 200 keywords to 186 - len(df.columns) = 186, 1 column per keyword
    # (build a new list: calling remove() while iterating a list skips elements)
    all_keywords = [
        word for word in all_keywords
        if not word.isdigit() and len(word) >= 5
    ]

    print(all_keywords)
    return all_keywords
Example #19
def extract_keywords(dump_path, packages):
    print("{0} Gathering descriptions and computing keywords...".format(
        datetime.datetime.now()))
    rake = Rake()
    keywords = {}
    for doc in get_descriptions(packages):
        try:
            if "translatedDescriptionHtml" in doc:
                html_description = unicode(
                    doc.get("translatedDescriptionHtml"))
            else:
                html_description = unicode(doc.get("descriptionHtml"))
            # remove html elements
            description = re.sub(r'<.*?>', '', html_description)
            # substitute non-ascii chars with stop words (e.g. dot)
            description = re.sub(r'[^\x00-\x7F]+', ' . ', description)
            rake.extract_keywords_from_text(description)
            ranking = rake.get_ranked_phrases_with_scores()
            for pair in ranking:
                keywords[pair[1]] = keywords.get(pair[1], 0) + pair[0]
        except AttributeError:
            continue
    sorted_keywords = sorted(keywords.items(), key=itemgetter(1))
    with open(dump_path, 'w') as outfile:
        json.dump(sorted_keywords, outfile, indent=2)
Example #20
    def _get_keyphrases(self):
        # Extract keywords and phrases from the current document so we know
        # what to search for in ES.
        r = Rake()
        r.extract_keywords_from_text(self.instance.content)
        key_phrases = [
            keyphrase[1] for keyphrase in r.get_ranked_phrases_with_scores()
            if keyphrase[0] >= self.min_rank_score
        ]
        if not key_phrases:
            key_phrases = [
                keyphrase[1] for keyphrase in
                r.get_ranked_phrases_with_scores()[:self.default_list_length]
            ]

        return key_phrases
Example #21
def extractKeywords(description, query):
    try:
        r = Rake()

        print("Extracting keywords from joined sequence with query: " + query)

        r.extract_keywords_from_text(description)

        print("Keywords extracted from text with query: " + query)

        keywords = r.get_ranked_phrases_with_scores()

        print("Extraction finished for query: " + query + " writing to file")

        df = pd.DataFrame(columns = ['rank', 'keyword_set'])

        for pair in keywords:
            num = (len(df) + 1)
            df.loc[num] = pair

        dirtitle = query + '.csv'
        # totalKeywordDirectory is assumed to be a module-level constant
        if not os.path.exists(totalKeywordDirectory):
            os.mkdir(totalKeywordDirectory)

        filenamelocation = os.path.join(totalKeywordDirectory, dirtitle)

        df.to_csv(filenamelocation, encoding='utf-8')

        print("File created for query: " + query)

    except Exception as e:
        print(e)
Example #22
def Method4(n=5):
    # input_text and removePunctuation are assumed to be defined elsewhere in the module
    M4_input = input_text
    r = Rake()
    r.extract_keywords_from_text(M4_input)
    for key in r.get_ranked_phrases_with_scores()[:n]:
        print(removePunctuation(key[1].strip()))
Example #23
def simple_rake(text):
    rakeExtract = Rake(min_length=1, max_length=4)
    rakeExtract.extract_keywords_from_text(text)
    result = rakeExtract.get_ranked_phrases_with_scores()
    top_words = []
    for (score, phrase) in result:
        top_words.append(phrase)
    return top_words  # return the collected phrases; `result` still holds the scores
Example #24
def keywords_rake(text):
    r = Rake(stopwords=stopwords.words('english'),
             ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
             max_length=1)

    r.extract_keywords_from_text(text)
    keywords = r.get_ranked_phrases_with_scores()
    print(keywords)
Example #25
def memorize():
    r = Rake()
    filename = 'testocr.png'
    img = np.array(Image.open(filename))
    text = pytesseract.image_to_string(img)
    print(text)
    r.extract_keywords_from_text(text)  # returns None, so call it before printing results
    print(r.get_ranked_phrases(), r.get_ranked_phrases_with_scores())
Example #26
def return_best_sentences_rake_nltk(text, threshold):
    r = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO)
    r.extract_keywords_from_text(text)
    ranked_words = r.get_ranked_phrases_with_scores()
    for i in range(len(ranked_words)):
        if ranked_words[i][0] < threshold:
            return ranked_words[:i]
    return ranked_words
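Despite its name, the function returns scored phrases rather than sentences, cut off at the first score below threshold. A short sketch:

text = "Compatibility of systems of linear constraints over the set of natural numbers."
print(return_best_sentences_rake_nltk(text, threshold=4.0))
# keeps only the leading (score, phrase) pairs whose score is >= 4.0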
Example #27
def extract_phrases(text):
    """
    Calls the RAKE API to extract the relevant phrases of the given text
    :param text: String, the text to be analyzed
    :return: Array containing the phrases and their scores
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    return extractor.get_ranked_phrases_with_scores()
Example #28
def keyword(strg):
    '''Keyword and Concept extraction'''
    if strg:
        strg = linkrem(strg)
        r = Rake()
        r.extract_keywords_from_text(strg)

        return r.get_ranked_phrases()[:10], r.get_ranked_phrases_with_scores()[:10]
Example #29
    def extractKeywords(self, textDescription):
        """Get the keyword phrases from the descriptions using NLP"""
        r = Rake()
        r.extract_keywords_from_text(textDescription)
        results = r.get_ranked_phrases()  # all keyword phrases ranked highest to lowest
        result_scores = r.get_ranked_phrases_with_scores()
        print(results)
        return results
Example #30
class ArticleScarping(Scraping):
    def __init__(self, url: str, authorHtmlTag: dict, dateHtmlTag: dict,
                 articleHtmlTag: dict):
        Scraping.__init__(self, url)
        # find html sections with the given tags
        self.authorHtmlTag = authorHtmlTag
        self.dateHtmlTag = dateHtmlTag
        self.articleHtmlTag = articleHtmlTag
        # init keywords extractor

    def parseArticle(self):

        return self.parseHtmlTag(self.parsedHtml,
                                 self.articleHtmlTag).get_text()

    def parseAuthor(self):
        return self.parseHtmlTag(self.parsedHtml,
                                 self.authorHtmlTag).get_text()

    def parseDate(self):
        try:
            return str(self.parsedHtml.time['datetime'])
        except:
            return self.parseHtmlTag(self.parsedHtml,
                                     self.dateHtmlTag).get_text()

    def parseText(self):
        return self.parseHtmlTag(self.parsedHtml,
                                 self.articleHtmlTag).get_text()

    def getKeyWords(self):
        '''
        Extract key phrases of up to 30 words each.
        :return: the 10 highest-ranked phrases
        '''
        self.rake = Rake(max_length=30)
        self.rake.extract_keywords_from_text(self.parseText())
        result = self.rake.get_ranked_phrases_with_scores()
        resultlist = [elem[1] for elem in result[:10]]
        return resultlist

    def parseTitle(self):
        title = "nan"
        if (self.parsedHtml.title is not None):
            title = self.parsedHtml.title.string
        return title

    def getArticleJson(self):
        json = {
            "title": self.parseTitle(),
            "date": self.parseDate(),
            "author": self.parseAuthor(),
            "article": self.parseText(),
            "keywords": self.getKeyWords(),
            "url": self.url
        }
        return json
Example #31
#!/usr/bin/python3
# coding: utf-8
# pip install rake-nltk
from rake_nltk import Rake
from nltk import tokenize
r = Rake()  # Uses stopwords for english from NLTK, and all punctuation characters by default
##################################################################
## Extraction given the text.
mytext = '''Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered.
            Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given.
            These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.'''
r.extract_keywords_from_text(mytext)
print(r.get_ranked_phrases())  # To get keyword phrases ranked highest to lowest.
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases_with_scores())  # To get keyword phrases ranked highest to lowest with scores.
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]
##################################################################
## Extraction given the list of strings where each string is a sentence.
r.extract_keywords_from_sentences(tokenize.sent_tokenize(mytext))
print(r.get_ranked_phrases())
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases_with_scores())
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]