Example #1
0
def extract_keywords(text, stopwords_pattern):
  """
  Reduce ``text`` to its highest-scoring RAKE keyword phrases.

  Calls the RAKE module (from github.com/aneesha/RAKE/rake.py
  with very minor modifications) on the full description to
  create less noisy text for vectorization.

  Args:
    text: The description to process; it is handed to
      ``rk.splitSentences`` unchanged.
    stopwords_pattern: Forwarded as-is to
      ``rk.generateCandidateKeywords`` (presumably a compiled stop-word
      regex -- confirm against the caller).

  Returns:
    A single string made of the top third (rounded down) of the
    candidate phrases, ordered by descending score and joined with
    single spaces. Fewer than three candidates yields ``""``.
  """
  sentences = rk.splitSentences(text)
  phrase_list = rk.generateCandidateKeywords(sentences, stopwords_pattern)
  word_scores = rk.calculateWordScores(phrase_list)
  keyword_candidates = rk.generateCandidateKeywordScores(
    phrase_list, word_scores)
  # items() rather than iteritems(): the latter only exists on Python 2,
  # while items() gives the same (phrase, score) pairs on Python 2 and 3.
  sorted_keywords = sorted(keyword_candidates.items(),
    key=operator.itemgetter(1), reverse=True)
  # Floor division keeps the cut-off an int on every Python version.
  n_top = len(sorted_keywords) // 3
  return " ".join(phrase for phrase, _score in sorted_keywords[:n_top])
Example #2
0
def extract_keywords(text, stopwords_pattern):
    """
    Condense ``text`` into its best-scoring RAKE keyword phrases.

    Calls the RAKE module (from github.com/aneesha/RAKE/rake.py
    with very minor modifications) on the full description to
    create less noisy text for vectorization.

    Args:
        text: The description to process; passed straight to
            ``rk.splitSentences``.
        stopwords_pattern: Passed straight to
            ``rk.generateCandidateKeywords`` (presumably a compiled
            stop-word regex -- confirm against the caller).

    Returns:
        One space-joined string holding the highest-scoring third
        (rounded down) of the candidate phrases, best first. When there
        are fewer than three candidates the result is ``""``.
    """
    sentences = rk.splitSentences(text)
    phrase_list = rk.generateCandidateKeywords(sentences, stopwords_pattern)
    word_scores = rk.calculateWordScores(phrase_list)
    keyword_candidates = rk.generateCandidateKeywordScores(
        phrase_list, word_scores)
    # dict.iteritems() was removed in Python 3; items() yields the same
    # (phrase, score) pairs and works on both major versions.
    sorted_keywords = sorted(keyword_candidates.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    # Integer floor division: keep only the top third of the ranking.
    keep = len(sorted_keywords) // 3
    top_phrases = [phrase for phrase, _score in sorted_keywords[:keep]]
    return " ".join(top_phrases)
Example #3
0
def extract(text):
    """Extract weighted keywords from ``text``.

    Pipeline: preprocess the text, split it into sentences and words,
    build 1- to 3-word n-grams, score them RAKE-style, divide each score
    by the phrase's IDF value, post-filter, and scale the weights by the
    largest surviving weight.

    Args:
        text: Raw input text; handed to ``util.textprocess.preprocess``.

    Returns:
        A list of ``{"keyword": ..., "weight": ...}`` dicts sorted by
        weight, highest first. ``keyword`` is the casing-normalized form
        of the phrase when one was recorded, otherwise the phrase key
        itself.
    """
    # preprocess, tokenize, group in n-grams (lengths 1, 2 and 3)
    text = util.textprocess.preprocess(text)
    sentences = [separatewords(sentence, 0) for sentence in splitSentences(text)]
    phrases = [
        ngram
        for sentence in sentences
        for n in range(1, 4)
        for ngram in generate_ngrams(sentence, n)
    ]
    phrases = prefilter(phrases)

    # casings: map the space-joined phrase to its normalized display form
    casings = {}
    for phrase in phrases:
        casings[' '.join(phrase)] = casing.normalize(phrase)

    # RAKE
    wordscores = calculateWordScores(phrases)
    keywords = generateCandidateKeywordScores(phrases, wordscores)

    # Factor in IDF.
    # Iterate over a snapshot of the keys: entries are deleted inside the
    # loop, and on Python 3 deleting while iterating the live key view
    # raises "dictionary changed size during iteration".
    for keyphrase in list(keywords):
        idf_score = idf.get(keyphrase)
        if idf_score == 0:
            # A zero IDF cannot be divided by; drop the phrase entirely.
            del keywords[keyphrase]
            continue
        keywords[keyphrase] /= idf_score

    # Post filter (again deleting, so again iterate over a snapshot)
    filtered = postfilter(list(keywords))
    for keyphrase in list(keywords):
        if keyphrase not in filtered:
            del keywords[keyphrase]

    # Normalize scores by the largest weight among the filtered phrases
    if len(filtered) > 0:
        max_weight = max(keywords[keyphrase] for keyphrase in filtered)
        if max_weight > 0:
            for keyphrase in list(keywords):
                keywords[keyphrase] /= max_weight

    # format: items() works on Python 2 and 3, unlike iteritems()
    ranked = sorted(keywords.items(), key=operator.itemgetter(1), reverse=True)
    return [
        {"keyword": casings.get(phrase, phrase), "weight": weight}
        for phrase, weight in ranked
    ]