Example #1
0
def extract_keywords(text, stopwords_pattern):
  """
  Extract the top third of RAKE-scored keywords from *text*.

  Calls the RAKE module (from github.com/aneesha/RAKE/rake.py
  with very minor modifications) on the full description to
  create less noisy text for vectorization.

  Parameters:
    text: raw description string to mine for keywords.
    stopwords_pattern: compiled stopword regex consumed by
      rk.generateCandidateKeywords.

  Returns:
    A single space-joined string of the highest-scoring third of
    candidate keyword phrases, best-scoring first.
  """
  sentences = rk.splitSentences(text)
  phrase_list = rk.generateCandidateKeywords(sentences, stopwords_pattern)
  word_scores = rk.calculateWordScores(phrase_list)
  keyword_candidates = rk.generateCandidateKeywordScores(
    phrase_list, word_scores)
  # .items() (not Py2-only .iteritems()) works on both Python 2 and 3.
  sorted_keywords = sorted(keyword_candidates.items(),
    key=operator.itemgetter(1), reverse=True)
  n_keywords = len(sorted_keywords)
  # Keep only the top third; // is explicit floor division on Py2 and Py3.
  return " ".join(kw for kw, _ in sorted_keywords[:n_keywords // 3])
Example #2
0
def extract_keywords(text, stopwords_pattern):
    """
    Reduce *text* to its strongest RAKE keywords.

    Calls the RAKE module (from github.com/aneesha/RAKE/rake.py
    with very minor modifications) on the full description to
    create less noisy text for vectorization.

    Parameters:
        text: raw description string.
        stopwords_pattern: compiled stopword regex for
            rk.generateCandidateKeywords.

    Returns:
        Space-joined string of the top-scoring third of candidate
        keywords, in descending score order.
    """
    sentences = rk.splitSentences(text)
    phrase_list = rk.generateCandidateKeywords(sentences, stopwords_pattern)
    word_scores = rk.calculateWordScores(phrase_list)
    keyword_candidates = rk.generateCandidateKeywordScores(
        phrase_list, word_scores)
    # .items() instead of Py2-only .iteritems(): portable across Py2/Py3.
    ranked = sorted(keyword_candidates.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
    n_keywords = len(ranked)
    # Retain the top third of phrases; // is floor division on Py2 and Py3.
    return " ".join(phrase for phrase, _ in ranked[:n_keywords // 3])
Example #3
0
def rake(text, skillfilter=None):
    """
    Run the RAKE pipeline on *text* and return scored keyword phrases.

    Parameters:
        text: raw input text; preprocessed before tokenization.
        skillfilter: when not None, keep only n-grams present in the
            module-level ``skilldict`` (applied after scoring).

    Returns:
        list of {'keyword': phrase, 'weight': score} dicts, sorted by
        descending RAKE score.
    """
    # preprocess text
    text = textprocess.preprocess(text)

    # tokenize into candidate phrases using the module-level stopword pattern
    sentenceList = splitSentences(text)
    phraseList = generateCandidateKeywords(sentenceList, stopwordpattern)

    # generate candidates and calculate scores
    wordscores = calculateWordScores(phraseList)
    keywordcandidates = generateCandidateKeywordScores(phraseList, wordscores)
    # .items() instead of Py2-only .iteritems(): portable across Py2/Py3
    scored_ngrams = sorted(keywordcandidates.items(),
                           key=operator.itemgetter(1), reverse=True)

    # post-filter (PEP 8: compare against None with `is not`, never `!=`)
    if skillfilter is not None:
        scored_ngrams = [(ngram, score) for (ngram, score) in scored_ngrams
                         if ngram in skilldict]

    # format
    return [{'keyword': ngram, 'weight': score}
            for (ngram, score) in scored_ngrams]
Example #4
0
def extract(text):
    """
    Extract IDF-weighted RAKE keywords from *text*.

    Pipeline: preprocess -> 1..3-word n-gram candidates -> prefilter ->
    RAKE scoring -> IDF re-weighting -> postfilter -> normalize scores.

    Returns:
        list of {'keyword': cased phrase, 'weight': normalized score}
        dicts, sorted by descending weight.
    """
    # preprocess, tokenize, group in n-grams
    text = util.textprocess.preprocess(text)
    sentences = [separatewords(sentence, 0) for sentence in splitSentences(text)]
    phrases = [ngram for sentence in sentences
               for n in range(3)
               for ngram in generate_ngrams(sentence, n + 1)]
    phrases = prefilter(phrases)

    # remember the preferred casing for each space-joined phrase
    casings = {}
    for phrase in phrases:
        casings[' '.join(phrase)] = casing.normalize(phrase)

    # RAKE
    wordscores = calculateWordScores(phrases)
    keywords = generateCandidateKeywordScores(phrases, wordscores)

    # Factor in IDF. Iterate over a snapshot of the keys so entries can be
    # deleted safely during the loop on both Py2 and Py3 (a Py3 keys() view
    # raises RuntimeError if the dict changes size mid-iteration).
    for keyphrase in list(keywords):
        idfScore = idf.get(keyphrase)
        if idfScore == 0:
            # phrase unseen in the IDF corpus -> drop rather than divide by 0
            del keywords[keyphrase]
            continue
        keywords[keyphrase] /= idfScore

    # Post filter
    filtered = postfilter(keywords.keys())
    for keyphrase in list(keywords):
        if keyphrase not in filtered:
            del keywords[keyphrase]

    # Normalize scores so the best surviving keyword has weight 1.0
    if len(filtered) > 0:
        maxWeight = max([keywords[keyphrase] for keyphrase in filtered])
        if maxWeight > 0:
            for keyphrase in keywords:
                keywords[keyphrase] /= maxWeight

    # format: sort by weight; use .get() (not setdefault, which would insert
    # fallback entries into `casings` as a side effect) and .items() (not
    # Py2-only .iteritems())
    keywords = sorted(keywords.items(), key=operator.itemgetter(1), reverse=True)
    return [{"keyword": casings.get(pair[0], pair[0]), "weight": pair[1]}
            for pair in keywords]
Example #5
0
def naive(text, skillfilter=None, jointfilter=True):
    """
    Naive n-gram keyword extractor (no sentence/stopword phrase splitting).

    Parameters:
        text: raw input text.
        skillfilter: 'pre' filters tokens against the module-level
            ``skilldict`` before n-gram generation; 'post' filters the
            scored n-grams afterwards; None disables filtering.
        jointfilter: accepted for interface compatibility; not used here.

    Returns:
        list of {'keyword': ngram, 'weight': score} dicts, sorted by
        descending score.
    """
    # preprocess
    text = textprocess.preprocess(text)

    # generate word scores
    # NOTE(review): calculateWordScores receives the raw text here, while
    # the sibling extractors pass a phrase list -- confirm this is intended.
    wordscores = calculateWordScores(text)

    # tokenize
    tokens = text.split()

    # pre-filter
    if skillfilter == 'pre':
        tokens = [token for token in tokens if token in skilldict]
    phraseList = [ngram for n in range(3)
                  for ngram in generate_ngrams(tokens, n + 1)]
    keywordcandidates = generateCandidateKeywordScores(phraseList, wordscores)
    # .items() instead of Py2-only .iteritems(): portable across Py2/Py3
    scored_ngrams = sorted(keywordcandidates.items(),
                           key=operator.itemgetter(1), reverse=True)

    # post-filter
    if skillfilter == 'post':
        scored_ngrams = [(ngram, score) for (ngram, score) in scored_ngrams
                         if ngram in skilldict]

    # format
    return [{'keyword': pair[0], 'weight': pair[1]} for pair in scored_ngrams]
Example #6
0
def crackr(text, skillfilter=None):
    """
    Experimental extractor combining RAKE phrase scoring with POS tagging
    and word-cluster filtering.

    Returns a list of {'keyword': ..., 'weight': ...} dicts sorted by
    descending RAKE score.
    """
    # skillfilter can be one of
    # - None: don't filter
    # - "pre": filter a priori
    # - "post": filter a posteriori

    # preprocess text
    text = textprocess.preprocess(text).lower()
    
    # tokenize
    sentenceList = splitSentences(text)
    phraseList = generateCandidateKeywords(sentenceList, stopwordpattern)
    wordscores = calculateWordScores(phraseList)
    
    # generate ngrams    
    tokens = text.split()
    
    # pre-filter
    if skillfilter == 'pre':
        tokens = [token for token in tokens if token in skilldict]        
    ngrams = [ngram for n in range(3) for ngram in generate_ngrams(tokens, n + 1)]    
    
    # filter clusters
    # NOTE(review): `words` is not defined in this function -- presumably a
    # module-level mapping whose value's [1] element is a cluster id; the
    # bare except silently skips every n-gram missing from it (and would
    # also hide a NameError if `words` does not exist). Consider narrowing
    # to `except KeyError`.
    viableclusters = []
    for ngram in ngrams:
        try:
            viableclusters += [words[ngram] [1]]
        except:
            pass
            
    # filter words by clusters: collect every word that belongs to any
    # cluster observed above (clusters: cluster-id -> [(word, _), ...])
    viablewords = []
    for cluster in set(viableclusters):
        for (word, _) in clusters[cluster]:
            viablewords += [word]
    viablewords = set(viablewords)
    
    # pos tag
    pos_tagged = pt.tag(text)
    
    # filter single words: keep nouns (N*), adjectives (J*), and
    # conjunctions that directly follow a noun, provided the word survived
    # the cluster filter above
    index = 0
    finallst = []        
    indices = []
    thirdlst = []
    for tup in pos_tagged:
        word, tag = tup
        if tag == 'CC':
            # NOTE(review): leftover Python 2 debug prints; the second line
            # prints the same expression twice -- possibly index+1 was meant.
            print word
            print pos_tagged[index - 1] [1] [0], pos_tagged[index - 1] [1] [0] 
        if tag[0] == 'N' or tag[0] == 'J' or (tag == 'CC' and pos_tagged[index - 1] [1] [0] == 'N'):
            if word.lower() in viablewords:
                finallst += [(word)]
                indices += [index]
                # NOTE(review): thirdlst is built but never read afterwards.
                thirdlst += [(word, index)]
        index += 1

    # generate keyword phrases
    # NOTE(review): `ngrams` is rebuilt here but never used below -- the
    # scoring runs on phraseList, so the POS/cluster filtering above does
    # not affect the returned keywords. Verify this is intentional.
    ngrams = stich(finallst, indices)
    ngrams = [" ".join(ngram) for ngram in ngrams]
    keywordcandidates = generateCandidateKeywordScores(phraseList, wordscores)  
    scored_ngrams = sorted(keywordcandidates.iteritems(), key=operator.itemgetter(1), reverse=True)
    
    # post-filter
    if skillfilter == 'post':
        scored_ngrams = [(ngram, score) for (ngram, score) in scored_ngrams if ngram in skilldict]
    
    # format
    return [{'keyword':pair[0], 'weight':pair[1]} for pair in scored_ngrams]