def performTask(rawtext):
    """Run the two-stage skill extraction on ``rawtext``.

    The raw text is preprocessed and POS-tagged, then lower-cased and handed
    to ``generate_naive`` together with the dictionary returned by
    ``buildskilldict(skills)``. The naive result, the tagged text and the
    module-level ``words`` / ``clusters`` tables are then passed to
    ``new_guesses``.

    Returns:
        A ``(naive_skills, expanded_skills)`` tuple holding the results of
        ``generate_naive`` and ``new_guesses`` respectively.
    """
    cleaned = textprocess.preprocess(rawtext)

    # Tag before lower-casing: the (Stanford) tagger might make use of the
    # capitalisation, so the case fold is deferred until after tagging.
    tagged = pt.tag(cleaned)
    # Single-argument parenthesised form prints the same on Python 2 and 3.
    print(tagged)

    lowered = cleaned.lower()
    skill_lookup = buildskilldict(skills)
    naive = generate_naive(lowered, skill_lookup)
    expanded = new_guesses(tagged, naive, words, clusters)
    return naive, expanded
def _crackr_viable_words(ngrams):
    """Return the set of words that share a cluster with any known n-gram.

    Each n-gram is looked up in the module-level ``words`` table; element 1
    of the entry is used as a key into the module-level ``clusters`` table,
    whose values are iterated as ``(word, _)`` pairs.
    """
    cluster_ids = []
    for ngram in ngrams:
        try:
            cluster_id = words[ngram][1]
        except (KeyError, IndexError, TypeError):
            # Best-effort lookup: n-grams that are unknown, unhashable, or
            # whose entry has no element 1 are skipped. Anything else (for
            # example a NameError) is a real bug and must surface.
            continue
        cluster_ids.append(cluster_id)

    viable = set()
    for cluster_id in set(cluster_ids):
        for (word, _) in clusters[cluster_id]:
            viable.add(word)
    return viable


def _crackr_select_words(pos_tagged, viablewords):
    """Select tagged words that are viable, along with their positions.

    A ``(word, tag)`` pair is kept when the lower-cased word is in
    ``viablewords`` and the tag starts with 'N' or 'J', or the tag is 'CC'
    and the immediately preceding tag starts with 'N'.
    (Presumably Penn Treebank noun / adjective / coordinating-conjunction
    tags -- confirm against the tagger in use.)

    Returns:
        ``(selected, positions)``: parallel lists of the kept words and
        their indices within ``pos_tagged``.
    """
    selected = []
    positions = []
    # Start with an empty "previous tag" so that a conjunction in the very
    # first position is never treated as following a noun. Indexing with
    # ``index - 1`` would silently wrap around to the last token instead.
    previous_tag = ''
    for position, (word, tag) in enumerate(pos_tagged):
        # ``tag[:1]`` rather than ``tag[0]`` so an empty tag cannot raise.
        content_tag = tag[:1] in ('N', 'J')
        joins_noun = tag == 'CC' and previous_tag[:1] == 'N'
        if (content_tag or joins_noun) and word.lower() in viablewords:
            selected.append(word)
            positions.append(position)
        previous_tag = tag
    return selected, positions


def crackr(text, skillfilter=None):
    """Score keyword candidates found in ``text``.

    Args:
        text: Raw input text. It is preprocessed and lower-cased before any
            further processing.
        skillfilter: One of
            - ``None``: don't filter;
            - ``"pre"``: filter a priori -- only tokens present in the
              module-level ``skilldict`` are used to build n-grams;
            - ``"post"``: filter a posteriori -- only scored keywords
              present in ``skilldict`` are returned.
            Any other value behaves like ``None``.

    Returns:
        A list of ``{'keyword': ..., 'weight': ...}`` dicts, ordered by
        descending weight, built from the result of
        ``generateCandidateKeywordScores``.
    """
    # preprocess text
    text = textprocess.preprocess(text).lower()

    # tokenize and score candidate phrases
    sentences = splitSentences(text)
    phrases = generateCandidateKeywords(sentences, stopwordpattern)
    wordscores = calculateWordScores(phrases)

    # generate 1-, 2- and 3-grams (optionally pre-filtered)
    tokens = text.split()
    if skillfilter == 'pre':
        tokens = [token for token in tokens if token in skilldict]
    ngrams = [ngram
              for size in (1, 2, 3)
              for ngram in generate_ngrams(tokens, size)]

    # restrict to words that share a cluster with one of the n-grams
    viablewords = _crackr_viable_words(ngrams)

    # pos tag and keep the viable words with matching tags
    selected, positions = _crackr_select_words(pt.tag(text), viablewords)

    # generate keyword phrases
    # NOTE(review): these stitched phrases are computed but never used in
    # the returned value, which comes solely from the candidate scores
    # below. Kept to preserve the existing call sequence -- confirm whether
    # they were meant to filter the output or can be dropped.
    stitched_phrases = [" ".join(parts) for parts in stich(selected, positions)]

    candidates = generateCandidateKeywordScores(phrases, wordscores)
    # ``items()`` instead of ``iteritems()``: same result on Python 2 and
    # also works on Python 3.
    ranked = sorted(candidates.items(),
                    key=operator.itemgetter(1),
                    reverse=True)

    # post-filter
    if skillfilter == 'post':
        ranked = [(keyword, weight)
                  for (keyword, weight) in ranked
                  if keyword in skilldict]

    # format
    return [{'keyword': keyword, 'weight': weight}
            for (keyword, weight) in ranked]