from datetime import datetime


def main(query, job_title, identity_skills, company, location, logical_operators):
    # Normalise the query, drop stopwords, spell-correct each token, and extend
    # the token list with bigrams/trigrams before lexical tagging.
    # stopword(), remove_special_chars(), correct(), get_bigram() and
    # get_trigram() are helpers defined elsewhere in this module.
    stopwords = stopword()
    query = remove_special_chars(query.lower())
    tokens = [correct(q) for q in query.split() if q not in stopwords]
    tokens = tokens + get_bigram(query, stopwords) + get_trigram(query, stopwords)
    t1 = datetime.now()
    print cfg_lexical_rule(tokens, job_title, identity_skills, company, location, logical_operators)
    t2 = datetime.now()
    print t2 - t1
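# Illustrative call, a minimal sketch only: the gazetteer lists here are made-up
# sample data; in the real pipeline they would be loaded from the module's own
# dictionaries. main() prints the tag dict returned by cfg_lexical_rule() and
# the elapsed tagging time.
#
#     main("java develper bangalore 3 years",
#          job_title=['developer'],
#          identity_skills=['java'],
#          company=[],
#          location=['bangalore'],
#          logical_operators=['in', 'near'])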
def spell_check(text):
    """Spell-correct every token of ``text``.

    :param text: raw input string
    :return: the string with each token replaced by spellchecker.correct(token)
    """
    token_text = tokenise(text)
    for word in token_text:
        correct_word = spellchecker.correct(word)
        # str.replace substitutes every occurrence of `word`, including
        # occurrences inside longer tokens.
        text = text.replace(word, correct_word)
    return text
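# Hypothetical usage sketch, assuming tokenise() splits on whitespace and
# spellchecker.correct() returns a single best correction (both are defined
# elsewhere in this project). Exact output depends on the spellchecker model:
#
#     spell_check("Pythn develper")   # -> e.g. "Python developer"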
import nltk


def cfg_lexical_rule(tokens, job_title, identity_skills, company, location, logical_operators):
    # Get the part of speech for every token and tag tokens against the
    # gazetteer lists (skills, companies, locations, job titles).
    tag = {}
    part_of_speech = nltk.pos_tag(tokens)
    grammer_pattern = ",".join(i[1] for i in part_of_speech)  # currently unused
    year_rule = ['CD,NNS,CD,NNS', 'CD,-NONE-,CD', 'CD,CD', 'LS,NNS,CD,NNS']  # currently unused
    salary_rule = ['CD,NNS,JJ', 'JJR,IN,CD,NNS,JJ', 'CD,NN,NN', 'CD,JJ']  # currently unused
    experience_stack = {}  # currently unused

    # Short numeric tokens (CD/LS part of speech) are treated as experience.
    for i in part_of_speech:
        if len(i[0]) < 4 and ('CD' in i[1] or 'LS' in i[1]):
            tag[i[0]] = 'experience'

    print tokens
    for j in xrange(len(tokens)):
        if '/' in tokens[j] and ' ' not in tokens[j]:
            # Slash-separated alternatives, e.g. "java/python".
            for e in tokens[j].split("/"):
                e = correct(e)
                if e in identity_skills:
                    tag[e] = 'skill'
                if e in location and j > 0 and tokens[j - 1] in logical_operators:
                    tag[tokens[j - 1] + ":" + e] = 'location'
                if e in location and (j == 0 or tokens[j - 1] not in logical_operators):
                    tag[e] = 'location'
                if e in company:
                    tag[e] = 'company'
                if e in identity_skills and j + 1 < len(tokens) and tokens[j + 1] in job_title:
                    tag[e + " " + tokens[j + 1]] = 'job_title'
                if e in job_title:
                    tag[e] = 'job_title'
        elif tokens[j] in identity_skills and tokens[j] not in company and tokens[j] not in location and tokens[j] not in job_title:
            tag[tokens[j]] = 'skill'
        elif tokens[j] in company and tokens[j] not in identity_skills and tokens[j] not in location and tokens[j] not in job_title:
            tag[tokens[j]] = 'company'
        elif tokens[j] in location and tokens[j] not in identity_skills:
            tag[tokens[j]] = 'location'
        elif tokens[j] in job_title:
            tag[tokens[j]] = 'job_title'
    return tag
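# Illustrative call, a sketch only: the gazetteer lists below are made-up sample
# data, and the tag for '3' assumes NLTK assigns it the CD part of speech. With
# those assumptions the result is roughly
# {'3': 'experience', 'java': 'skill', 'developer': 'job_title', 'bangalore': 'location'}.
#
#     cfg_lexical_rule(
#         ['java', 'developer', 'in', 'bangalore', '3'],
#         job_title=['developer'],
#         identity_skills=['java'],
#         company=[],
#         location=['bangalore'],
#         logical_operators=['in'],
#     )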
def correct_words(string):
    # Spell-correct each whitespace-separated word and rebuild the string.
    corrected = ""
    for word in string.split():
        corrected += spellchecker.correct(word)
        corrected += ' '
    return corrected.strip(' ')
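# Minimal usage sketch: correction is applied per token, so surrounding context
# is not taken into account. Output depends on the spellchecker's dictionary:
#
#     correct_words("speling mistaek")   # -> e.g. "spelling mistake"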
c = 0
comentario = ''
for frase in cluster:
    comentario = comentario + frase[2] + " "
topicos.append(comentario)
count = 0

# Merge clusters that share the same theme.
print("=========== Resumo ================")
temas = []
clusters = []
for frase in topicos:
    try:
        # Summarise the cluster text, extract its theme and spell-correct it.
        tema = spellchecker.correct(
            extrator.extrair(
                summarizer.summarize(frase, words=20, language='portuguese')))
        if tema not in temas:
            temas.append(tema)
            clusters.append(frase)
            print("Novo Tema: " + tema)
        else:
            # Theme already seen: append this cluster's text to the existing one.
            print("Repetiu: " + tema)
            ind = temas.index(tema)
            cluster = clusters[ind]
            clusters[ind] = cluster + " " + frase
    except Exception:
        print("Sem Tema")
print("Temas")
print(temas)
def countpositivenegtivetestdata():
    # Count positive/negative/negation words (and hashtag segments) in every
    # test tweet and store the counts as features in the global atest matrix.
    # Relies on module-level globals: test, atest, positive, negative, negation,
    # special, model, spellchecker, twittersegment and nltk.
    global test
    i = 0
    while i < len(test):
        str = test[i].split(' ')  # note: shadows the built-in str
        j = 0
        text = nltk.word_tokenize(test[i])
        tk = nltk.pos_tag(text)
        sp1 = 0
        sp2 = 0
        while j < len(str):
            str[j] = spellchecker.correct(str[j], model)
            if str[j] in positive:
                if tk[j][1] in special:
                    sp1 += 1
                atest[i][0] += 1
            if str[j] in negative:
                if tk[j][1] in special:
                    sp2 += 1
                atest[i][1] += 1
            if str[j] in negation:
                atest[i][2] += 1
            if '#' in str[j]:
                # Segment the hashtag and count positive/negative pieces.
                x = twittersegment.tweet(str[j][1:])
                k = 0
                while k < len(x):
                    if x[k] == '':
                        k += 1
                        continue
                    if x[k] in positive:
                        atest[i][3] += 1
                    if x[k] in negative:
                        atest[i][4] += 1
                    k += 1
            j += 1
        # Features 5/6 encode whether "special"-POS positives/negatives occurred.
        if sp1 > 0 and sp2 > 0:
            atest[i][5] = 0
            atest[i][6] = 0
        elif sp1 > 0:
            atest[i][5] = 1
            atest[i][6] = 1
        elif sp2 > 0:
            atest[i][5] = -1
            atest[i][6] = 1
        else:
            atest[i][5] = 0
            atest[i][6] = 0
        i += 1