def main(query, job_title, identity_skills, company, location, logical_operators):
    """Tokenise and spell-correct *query*, then tag its tokens and time the pass.

    :param query: raw search-query string
    :param job_title: collection of known job-title terms
    :param identity_skills: collection of known skill terms
    :param company: collection of known company names
    :param location: collection of known location names
    :param logical_operators: collection of operator words (e.g. "and", "or")
    """
    stopwords = stopword()
    query = remove_special_chars(query.lower())
    # Spell-correct every non-stopword unigram, then append bigrams and trigrams.
    tokens = [correct(q) for q in query.split() if q not in stopwords]
    tokens = tokens + get_bigram(query, stopwords) + get_trigram(query, stopwords)

    # Time the tagging pass; print() with a single argument is valid in both
    # Python 2 and Python 3, unlike the original `print expr` statements.
    t1 = datetime.now()
    print(cfg_lexical_rule(tokens, job_title, identity_skills, company, location, logical_operators))
    t2 = datetime.now()
    print(t2 - t1)
Exemple #2
0
def spell_check(text):
    """Return *text* with every token replaced by its spell-corrected form.

    :param text: input string to correct
    :return: the corrected string
    """
    # NOTE(review): str.replace substitutes every occurrence of the token,
    # including when it appears as a substring of another word — confirm
    # this is acceptable for the inputs this receives.
    for token in tokenise(text):
        text = text.replace(token, spellchecker.correct(token))
    return text
def cfg_lexical_rule(tokens, job_title, identity_skills, company, location, logical_operators):
    """Tag each token (or slash-separated token part) with a lexical category.

    Categories assigned: 'experience', 'skill', 'location', 'company',
    'job_title'.  Later assignments overwrite earlier ones for the same key.

    :param tokens: list of query tokens
    :param job_title: collection of known job-title terms
    :param identity_skills: collection of known skill terms
    :param company: collection of known company names
    :param location: collection of known location names
    :param logical_operators: collection of operator words
    :return: dict mapping token/phrase -> category string
    """
    tag = {}
    part_of_speech = nltk.pos_tag(tokens)

    # Short tokens tagged CD (cardinal) or LS (list item) are treated as
    # experience values, e.g. "5" in "5 years".
    for word, pos in part_of_speech:
        if len(word) < 4 and ('CD' in pos or 'LS' in pos):
            tag[word] = 'experience'

    n = len(tokens)
    for j in range(n):
        token = tokens[j]
        if '/' in token and ' ' not in token:
            # Slash-separated alternatives, e.g. "java/python": tag each part.
            for e in token.split('/'):
                e = correct(e)
                if e in identity_skills:
                    tag[e] = 'skill'
                if e in location:
                    # Guard j > 0: the original indexed tokens[j-1] at j == 0,
                    # which silently wrapped around to tokens[-1].
                    if j > 0 and tokens[j - 1] in logical_operators:
                        tag[tokens[j - 1] + ":" + e] = 'location'
                    else:
                        tag[e] = 'location'
                if e in company:
                    tag[e] = 'company'
                # Guard j + 1 < n: the original raised IndexError when the
                # last token contained a slash-separated skill.
                if e in identity_skills and j + 1 < n and tokens[j + 1] in job_title:
                    tag[e + " " + tokens[j + 1]] = 'job_title'
                if e in job_title:
                    tag[e] = 'job_title'
        elif token in identity_skills and token not in company and token not in location and token not in job_title:
            tag[token] = 'skill'
        elif token in company and token not in identity_skills and token not in location and token not in job_title:
            tag[token] = 'company'
        elif token in location and token not in identity_skills:
            tag[token] = 'location'
        elif token in job_title:
            tag[token] = 'job_title'
    return tag
Exemple #4
0
def correct_words(string):
    """Spell-correct every whitespace-separated word of *string*.

    :param string: input text
    :return: the corrected words joined by single spaces
    """
    # ' '.join builds the result in one pass instead of the original
    # quadratic `corrected += word + ' '` loop (plus trailing strip).
    return ' '.join(spellchecker.correct(word) for word in string.split())
Exemple #5
0
    c = 0
    comentario = ''
    for frase in cluster:
        comentario = comentario + frase[2] + " "

    topicos.append(comentario)
count = 0

# Merge clusters that share the same extracted theme.
print("=========== Resumo ================")
temas = []
clusters = []
for frase in topicos:
    try:
        # Summarise the cluster text, extract a theme phrase, spell-correct it.
        tema = spellchecker.correct(
            extrator.extrair(
                summarizer.summarize(frase, words=20, language='portuguese')))
        if tema not in temas:
            temas.append(tema)
            clusters.append(frase)
            print("Novo Tema: " + tema)
        else:
            # Theme already seen: append this cluster's text to the existing one.
            print("Repetiu: " + tema)
            ind = temas.index(tema)
            clusters[ind] = clusters[ind] + " " + frase
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; summarisation failures are still best-effort.
        print("Sem Tema")
print("Temas")
print(temas)
Exemple #6
0
def countpositivenegtivetestdata():
    """Populate the global `atest` feature rows for every tweet in `test`.

    For tweet i, the columns written are:
      atest[i][0] - count of positive words
      atest[i][1] - count of negative words
      atest[i][2] - count of negation words
      atest[i][3] - positive words found inside segmented hashtags
      atest[i][4] - negative words found inside segmented hashtags
      atest[i][5] - special-POS sentiment sign (1 positive, -1 negative, 0 mixed/none)
      atest[i][6] - 1 if exactly one special-POS polarity fired, else 0

    Relies on module globals: test, atest, positive, negative, negation,
    special, model, spellchecker, twittersegment, nltk.
    """
    global test
    for i in range(len(test)):
        # `words` replaces the original name `str`, which shadowed the builtin.
        words = test[i].split(' ')
        tagged = nltk.pos_tag(nltk.word_tokenize(test[i]))

        special_pos = 0  # positive words whose POS tag is in `special`
        special_neg = 0  # negative words whose POS tag is in `special`

        for j in range(len(words)):
            words[j] = spellchecker.correct(words[j], model)
            word = words[j]

            # NOTE(review): `tagged[j]` assumes word_tokenize produced at least
            # as many tokens as the space split — same assumption as before.
            if word in positive:
                if tagged[j][1] in special:
                    special_pos += 1
                atest[i][0] += 1
            if word in negative:
                if tagged[j][1] in special:
                    special_neg += 1
                atest[i][1] += 1
            if word in negation:
                atest[i][2] += 1

            if '#' in word:
                # Segment the hashtag body and count sentiment in its parts.
                for part in twittersegment.tweet(word[1:]):
                    if part == '':
                        continue
                    if part in positive:
                        atest[i][3] += 1
                    if part in negative:
                        atest[i][4] += 1

        # Flattened from the original nested if/else pyramid; same outcomes.
        if special_pos > 0 and special_neg > 0:
            atest[i][5] = 0
            atest[i][6] = 0
        elif special_pos > 0:
            atest[i][5] = 1
            atest[i][6] = 1
        elif special_neg > 0:
            atest[i][5] = -1
            atest[i][6] = 1
        else:
            atest[i][5] = 0
            atest[i][6] = 0