Example #1

def probability_word(word, text):
    """Relative frequency of `word` in the token list `text`."""
    prob = text.count(word) / len(text)
    return prob


'''test if run as application'''
if __name__ == '__main__':
    fname = 'C:\\Users\\navi_\\Dropbox\\NLP\\Corpus\\e960401.htm'
    text_string = get_text_string(fname)
    raw_tokens = get_raw_tokens(text_string)
    tokens = get_clean_tokens(raw_tokens)  #tokens of letters, with stopwords
    tokens_without_stopwords = delete_stopwords(
        'C:\\Users\\navi_\\Dropbox\\NLP\\stopwords_es.txt', tokens)
    #writeList(tokens_without_stopwords, 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_tokens.txt')
    vocabulary = get_vocabulary(
        tokens_without_stopwords)  #vocabulary of unique tokens, without stopwords
    writeList(
        vocabulary,
        'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_vocabulary.txt')
    #contextDict=retrieve_contexts(tokens_without_stopwords, vocabulary, 8)
    #writeDict(contextDict, 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_contexts.txt')
    '''
    total = 0
    for v in vocabulary:
        prob = probability_word(v, tokens_without_stopwords)
        print(v, " = ", prob)
        total += prob
    print("prob =", total)  #the probabilities should sum to ~1.0
    '''
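
The commented-out check above calls text.count once per vocabulary word, which is quadratic in the corpus size. A minimal single-pass alternative with collections.Counter (probability_table is a hypothetical helper, not part of the original code):

from collections import Counter

def probability_table(tokens):
    """One-pass relative frequencies; the values sum to 1.0 by construction."""
    counts = Counter(tokens)
    n = len(tokens)
    return {w: c / n for w, c in counts.items()}
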
Example #2
        elif lemma is None and lemmas_text.count(v) == 0:
            lemmas_text.append(v)  #no lemma found: keep the word itself
    return lemmas_text
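
Example #2 begins mid-function; below is a runnable reconstruction of the missing head of lemmas_text, inferred from the visible tail and the call in __main__ (the dict lookup via .get and the exact control flow are assumptions):

def lemmas_text(lemma_dict, vocabulary):
    """Map each vocabulary word to its lemma, keeping one entry per result (sketch)."""
    lemmas_text = []
    for v in vocabulary:
        lemma = lemma_dict.get(v)  #assumed: None when the word has no known lemma
        if lemma is not None and lemmas_text.count(lemma) == 0:
            lemmas_text.append(lemma)
        elif lemma is None and lemmas_text.count(v) == 0:
            lemmas_text.append(v)  #fall back to the word itself
    return lemmas_text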


def gen_lemmas(archivo):
    """Build a word -> lemma dictionary from a generate.txt-style file."""
    d = {}
    with open(archivo) as f:
        for t in f:
            l = t.split()
            if l:
                l[0] = l[0].replace("#", "")  #strip the leading '#' marker
                #g.write("%s %s\n" %(l[0],l[-1]))
                d.setdefault(l[0], l[-1])  #keep the first lemma seen for each word
    return d
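
A hypothetical round-trip check of gen_lemmas; the two-field line format ('#wordform ... lemma') is assumed from the parsing above, not from any documentation:

import os
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False,
                                 encoding='utf-8') as tmp:
    tmp.write("#canta cantar\n#cantamos cantar\n")  #hypothetical sample lines
print(gen_lemmas(tmp.name))  #expected: {'canta': 'cantar', 'cantamos': 'cantar'}
os.remove(tmp.name)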


if __name__ == '__main__':
    fname_vocabulary = 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_vocabulary.txt'
    f_vocabulary = open(fname_vocabulary, encoding='utf-8')
    voc = f_vocabulary.read()
    vocabulary = voc.split()
    f_vocabulary.close()
    fname_lemmas = 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\generate.txt'
    lemmas_text_dict = gen_lemmas(fname_lemmas)
    lemmas_text_list = lemmas_text(lemmas_text_dict, vocabulary)
    writeList(lemmas_text_list,
              'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_lemmas.txt')
Example #3
        v_squared = v**2
        v_sum = v_squared.sum()
        v_length = math.sqrt(v_sum)
        lengths_product = vc_length * v_length

        #guard against zero-length vectors to avoid dividing by zero
        similar_words_dict[key] = (np.dot(v_to_compare, v) / lengths_product
                                   if lengths_product else 0.0)
        i += 1
        print('cosine_similarity function ', str(i),
              str(similar_words_dict[key]))

    similar_words = sorted(similar_words_dict.items(),
                           key=operator.itemgetter(1),
                           reverse=True)
    return similar_words


'''test if run as application'''
if __name__ == '__main__':
    fname_vocabulary = 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_vocabulary.txt'
    fname_contexts = 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_contexts.txt'
    raw_freq_vectors_dict = raw_freq_vectors(fname_vocabulary, fname_contexts)
    freq_vectors_dict = freq_vectors(raw_freq_vectors_dict)
    word = 'empresa'
    similar_words = cosine_similarity(raw_freq_vectors_dict, word)
    writeList(
        similar_words, 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\' + word +
        '_similar_words_without_stopwords.txt')
    similar_words2 = cosine_similarity(freq_vectors_dict, word)
    writeList(
        similar_words2, 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\' + word +
        '_similar_words2_without_stopwords.txt')
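
For comparison, the per-word loop above can be packed into one self-contained helper; rank_by_cosine is an illustrative sketch (hypothetical name, not the original cosine_similarity) that guards zero-length vectors with np.linalg.norm:

import numpy as np

def rank_by_cosine(vectors, word):
    """Rank every entry of a word -> vector dict by cosine similarity to `word`."""
    target = np.asarray(vectors[word], dtype=float)
    t_norm = np.linalg.norm(target)
    scores = {}
    for w, v in vectors.items():
        v = np.asarray(v, dtype=float)
        denom = t_norm * np.linalg.norm(v)
        scores[w] = float(np.dot(target, v) / denom) if denom else 0.0
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
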
Example #4
            elif words[-2][0] == 'I':
                if '#' in words[0]:
                    words[0] = words[0].replace('#', '')  #eliminate the # symbol
                ilist.append(words[0] + ' ' + words[-2] + ' ' + words[-1])
            elif words[-2][0] == 'F':
                if '#' in words[0]:
                    words[0] = words[0].replace('#', '')  #eliminate the # symbol
                flist.append(words[0] + ' ' + words[-2] + ' ' + words[-1])

    return {
        'a': alist,
        'r': rlist,
        'n': nlist,
        'v': vlist,
        'p': plist,
        'dt': dtlist,
        's': slist,
        'c': clist,
        'i': ilist,
        'f': flist
    }


'''test if run as application'''
if __name__ == '__main__':
    diccionario = divideIntoPOS('generate.txt')
    for key in diccionario:
        writeList(diccionario[key], key + 'list.txt')
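
The elif chain above maintains ten parallel lists; a hedged alternative sketch that groups lines with a single defaultdict keyed by the POS tag's initial (divide_into_pos is a hypothetical rewrite, and the 'dt' special case is an assumption based on the returned keys):

from collections import defaultdict

def divide_into_pos(lines):
    """Group 'word TAG lemma'-style lines by POS tag initial."""
    groups = defaultdict(list)
    for line in lines:
        words = line.split()
        if len(words) < 2:
            continue
        word = words[0].replace('#', '')  #eliminate the # symbol
        tag = words[-2]
        key = 'dt' if tag.lower().startswith('dt') else tag[0].lower()
        groups[key].append(word + ' ' + tag + ' ' + words[-1])
    return groups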
Example #5
    condEnt = {}

    for w in vocabulary:
        pw2 = prob_word_in_sentences(w, sentences)
        pw1w2 = prob_conj(word, w, sentences)  #use the parameter rather than a hardcoded 'empresa'
        entropy = cond_entropy(pw1, pw2, pw1w2)
        if entropy:
            condEnt[w] = entropy

    return sorted(condEnt.items(), key=operator.itemgetter(1))


if __name__ == '__main__':
    """obteniendo oraciones del texto"""
    fname = 'C:\\Users\\navi_\\Dropbox\\NLP\\Corpus\\e960401.htm'
    text_string = get_text_string(fname)
    sentences = getSentences(text_string)
    print('Number of sentences: ', len(sentences))
    """obteniendo el vocabulario"""
    fname_vocabulary = 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_vocabulary.txt'
    f_vocabulary = open(fname_vocabulary, encoding='utf-8')
    voc = f_vocabulary.read()
    vocabulary = voc.split()
    f_vocabulary.close()
    """obteniendo la entropia condicional de empresa con las palabras del vocabulario"""
    condEnt = cond_entropy_of_text('empresa', sentences, vocabulary)
    writeList(
        condEnt,
        'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\empresa_condEnt.txt')
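
The helpers prob_word_in_sentences, prob_conj, and cond_entropy are defined elsewhere in the project. One plausible cond_entropy consistent with the call above, assuming the three probabilities describe binary sentence-occurrence indicator variables (an assumption, not the confirmed original):

import math

def cond_entropy(pw1, pw2, pw1w2):
    """H(W1|W2) for binary occurrence events, from the marginals and the joint."""
    joint = {
        (1, 1): pw1w2,
        (1, 0): pw1 - pw1w2,
        (0, 1): pw2 - pw1w2,
        (0, 0): 1.0 - pw1 - pw2 + pw1w2,
    }
    h = 0.0
    for (a, b), p in joint.items():
        p_b = pw2 if b else 1.0 - pw2
        if p <= 0 or p_b <= 0:
            continue  #0 * log 0 terms contribute nothing
        h -= p * math.log2(p / p_b)  #-sum over p(a,b) * log2 p(a|b)
    return h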
Example #6
    pw1 = smooth_prob_word_in_sentences(word, sentences)

    mutInfo = {}

    for w in vocabulary:
        pw2 = smooth_prob_word_in_sentences(w, sentences)
        pw1w2 = smooth_prob_conj(word, w, sentences)  #use the parameter rather than a hardcoded 'empresa'
        mi = mutual_information(pw1, pw2, pw1w2)
        mutInfo[w] = mi

    return sorted(mutInfo.items(), key=operator.itemgetter(1), reverse=True)


if __name__ == '__main__':
    """obteniendo el texto para tokenizar por oraciones"""
    fname = 'C:\\Users\\navi_\\Dropbox\\NLP\\Corpus\\e960401.htm'
    text_string = get_text_string(fname)
    sentences = getSentences(text_string)
    #print('Number of sentences: ', len(sentences))
    """obteniendo el vocabulario"""
    fname_vocabulary = 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_vocabulary.txt'
    f_vocabulary = open(fname_vocabulary, encoding='utf-8')
    voc = f_vocabulary.read()
    vocabulary = voc.split()
    f_vocabulary.close()
    """obteniendo la informacion mutua entre empresa y las palabras del vocabulario"""
    mutInfo = mutual_information_of_text('empresa', sentences, vocabulary)
    writeList(
        mutInfo,
        'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\empresa_mutual_information.txt'
    )
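
mutual_information itself is not shown; since the smoothed probabilities are never zero, a minimal sketch consistent with the call above is pointwise mutual information, PMI(w1, w2) = log2(p(w1, w2) / (p(w1) p(w2))) (an assumption, not the confirmed original):

import math

def mutual_information(pw1, pw2, pw1w2):
    """Pointwise mutual information of two word-occurrence events."""
    return math.log2(pw1w2 / (pw1 * pw2))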
Example #7
                for j in range(i - int(windowSize / 2), i):  #left context
                    if j >= 0:
                        context.append(text[j])
                try:
                    for j in range(i + 1, i + int(windowSize / 2) + 1):  #right context
                        context.append(text[j])
                except IndexError:
                    pass
        contextDict[w] = context

    return contextDict

'''test if run as application'''
if __name__ == '__main__':
    fname = 'e960401.htm'
    text_string = get_text_string(fname)
    #print(text_string)
    raw_tokens = get_raw_tokens(text_string)
    #print(raw_tokens)
    tokens = get_clean_tokens(raw_tokens)  #tokens of letters, with stopwords
    print(tokens)
    tokens_without_stopwords = delete_stopwords('stopwords_es.txt', tokens)
    writeList(tokens_without_stopwords, 'e960401_tokens.txt')

    vocabulary = get_vocabulary(tokens_without_stopwords)  #vocabulary of unique tokens, without stopwords
    writeList(vocabulary, 'e960401_vocabulary.txt')

    contextDict = retrieve_contexts(tokens_without_stopwords, vocabulary, 8)
    writeDict(contextDict, 'e960401_contexts.txt')
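
The fragment above shows only the inner loops of retrieve_contexts; here is a self-contained sketch of the whole routine (the signature is assumed from the call in __main__), with bounds-safe slices replacing the index checks and the try/except:

def retrieve_contexts(text, vocabulary, window_size):
    """Collect a symmetric bag-of-words context for every vocabulary word."""
    half = window_size // 2
    contexts = {}
    for w in vocabulary:
        context = []
        for i, token in enumerate(text):
            if token == w:
                context.extend(text[max(0, i - half):i])  #left context
                context.extend(text[i + 1:i + half + 1])  #right context; slices never overflow
        contexts[w] = context
    return contexts
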
Example #8
    n1 = np.sqrt(v1 @ v1)
    n2 = np.sqrt(v2 @ v2)
    denom = n1 * n2
    #numpy scalars return inf/nan instead of raising ZeroDivisionError,
    #so test the denominator explicitly and return a real NaN, not the string "nan"
    res = prod / denom if denom else float('nan')
    return res


if __name__ == '__main__':
    fname = 'C:\\Users\\navi_\\Dropbox\\NLP\\Corpus\\e960401.htm'
    text_string = get_text_string(fname)
    raw_tokens = get_raw_tokens(text_string)
    tokens_clean = clean_tokens(raw_tokens)  #renamed so the clean_tokens function is not shadowed
    writeList(
        tokens_clean,
        'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_clean_tokens.txt')

    difference = compare_lists(raw_tokens, tokens_clean)
    writeList(
        sorted(difference),
        'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_difference.txt')
    """bag_e =  context_word(clean_tokens, 'empresa', 8)
    bag_c =  context_word(clean_tokens, 'compañía', 8)
    bag_a =  context_word(clean_tokens, 'agua', 8)

    voc=set(clean_tokens)

    vectore = np.array(vsm(bag_e, list(voc)))
    vectorc = np.array(vsm(bag_c, list(voc)))
    vectora = np.array(vsm(bag_a, list(voc)))
Example #9
        if (' '.join(words[:i + 1]).istitle()
                or ' '.join(words[:i + 1]).isupper()) and words[i].isalnum():
            cap_let_word = ' '.join(words[:i + 1])
            i += 1
        else:
            break
    if len(words[i:]) < 2:
        return [cap_let_word]
    else:
        return [cap_let_word] + get_capital_letter_words(words[i + 1:])
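
The top of get_capital_letter_words is truncated; a runnable reconstruction, assuming the missing lines initialize i = 0 and an empty accumulator inside a while loop (the visible tail is kept as-is):

def get_capital_letter_words(words):
    """Extract maximal capitalized runs from a token list as entity candidates."""
    i = 0
    cap_let_word = ''
    while i < len(words):
        prefix = ' '.join(words[:i + 1])
        if (prefix.istitle() or prefix.isupper()) and words[i].isalnum():
            cap_let_word = prefix  #grow the capitalized run
            i += 1
        else:
            break
    if len(words[i:]) < 2:
        return [cap_let_word]
    return [cap_let_word] + get_capital_letter_words(words[i + 1:])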


if __name__ == '__main__':
    articles = split_into_articles(
        'C:\\Users\\navi_\\Dropbox\\NLP\\Corpus\\e960401.htm')

    sentences = []

    for a in articles:
        sents = getSentences(a)
        for s in sents:
            sentences.append(s)

    cl_words = []
    for s in sentences:
        words = nltk.word_tokenize(s)
        cl_words = cl_words + get_capital_letter_words(words)
    cl_words = sorted(set(filter(None, cl_words)))  #dedupe and drop empty matches

    writeList(cl_words, 'name_entity.txt')