def probability_word(word, text):
    # Relative frequency of the word in the token list.
    prob = text.count(word) / len(text)
    return prob


'''test if run as application'''
if __name__ == '__main__':
    fname = 'C:\\Users\\navi_\\Dropbox\\NLP\\Corpus\\e960401.htm'
    text_string = get_text_string(fname)
    raw_tokens = get_raw_tokens(text_string)
    tokens = get_clean_tokens(raw_tokens)  # tokens of letters, still with stopwords
    tokens_without_stopwords = delete_stopwords(
        'C:\\Users\\navi_\\Dropbox\\NLP\\stopwords_es.txt', tokens)
    #writeList(tokens_without_stopwords, 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_tokens.txt')
    vocabulary = get_vocabulary(
        tokens_without_stopwords)  # vocabulary of unique tokens, without stopwords
    writeList(
        vocabulary,
        'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_vocabulary.txt')
    #contextDict = retrieve_contexts(tokens_without_stopwords, vocabulary, 8)
    #writeDict(contextDict, 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_contexts.txt')
    '''
    sum = 0
    for v in vocabulary:
        prob = probability_word(v, tokens_without_stopwords)
        print(v, " = ", prob)
        sum = sum + prob
    print("prob=", sum)
    '''
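# The script above relies on helpers defined elsewhere in the project
# (get_text_string, get_raw_tokens, get_clean_tokens, delete_stopwords,
# get_vocabulary, writeList). The bodies below are a minimal, illustrative
# sketch of what they might do; they are assumptions, not the originals.
import re


def get_text_string(fname):
    # Read the HTML corpus file and strip markup; the encoding is a guess for this corpus.
    with open(fname, encoding='latin-1') as f:
        return re.sub(r'<[^>]+>', ' ', f.read())


def get_raw_tokens(text_string):
    # Plain whitespace tokenization.
    return text_string.split()


def get_clean_tokens(raw_tokens):
    # Lower-cased tokens consisting of letters only.
    return [t.lower() for t in raw_tokens if t.isalpha()]


def delete_stopwords(stopwords_file, tokens):
    with open(stopwords_file, encoding='utf-8') as f:
        stopwords = set(f.read().split())
    return [t for t in tokens if t not in stopwords]


def get_vocabulary(tokens):
    # Sorted list of unique tokens.
    return sorted(set(tokens))


def writeList(lst, fname):
    with open(fname, 'w', encoding='utf-8') as f:
        for item in lst:
            f.write(str(item) + '\n')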
    elif lemma is None and lemmas_text.count(v) == 0:
        # no lemma was found for v: keep the word itself, once
        lemmas_text.append(v)
    return lemmas_text


def gen_lemmas(archivo):
    # Build a word -> lemma dictionary from the morphological analysis file.
    f = open(archivo)
    t = f.readline()
    d = {}
    while t != "":
        l = t.split()
        if l != []:
            l[0] = l[0].replace("#", "")  # strip the '#' marker from the word form
            #g.write("%s %s\n" % (l[0], l[-1]))
            d.setdefault(l[0], l[-1])
        t = f.readline()
    f.close()
    return d


if __name__ == '__main__':
    fname_vocabulary = 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_vocabulary.txt'
    f_vocabulary = open(fname_vocabulary, encoding='utf-8')
    voc = f_vocabulary.read()
    vocabulary = voc.split()
    f_vocabulary.close()
    fname_lemmas = 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\generate.txt'
    lemmas_text_dict = gen_lemmas(fname_lemmas)
    lemmas_text_list = lemmas_text(lemmas_text_dict, vocabulary)
    writeList(lemmas_text_list,
              'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_lemmas.txt')
        v_squared = v ** 2
        v_sum = v_squared.sum()
        v_length = math.sqrt(v_sum)
        lengths_product = vc_length * v_length
        similar_words_dict[key] = np.dot(v_to_compare, v) / lengths_product
        i += 1
        print('cosine_similarity function ', str(i), str(similar_words_dict[key]))
    similar_words = sorted(similar_words_dict.items(),
                           key=operator.itemgetter(1),
                           reverse=True)
    return similar_words


'''test if run as application'''
if __name__ == '__main__':
    fname_vocabulary = 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_vocabulary.txt'
    fname_contexts = 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_contexts.txt'
    raw_freq_vectors_dict = raw_freq_vectors(fname_vocabulary, fname_contexts)
    freq_vectors_dict = freq_vectors(raw_freq_vectors_dict)
    word = 'empresa'
    similar_words = cosine_similarity(raw_freq_vectors_dict, word)
    writeList(
        similar_words,
        'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\' + word +
        '_similar_words_without_stopwords.txt')
    similar_words2 = cosine_similarity(freq_vectors_dict, word)
    writeList(
        similar_words2,
        'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\' + word +
        '_similar_words2_without_stopwords.txt')
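# For reference, the per-pair computation above reduces to the standard cosine
# similarity. A self-contained version is sketched below; the function and
# variable names here are illustrative and not part of the original code.
import numpy as np


def cosine(u, v):
    # cos(u, v) = (u . v) / (|u| * |v|); returns nan when either vector is all zeros.
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else float('nan')


# Collinear vectors score 1.0, orthogonal vectors score 0.0.
assert abs(cosine(np.array([1.0, 2.0, 0.0]), np.array([2.0, 4.0, 0.0])) - 1.0) < 1e-9
assert cosine(np.array([1.0, 0.0]), np.array([0.0, 1.0])) == 0.0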
        elif words[-2][0] == 'I':
            if '#' in words[0]:
                words[0] = words[0].replace('#', '')  # eliminate the # symbol
            ilist.append(words[0] + ' ' + words[-2] + ' ' + words[-1])
        elif words[-2][0] == 'F':
            if '#' in words[0]:
                words[0] = words[0].replace('#', '')  # eliminate the # symbol
            flist.append(words[0] + ' ' + words[-2] + ' ' + words[-1])
    return {
        'a': alist,
        'r': rlist,
        'n': nlist,
        'v': vlist,
        'p': plist,
        'dt': dtlist,
        's': slist,
        'c': clist,
        'i': ilist,
        'f': flist
    }


'''test if run as application'''
if __name__ == '__main__':
    diccionario = divideIntoPOS('generate.txt')
    for key in diccionario:
        writeList(diccionario[key], key + 'list.txt')
    condEnt = {}
    for w in vocabulary:
        pw2 = prob_word_in_sentences(w, sentences)
        pw1w2 = prob_conj('empresa', w, sentences)
        entropy = cond_entropy(pw1, pw2, pw1w2)
        if entropy:
            condEnt[w] = entropy
    return sorted(condEnt.items(), key=operator.itemgetter(1))


if __name__ == '__main__':
    """get the sentences of the text"""
    fname = 'C:\\Users\\navi_\\Dropbox\\NLP\\Corpus\\e960401.htm'
    text_string = get_text_string(fname)
    sentences = getSentences(text_string)
    print('Number of sentences: ', len(sentences))
    """get the vocabulary"""
    fname_vocabulary = 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_vocabulary.txt'
    f_vocabulary = open(fname_vocabulary, encoding='utf-8')
    voc = f_vocabulary.read()
    vocabulary = voc.split()
    f_vocabulary.close()
    """compute the conditional entropy of 'empresa' with each vocabulary word"""
    condEnt = cond_entropy_of_text('empresa', sentences, vocabulary)
    writeList(
        condEnt,
        'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\empresa_condEnt.txt')
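# cond_entropy itself is not part of this fragment. Purely as an illustration,
# the conditional entropy H(W1 | W2) of the binary "word occurs in a sentence"
# indicators can be computed from the three estimated probabilities as
# H(W1, W2) - H(W2); the sketch below is an assumption, not the original code.
import math


def cond_entropy(pw1, pw2, pw1w2):
    # Joint distribution over (w1 present?, w2 present?).
    joint = [pw1w2,                   # both words occur
             pw1 - pw1w2,             # only w1
             pw2 - pw1w2,             # only w2
             1 - pw1 - pw2 + pw1w2]   # neither
    h_joint = -sum(p * math.log2(p) for p in joint if p > 0)
    h_w2 = -sum(p * math.log2(p) for p in (pw2, 1 - pw2) if p > 0)
    return h_joint - h_w2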
    pw1 = smooth_prob_word_in_sentences(word, sentences)
    mutInfo = {}
    for w in vocabulary:
        pw2 = smooth_prob_word_in_sentences(w, sentences)
        pw1w2 = smooth_prob_conj(word, w, sentences)  # joint probability of the target word and w
        mi = mutual_information(pw1, pw2, pw1w2)
        mutInfo[w] = mi
    return sorted(mutInfo.items(), key=operator.itemgetter(1), reverse=True)


if __name__ == '__main__':
    """get the text and tokenize it into sentences"""
    fname = 'C:\\Users\\navi_\\Dropbox\\NLP\\Corpus\\e960401.htm'
    text_string = get_text_string(fname)
    sentences = getSentences(text_string)
    #print('Number of sentences: ', len(sentences))
    """get the vocabulary"""
    fname_vocabulary = 'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_vocabulary.txt'
    f_vocabulary = open(fname_vocabulary, encoding='utf-8')
    voc = f_vocabulary.read()
    vocabulary = voc.split()
    f_vocabulary.close()
    """compute the mutual information between 'empresa' and each vocabulary word"""
    mutInfo = mutual_information_of_text('empresa', sentences, vocabulary)
    writeList(
        mutInfo,
        'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\empresa_mutual_information.txt'
    )
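# mutual_information is defined elsewhere. As a rough illustration, a
# pointwise-mutual-information style score computed from the same three
# probabilities could look like this (an assumption, not the original):
import math


def mutual_information(pw1, pw2, pw1w2):
    # PMI(w1, w2) = log2( p(w1, w2) / (p(w1) * p(w2)) ).
    # The smoothed probabilities used above keep the argument strictly positive.
    return math.log2(pw1w2 / (pw1 * pw2))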
        for j in range(i - int(windowSize / 2), i):  # left context
            if j >= 0:
                context.append(text[j])
        try:
            for j in range(i + 1, i + (int(windowSize / 2) + 1)):  # right context
                context.append(text[j])
        except IndexError:
            pass
        contextDict[w] = context
    return contextDict


'''test if run as application'''
if __name__ == '__main__':
    fname = 'e960401.htm'
    text_string = get_text_string(fname)
    #print(text_string)
    raw_tokens = get_raw_tokens(text_string)
    #print(raw_tokens)
    tokens = get_clean_tokens(raw_tokens)  # tokens of letters, still with stopwords
    print(tokens)
    tokens_without_stopwords = delete_stopwords('stopwords_es.txt', tokens)
    writeList(tokens_without_stopwords, 'e960401_tokens.txt')
    vocabulary = get_vocabulary(tokens_without_stopwords)  # vocabulary of unique tokens, without stopwords
    writeList(vocabulary, 'e960401_vocabulary.txt')
    contextDict = retrieve_contexts(tokens_without_stopwords, vocabulary, 8)
    writeDict(contextDict, 'e960401_contexts.txt')
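# writeDict is defined elsewhere in the project. An illustrative stand-in that
# writes one "word: context tokens" line per vocabulary entry might look like
# this (an assumption, not the original implementation):
def writeDict(d, fname):
    with open(fname, 'w', encoding='utf-8') as f:
        for key, values in d.items():
            f.write(key + ': ' + ' '.join(values) + '\n')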
    n1 = np.sqrt(v1 @ v1)
    n2 = np.sqrt(v2 @ v2)
    # NumPy scalars do not raise ZeroDivisionError, so check the norms explicitly.
    if n1 == 0 or n2 == 0:
        return float('nan')
    return prod / (n1 * n2)


if __name__ == '__main__':
    fname = 'C:\\Users\\navi_\\Dropbox\\NLP\\Corpus\\e960401.htm'
    text_string = get_text_string(fname)
    raw_tokens = get_raw_tokens(text_string)
    cleaned_tokens = clean_tokens(raw_tokens)  # avoid shadowing the clean_tokens function
    writeList(
        cleaned_tokens,
        'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_clean_tokens.txt')
    difference = compare_lists(raw_tokens, cleaned_tokens)
    writeList(
        sorted(difference),
        'C:\\Users\\navi_\\Dropbox\\NLP\\Programas\\e960401_difference.txt')
    """bag_e = context_word(cleaned_tokens, 'empresa', 8)
    bag_c = context_word(cleaned_tokens, 'compañía', 8)
    bag_a = context_word(cleaned_tokens, 'agua', 8)
    voc = set(cleaned_tokens)
    vectore = np.array(vsm(bag_e, list(voc)))
    vectorc = np.array(vsm(bag_c, list(voc)))
    vectora = np.array(vsm(bag_a, list(voc)))
        if (' '.join(words[:i + 1]).istitle()
                or ' '.join(words[:i + 1]).isupper()) and words[i].isalnum():
            cap_let_word = ' '.join(words[:i + 1])
            i += 1
        else:
            break
    if len(words[i:]) < 2:
        return [cap_let_word]
    else:
        return [cap_let_word] + get_capital_letter_words(words[i + 1:])


if __name__ == '__main__':
    articles = split_into_articles(
        'C:\\Users\\navi_\\Dropbox\\NLP\\Corpus\\e960401.htm')
    sentences = []
    for a in articles:
        sents = getSentences(a)
        for s in sents:
            sentences.append(s)
    cl_words = []
    for s in sentences:
        words = nltk.word_tokenize(s)
        cl_words = cl_words + get_capital_letter_words(words)
    cl_words = sorted(set(cl_words))
    writeList(cl_words, 'name_entity.txt')