def getData(Name, Type, Level):
    """Read a text file and split it at the requested granularity.

    Args:
        Name: Path of the file to read.
        Type: Kind of source; only ``'text_file'`` is handled.
        Level: ``'line'`` (the whole cleaned text as one string), ``'char'``
            (list of characters), ``'word'`` (``wt`` tokens),
            ``'sentence_word'`` (``wt`` tokens for each ``st`` sentence) or
            ``'sentence'`` (``st`` sentences).

    Returns:
        The cleaned text in the shape selected by ``Level``.

    Raises:
        ValueError: If ``Type`` or ``Level`` is not one of the values above.
    """
    if Type != 'text_file':
        raise ValueError("Unsupported Type: %r" % (Type,))

    # Context manager so the handle is released even if read() fails.
    with open(Name, 'r') as handle:
        line = handle.read()
    line = clean_lower(line, ['lower'])  # convert to lower case

    if Level == 'line':
        genrated = line
    elif Level == 'char':
        # str.split("") raises "empty separator"; list() yields the characters.
        genrated = list(line)
    elif Level == 'word':
        genrated = wt(line)
    elif Level == 'sentence_word':
        genrated = [wt(s) for s in st(line)]
    elif Level == 'sentence':
        genrated = st(line)
        print("Total Sentences: ", len(genrated))
    else:
        raise ValueError("Unsupported Level: %r" % (Level,))
    return genrated
def scoreline(line1, line2, metric, ic=None):
    """Average a WordNet similarity metric over all synset pairs of two lines.

    Both lines are tokenized with ``wt``, English stopwords are dropped, and
    every remaining token is expanded to all of its WordNet synsets.  The
    metric is evaluated for each pair of distinct synsets (one from each
    line) and the mean is returned.

    Args:
        line1: First text.
        line2: Second text.
        metric: Callable ``metric(syn1, syn2)`` or, when ``ic`` is given,
            ``metric(syn1, syn2, ic)``.
        ic: Optional information-content object forwarded to ``metric``.

    Returns:
        Mean pair score as a float; ``0.0`` when there is no pair to score.
    """
    sw = set(stopwords.words('english'))
    # Flatten directly instead of reduce(), which raises on an empty list.
    syns1 = [syn for tok in wt(line1) if tok not in sw
             for syn in wn.synsets(tok)]
    syns2 = [syn for tok in wt(line2) if tok not in sw
             for syn in wn.synsets(tok)]

    print("syns1:  %s" % (syns1,))
    print("syns2:  %s" % (syns2,))

    runningscore = 0.0
    runningcount = 0
    for syn1 in set(syns1):
        for syn2 in set(syns2):
            try:
                if ic is not None:
                    # IC-based metrics need the information content argument.
                    mark = metric(syn1, syn2, ic)
                else:
                    mark = metric(syn1, syn2)
            except Exception:
                # Best effort: a pair the metric cannot compare scores 0.
                mark = 0.0
            # Some metrics return None when no path exists between synsets.
            runningscore += mark or 0.0
            runningcount += 1

    if not runningcount:
        return 0.0
    return runningscore / runningcount
def sim_overlap(sentence1, sentence2):
    """Return the word-overlap score |Q intersect R| / |Q| of two sentences.

    Each sentence is lower-cased, stripped of every character other than
    ``a-z``, ``-``, ``'``, space, newline and tab, and tokenized with ``wt``.

    Args:
        sentence1: Query sentence Q (its vocabulary is the denominator).
        sentence2: Reference sentence R.

    Returns:
        Fraction of distinct tokens of ``sentence1`` that also occur in
        ``sentence2``; ``0.0`` when ``sentence1`` yields no tokens.
    """
    # Raw string: the non-raw original relied on the invalid escape "\-".
    unwanted = re.compile(r"[^a-z\-' \n\t]")
    tokens1 = set(wt(unwanted.sub('', sentence1.lower())))
    tokens2 = set(wt(unwanted.sub('', sentence2.lower())))

    if not tokens1:
        # Nothing to match against; avoids a ZeroDivisionError.
        return 0.0
    return float(len(tokens1 & tokens2)) / float(len(tokens1))
def classify(self, text):
    """Label text as 'norm' or 'noNorm' based on the presence of modal verbs.

    A sentence is a 'norm' when at least one of its ``wt`` tokens is in
    ``self.modal_verbs``.

    Args:
        text: A sentence string, or a non-empty list of sentence strings.

    Returns:
        For a string: ``'norm'`` or ``'noNorm'``.
        For a non-empty list: a list of ``(sentence, label)`` tuples.
        For anything else (including an empty list): ``None``.
    """
    modal_verbs = self.modal_verbs

    def label(sentence):
        # One modal verb among the tokens is enough to call it a norm.
        if any(token in modal_verbs for token in wt(sentence)):
            return 'norm'
        return 'noNorm'

    if isinstance(text, str):
        return label(text)
    if isinstance(text, list) and text:
        return [(sent, label(sent)) for sent in text]
    return None
def get_overlap(sent1, sent2):
    """Return the Jaccard overlap of the token sets of two sentences.

    Args:
        sent1: First sentence.
        sent2: Second sentence.

    Returns:
        ``|A intersect B| / |A union B|`` over the ``wt`` tokens, or ``0``
        when neither sentence has any token.
    """
    tokens1 = set(wt(sent1))
    tokens2 = set(wt(sent2))
    union = tokens1 | tokens2
    if not union:
        return 0
    # float() keeps true division under Python 2, where int/int truncates
    # the ratio to 0 for every pair that is not identical.
    return len(tokens1 & tokens2) / float(len(union))
def initialize_terms_and_postings():
    """Populate the global ``dictionary`` and ``postings`` from the documents.

    For every entry of ``document_filenames`` the ``description`` and
    ``title`` fields are tokenized with ``wt``; each distinct term is added
    to ``dictionary`` and its occurrence count is stored in
    ``postings[term][doc_id]``.
    """
    global dictionary, postings
    from collections import Counter

    for document in document_filenames:
        terms = wt(document['description']) + wt(document['title'])
        # One counting pass instead of terms.count() per distinct term.
        term_counts = Counter(terms)
        dictionary = dictionary.union(term_counts)
        for term, count in term_counts.items():
            postings[term][document['doc_id']] = count
def sem_wsd_corpus(line_list):
    """Replace words by corpus-disambiguated synset terms.

    Every line is lower-cased, runs of non-alphanumeric characters are
    replaced by a space, and the result is tokenized with ``wt``.  Tokens
    that have WordNet synsets are disambiguated with
    ``internal_sentence_max_WSD`` against the vocabulary of the whole input
    and rendered as a compact term; tokens without synsets are kept as they
    are.

    Args:
        line_list: List of strings (sentences/documents), e.g. dataset.data.

    Returns:
        One string per input line: the synset terms followed by the tokens
        that have no synset, joined by single spaces.
    """
    non_alnum = re.compile('[^A-Za-z0-9]+')
    prefix = 'Synset'

    tokenized_lines = [wt(non_alnum.sub(' ', line.lower()))
                       for line in line_list]

    # Vocabulary of the whole input, used as the WSD context.
    corpus_list = list({token for tokens in tokenized_lines
                        for token in tokens})

    total_synset_sentence_list = []
    for line_token in tokenized_lines:
        synset_terms = []
        nonsynset_terms = []
        for token in line_token:
            if not wn.synsets(token):
                nonsynset_terms.append(token)
                continue
            synset = internal_sentence_max_WSD(corpus_list, token)
            term = non_alnum.sub('', str(synset))
            # Remove the literal "Synset" prefix.  str.lstrip('Synset')
            # strips any leading run of the characters S/y/n/s/e/t and so
            # also ate the start of the lemma ("share" -> "hare").
            if term.startswith(prefix):
                term = term[len(prefix):]
            synset_terms.append(term)
        total_synset_sentence_list.append(
            ' '.join(synset_terms + nonsynset_terms))
    return total_synset_sentence_list
def select_by_pos(line_list):
    """Keep only the verbs, nouns, adverbs and adjectives of each line.

    Each line is lower-cased, runs of non-alphanumeric characters become a
    single space, and the tokens are tagged with ``pos_tag``.  A token is
    kept when its tag starts with ``V``, ``N``, ``R`` or ``J``.

    Args:
        line_list: List of strings (sentences/documents).

    Returns:
        List with one space-joined string of the kept tokens per input line.
    """
    wanted_prefixes = ('V', 'N', 'R', 'J')
    selected = []
    for raw_line in line_list:
        cleaned = re.sub('[^A-Za-z0-9]+', ' ', raw_line.lower())
        tagged = pos_tag(wt(cleaned))
        kept = [pair[0] for pair in tagged
                if pair[1].startswith(wanted_prefixes)]
        selected.append(' '.join(kept))
    return selected
def Format_Problems(category):
    """Extract cleaned query strings from a ``<num>``/``<desc>`` topic file.

    The file ``File_path + category + '.txt'`` is scanned line by line.  A
    line containing ``<num>`` advances the problem number, a line containing
    ``<desc>`` announces that the next line holds the problem text.  That
    text has apostrophes replaced by spaces, punctuation removed and a small
    set of auxiliaries/articles dropped.

    Args:
        category: File stem; ``"question"`` starts numbering after 88,
            anything else starts after 0.

    Returns:
        Dict mapping problem number to the cleaned, space-joined query.
    """
    punctuation = set(string.punctuation)
    skip_words = {'is', 'was', 'are', 'were', 'did', 'does', 'do', 'the', 'a'}

    key = 88 if category == "question" else 0  # initial problem number
    problems = {}
    expect_content = False  # True when the next line is the problem text

    with open(File_path + category + '.txt', 'r') as f:
        for row in f:
            if '<num>' in row:
                key += 1
            elif '<desc>' in row:
                expect_content = True
            elif expect_content:
                # Strip only the line terminator.  Slicing off two characters
                # removed a real character whenever the line ended in a bare
                # "\n" instead of "\r\n".
                text = row.rstrip('\r\n').replace("'", ' ')
                content = ''.join(ch for ch in text if ch not in punctuation)
                problems[key] = ' '.join(
                    part for part in wt(content) if part not in skip_words)
                expect_content = False
    return problems
def find_features(document):
    """Build a presence feature dict for a document.

    Args:
        document: Text to tokenize with ``wt``.

    Returns:
        Dict mapping every word of the module-level ``word_features`` to
        ``True`` when it occurs among the document tokens, else ``False``.
    """
    # A set makes each membership test O(1) instead of scanning the tokens.
    words = set(wt(document))
    return {w: (w in words) for w in word_features}
def pos_tagging(line_list):
    """Append the POS tag to every token of each line.

    Each line is lower-cased, runs of non-alphanumeric characters become a
    single space, and the tokens are tagged with ``pos_tag``.  Every token is
    then emitted with its tag glued on, e.g. ('said', 'VD') -> 'saidVD'.

    Args:
        line_list: List of strings (sentences/documents).

    Returns:
        List with one space-joined string of token+tag terms per input line.
    """
    tagged_list = []
    for raw_line in line_list:
        cleaned = re.sub('[^A-Za-z0-9]+', ' ', raw_line.lower())
        glued = [pair[0] + pair[1] for pair in pos_tag(wt(cleaned))]
        tagged_list.append(' '.join(glued))
    return tagged_list
def stemming(line_list):
    """Stem every token of each line with a Porter stemmer.

    Each line is lower-cased, runs of non-alphanumeric characters become a
    single space, the result is tokenized with ``wt`` and every token is
    stemmed.

    Args:
        line_list: List of strings (sentences/documents), e.g. dataset.data.

    Returns:
        List with one space-joined string of stemmed tokens per input line.
    """
    stemmer = PorterStemmer()
    stemmed_list = []
    for line in line_list:
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line.lower())
        # stem() is the stemmer's public entry point; the older stem_word()
        # no longer exists in current NLTK releases.
        stemmed_line = [stemmer.stem(term) for term in wt(nopunct_line)]
        stemmed_list.append(' '.join(stemmed_line))
    return stemmed_list
def lemmatizing(line_list):
    """Lemmatize every token of each line with ``WordNetLemmatizer``.

    Each line is lower-cased, runs of non-alphanumeric characters become a
    single space, the result is tokenized with ``wt`` and every token is
    lemmatized.

    Args:
        line_list: List of strings (sentences/documents), e.g. dataset.data.

    Returns:
        List with one space-joined string of lemmatized tokens per line.
    """
    lemmatizer = WordNetLemmatizer()
    results = []
    for raw_line in line_list:
        cleaned = re.sub('[^A-Za-z0-9]+', ' ', raw_line.lower())
        lemmas = [lemmatizer.lemmatize(token) for token in wt(cleaned)]
        results.append(' '.join(lemmas))
    return results
def pos_lemmatizing(line_list):
    """Lemmatize each line using POS information.

    Each line is lower-cased and tokenized with ``wt`` (punctuation is left
    in place so the tagger sees it), tagged with ``pos_tag``, and every
    (token, tag) pair is passed to ``wn_lemmatize``.

    Args:
        line_list: List of strings (sentences/documents), e.g. dataset.data.

    Returns:
        List with one space-joined string of lemmatized terms per line.
    """
    results = []
    for raw_line in line_list:
        tagged = pos_tag(wt(raw_line.lower()))
        lemmas = [wn_lemmatize(tagged_token) for tagged_token in tagged]
        results.append(' '.join(lemmas))
    return results
def cacs(self):
    """Rebuild the sentences of ``self.cleaned_1[0]`` with corrected words.

    A token counts as an English word when it is longer than two characters
    and occurs in ``self.totalEngWords()``.  For every token:

    * not an English word -> kept, lower-cased;
    * English word listed in ``self.finalAppend()[1]`` -> kept, lower-cased;
    * English word listed in ``self.finalWrong()[0]`` -> replaced by the
      entry at the same position of ``self.finalWrong()[1]``;
    * any other English word -> dropped.

    NOTE(review): the file's indentation was lost; the trailing ``else`` is
    read here as belonging to the "is an English word" test.  Confirm
    against the original layout.

    Returns:
        List of rebuilt sentences (space-joined tokens).
    """
    sentences = self.cleaned_1[0]
    input_words = [token for sentence in sentences for token in wt(sentence)]

    english_vocab = self.totalEngWords()
    eng_words = set()
    try:
        vocab = set(english_vocab)
        eng_words = {word for word in input_words
                     if len(word) > 2 and word in vocab}
    except Exception:
        # Best effort, as before: report and continue with what was found.
        print('Error')

    correct = self.finalAppend()[1]
    wrong_pair = self.finalWrong()
    wrong_words = wrong_pair[0]
    replacements = wrong_pair[1]

    cleaned_data = []
    for sentence in sentences:
        new_sentence = []
        for token in wt(sentence):
            if token not in eng_words:
                new_sentence.append(token.lower())
            elif token in correct:
                new_sentence.append(token.lower())
            elif token in wrong_words:
                new_sentence.append(replacements[wrong_words.index(token)])
        cleaned_data.append(" ".join(new_sentence))
    return cleaned_data
def scoreline(line1, line2, metric, ic=None):
    """Collect and print the WordNet synsets of the first line's tokens.

    NOTE(review): this second definition of ``scoreline`` stops after
    printing ``syns1`` in this file (no synsets for ``line2``, no scoring,
    no return value).  It looks truncated; the missing part is not
    reconstructed here.

    Args:
        line1: First text; tokenized with ``wt``.
        line2: Second text; tokenized but not used further in this fragment.
        metric: Unused in this fragment.
        ic: Unused in this fragment.
    """
    sw = set(stopwords.words('english'))
    t1 = wt(line1)
    t2 = wt(line2)

    # Flatten directly instead of reduce(), which raises on an empty list.
    syns1 = [syn for tok in t1 if tok not in sw for syn in wn.synsets(tok)]

    runningscore = 0.0
    runningcount = 0
    print("syns1:  %s" % (syns1,))
def clean_words(nce):
    """Return the set of stemmed lemmas of a text.

    Curly single quotes are replaced by a plain apostrophe, the text is
    tokenized with ``wt``, each token is lemmatized with ``wnl`` and each
    distinct lemma is stemmed with ``stemmer``.

    Args:
        nce: Input text.

    Returns:
        Set of stemmed terms.
    """
    normalized = nce.replace('’', '\'').replace('‘', '\'')
    lemmas = {wnl.lemmatize(token) for token in wt(normalized)}
    return {stemmer.stem(lemma) for lemma in lemmas}
def sim_overlap_idf(sentence1, sentence2):
    """Return the IDF-weighted word overlap of two sentences.

    The score is ``(|Q intersect R| / |Q|) * sum(idf(w))`` for ``w`` in the
    intersection, where Q and R are the token sets of ``sentence1`` and
    ``sentence2`` and ``idf`` comes from the module-level
    ``text_collection``.

    NOTE(review): unlike ``sim_overlap`` this function does not lower-case
    its inputs, so upper-case letters are removed by the character filter.
    Presumably callers pass lower-cased text; confirm.

    Args:
        sentence1: Query sentence Q.
        sentence2: Reference sentence R.

    Returns:
        The weighted overlap as a float; ``0.0`` when there is no common
        token (including when ``sentence1`` yields no tokens).
    """
    # Raw string: the non-raw original relied on the invalid escape "\-".
    unwanted = re.compile(r"[^a-z\-' \n\t]")
    tokens1 = set(wt(unwanted.sub('', sentence1)))
    tokens2 = set(wt(unwanted.sub('', sentence2)))

    intersection = tokens1 & tokens2
    sum_idf = 0.0
    for item in intersection:
        sum_idf += text_collection.idf(item)

    if tokens1:
        ratio = float(len(intersection)) / float(len(tokens1))
    else:
        # Same value the old bare "except" produced for an empty Q; the
        # product below is 0.0 because the intersection is empty too.
        ratio = 1
    return ratio * sum_idf
def build_wiki_data(corpus, vocab, k=30000):
    """Convert documents to lists of vocabulary indices.

    The last ``k - 5`` entries of ``vocab`` get indices ``0 .. k-6``; five
    extra entries are appended: ``'DG'``, ``'DGDG'``, ``'DGDGDG'``,
    ``'DGDGDGDG'`` (placeholders for 1- to 4-digit numbers) and
    ``'UUUKKKNNN'`` (unknown word).

    Args:
        corpus: Iterable of document strings.
        vocab: Sequence of ``(word, count)`` pairs; the tail is used.
            Presumably sorted by ascending frequency - confirm with caller.
        k: Size of the resulting vocabulary.

    Returns:
        Tuple ``(newcorpus, newcorpusf, vocabd)``: the index lists per
        document, an (always empty) list kept for interface compatibility,
        and the word -> index mapping.

    Raises:
        AssertionError: If the resulting vocabulary does not have exactly
            ``k`` entries.
    """
    mostfreqk = vocab[-k + 5:]
    vocabd = dict(zip([w for w, v in mostfreqk], range(k - 5)))
    vocabd['DG'] = k - 5
    vocabd['DGDG'] = k - 4
    vocabd['DGDGDG'] = k - 3
    vocabd['DGDGDGDG'] = k - 2
    vocabd['UUUKKKNNN'] = k - 1
    assert len(vocabd) == k

    unknown_index = k - 1
    newcorpus = []
    newcorpusf = []
    tokenizer = wt()
    print('Total documents: %s' % len(corpus))
    for aa, doc in enumerate(corpus):
        # Progress counter on one line.
        sys.stdout.write('%s ' % aa)
        sys.stdout.flush()
        docidx = []
        for w in tokenizer.tokenize(doc.lower()):
            if w in vocabd:
                docidx.append(vocabd[w])
                continue
            if w.isdigit() and len(w) <= 4:
                # Use the placeholder whose label matches the digit count.
                # The former "k - len(w) - 1" mapped a 1-digit number to the
                # 4-digit placeholder and vice versa.
                docidx.append(vocabd['DG' * len(w)])
                continue
            try:
                w = remove_accents(w)
            except Exception:
                continue  # token cannot be normalized: skip it
            if "\\x" in w:
                continue
            if len(w) <= 1 and w not in string.punctuation:
                continue
            if len(w) != 1 and not re.sub(r'\W+', '', w):
                continue  # multi-character token made only of non-word chars
            docidx.append(unknown_index)
        newcorpus.append(docidx)
    return newcorpus, newcorpusf, vocabd
def freq_words(file, min=1, num=10):
    """Print token statistics of a file and return its short tokens.

    Prints the number of tokens, the number of distinct tokens and the
    number of distinct tokens whose length is at most ``min``.

    Args:
        file: Path of the text file.
        min: Length bound used by the filter below.
        num: Unused.

    Returns:
        The keys of the frequency distribution of the filtered tokens.
    """
    # Context manager so the handle is closed after reading.
    with open(file) as handle:
        text = handle.read()
    tokens = wt(text)
    print(len(tokens))
    print(len(set(tokens)))
    # NOTE(review): this keeps tokens with len(t) <= min although the
    # parameter name suggests a minimum length; left unchanged, confirm.
    freqdist = nltk.FreqDist(t for t in tokens if len(t) <= min)
    print(len(freqdist))
    return freqdist.keys()
def ner_sents(sents):
    """Run tokenization, POS tagging and NER on each sentence.

    Args:
        sents: List of sentence strings; each one is tokenized here with
            ``wt`` before being passed to ``POS`` and then ``NER``.

    Returns:
        List with the ``NER`` result for every sentence, in input order.
    """
    return [NER(POS(wt(sentence))) for sentence in sents]
def build_wiki_data(corpus, vocab, k=30000):
    """Convert documents to lists of vocabulary indices.

    The last ``k - 5`` entries of ``vocab`` get indices ``0 .. k-6``; five
    extra entries are appended: ``'DG'``, ``'DGDG'``, ``'DGDGDG'``,
    ``'DGDGDGDG'`` (placeholders for 1- to 4-digit numbers) and
    ``'UUUKKKNNN'`` (unknown word).

    Args:
        corpus: Iterable of document strings.
        vocab: Sequence of ``(word, count)`` pairs; the tail is used.
            Presumably sorted by ascending frequency - confirm with caller.
        k: Size of the resulting vocabulary.

    Returns:
        Tuple ``(newcorpus, newcorpusf, vocabd)``: the index lists per
        document, an (always empty) list kept for interface compatibility,
        and the word -> index mapping.

    Raises:
        AssertionError: If the resulting vocabulary does not have exactly
            ``k`` entries.
    """
    mostfreqk = vocab[-k + 5:]
    vocabd = dict(zip([w for w, v in mostfreqk], range(k - 5)))
    vocabd['DG'] = k - 5
    vocabd['DGDG'] = k - 4
    vocabd['DGDGDG'] = k - 3
    vocabd['DGDGDGDG'] = k - 2
    vocabd['UUUKKKNNN'] = k - 1
    assert len(vocabd) == k

    unknown_index = k - 1
    newcorpus = []
    newcorpusf = []
    tokenizer = wt()
    print('Total documents: %s' % len(corpus))
    for aa, doc in enumerate(corpus):
        # Progress counter on one line.
        sys.stdout.write('%s ' % aa)
        sys.stdout.flush()
        docidx = []
        for w in tokenizer.tokenize(doc.lower()):
            if w in vocabd:
                docidx.append(vocabd[w])
                continue
            if w.isdigit() and len(w) <= 4:
                # Use the placeholder whose label matches the digit count.
                # The former "k - len(w) - 1" mapped a 1-digit number to the
                # 4-digit placeholder and vice versa.
                docidx.append(vocabd['DG' * len(w)])
                continue
            try:
                w = remove_accents(w)
            except Exception:
                continue  # token cannot be normalized: skip it
            if "\\x" in w:
                continue
            if len(w) <= 1 and w not in string.punctuation:
                continue
            if len(w) != 1 and not re.sub(r'\W+', '', w):
                continue  # multi-character token made only of non-word chars
            docidx.append(unknown_index)
        newcorpus.append(docidx)
    return newcorpus, newcorpusf, vocabd
def get_length():
    """Compute simple length features for every module-level sentence.

    Returns:
        List with one ``[a1, a2, a3, a4, a5]`` row per entry of
        ``sentences``: character length, number of ``wt`` tokens, number of
        ``'s`` occurrences (possessives), number of ``.`` and ``?``
        characters, and length of the longest token (0 when the sentence has
        no tokens).
    """
    train = []
    for sent in sentences:
        tokens = wt(sent)
        # character length
        a1 = len(sent)
        # number of words
        a2 = len(tokens)
        # number of 's, i.e. possessives
        a3 = sent.count('\'s')
        # number of punctuation marks such as . and ?
        a4 = sent.count('.') + sent.count('?')
        # Length of the longest word.  len(max(tokens)) measured the
        # lexicographically greatest token instead, and failed on an empty
        # token list.
        a5 = max(len(token) for token in tokens) if tokens else 0
        train.append([a1, a2, a3, a4, a5])
    return train
def PreprocessCSV(csvfile, outputfile):
    """Clean the comments of a CSV file, write them out and return the words.

    Reads the ``Insult`` and ``Comment`` columns of ``csvfile``.  Each
    comment is stripped of surrounding double quotes, lower-cased, has a
    fixed set of whitespace/stray characters replaced by spaces and the
    characters ``$%^&*[]`` removed, is tokenized with ``wt``, and keeps only
    the lemmatized alphabetic tokens.  The marker ``auselessflag`` is
    appended to every comment so that none is blank.

    Args:
        csvfile: Path of the input CSV file.
        outputfile: Path of the CSV file to write (columns ``Insult``,
            ``Comment``).

    Returns:
        List of the distinct lemmatized words seen in all comments.
    """
    print("Start preprocessing %s ..." % csvfile)
    dataframe = pandas.read_csv(csvfile, usecols=["Insult", "Comment"])
    # Select by name: positional access depends on the column order of the
    # file, which usecols does not change.
    labels = dataframe["Insult"].tolist()
    sents = dataframe["Comment"].tolist()

    # One lemmatizer for the whole run instead of one per token.
    lemmatizer = wnl()
    junk_chars = ("\t", "\n", "\xa0", "\xc2", "\xc8", "\xec", "\x80", "\xa6")

    voc = []
    newsents = []
    for sent in sents:
        sent = sent.strip("\"").lower()
        for junk in junk_chars:
            sent = sent.replace(junk, " ")
        sent = re.sub(r"[$%^&*\[\]]", "", sent)

        newtks = []
        for tk in wt(sent):
            if tk.isalpha():
                tk = lemmatizer.lemmatize(tk)
                newtks.append(tk)
                voc.append(tk)
        # In case of a blank comment, add a useless flag at the end.
        newsents.append(" ".join(newtks) + " " + "auselessflag")

    # write the outputfile
    col_order = ["Insult", "Comment"]
    dataframe2 = pandas.DataFrame({"Insult": labels, "Comment": newsents})
    dataframe2.to_csv(outputfile, index=False, columns=col_order)

    wordlist = list(FreqDist(voc).keys())
    print(
        "file \"%s\" is preprocessed, and there are %d keys in the return wordlist."
        % (csvfile, len(wordlist)))
    return wordlist
def length_ana():
    """Plot histograms of sentence lengths for each group in ``corpus``.

    For every group the token count of each sentence (via ``wt``) is printed
    and drawn as a histogram.  Two groups share one figure, stacked
    vertically.

    NOTE(review): the file's indentation was lost; ``plt.show()`` is assumed
    to run once after all groups are drawn.
    """
    for position, group in enumerate(corpus):
        lengths = [len(wt(sentence)) for sentence in group]
        print(lengths)

        figure_number = int(position / 2) + 1
        plt.figure(figure_number)
        slot = position % 2 + 1
        plt.subplot(int("21%s" % slot))
        plt.hist(lengths)
        plt.xlabel('new concept number %s' % (position + 1))
    plt.show()
def extract_norms(self, contract_sents):
    """Return the sentences that contain a modal verb.

    Args:
        contract_sents: List of sentence strings, or any other value, which
            is first split into sentences with
            ``self.sent_tokenizer.tokenize``.

    Returns:
        List of the sentences having at least one ``wt`` token in
        ``self.modal_verbs``, in input order.
    """
    if not isinstance(contract_sents, list):
        contract_sents = self.sent_tokenizer.tokenize(contract_sents)
    modal_verbs = self.modal_verbs
    return [sentence for sentence in contract_sents
            if any(token in modal_verbs for token in wt(sentence))]
def build_wiki_vocab(corpus):
    """Count lower-cased tokens over a corpus.

    Args:
        corpus: Iterable of strings; each is lower-cased and tokenized with
            a ``wt()`` tokenizer instance.

    Returns:
        List of ``(word, count)`` pairs sorted by ascending count.
    """
    tokenizer = wt()
    totalvocab = {}
    for s in corpus:
        for w in tokenizer.tokenize(s.lower()):
            totalvocab[w] = totalvocab.get(w, 0) + 1
    # items() works on Python 2 and 3; iteritems() exists only on Python 2.
    return sorted(totalvocab.items(), key=operator.itemgetter(1))
def build_wiki_vocab(corpus):
    """Count lower-cased tokens over a corpus.

    Args:
        corpus: Iterable of strings; each is lower-cased and tokenized with
            a ``wt()`` tokenizer instance.

    Returns:
        List of ``(word, count)`` pairs sorted by ascending count.
    """
    tokenizer = wt()
    totalvocab = {}
    for s in corpus:
        for w in tokenizer.tokenize(s.lower()):
            totalvocab[w] = totalvocab.get(w, 0) + 1
    # items() works on Python 2 and 3; iteritems() exists only on Python 2.
    return sorted(totalvocab.items(), key=operator.itemgetter(1))
def addInfo(file, info, categories):
    '''NOT IN USE

    Store ``info`` in the nested data loaded from ``file``.

    ``categories`` is tokenized with ``wt``; the resulting one to four
    tokens are used as the key path into the nested mapping returned by
    ``loadData(file)``.  The modified data is only printed, it is neither
    saved nor returned.

    Args:
        file: Passed to ``loadData``.
        info: Value to store.
        categories: String whose tokens form the key path.

    Returns:
        ``"Categories aren't right"`` when the path has no token or more
        than four tokens, otherwise ``None``.
    '''
    data = loadData(file)
    words = list(wt(categories))
    print(words)

    if not 1 <= len(words) <= 4:
        msg = "Categories aren't right"
        return msg

    # Walk down to the parent of the last key, then assign.  This also
    # covers the four-key case, whose branch used to test len(words) == 1
    # and could never run.
    node = data
    for key in words[:-1]:
        node = node[key]
    node[words[-1]] = info

    # A single key has no second level to show.
    if len(words) > 1:
        print(data[words[0]][words[1]])
    else:
        print(data[words[0]])
def clean_tokenize(sentence):
    """
    Tokenize a sentence after removing punctuation and stopwords.

    Parameters
    ----------
    sentence : string.
        The sentence from which we want to extract the keywords.

    Returns
    -------
    keywords : list of strings.
        The lower-cased tokens that are not English stopwords.
    """
    stop = set(stopwords.words("english"))
    letters_only = re.sub("[^a-zA-Z]", " ", sentence)
    keywords = []
    for token in wt(letters_only):
        lowered = token.lower()
        if lowered not in stop:
            keywords.append(lowered)
    return keywords
def do_search():
    """Prompt for a query and print the matching documents by score.

    The query is tokenized with ``wt``; an empty query exits the program.
    Documents containing all query terms are found by intersecting the
    posting lists, scored with ``similarity`` and printed best first.
    """
    query = wt(input("Search query >> "))
    if not query:
        sys.exit()

    # Find document ids containing all query terms by intersecting the
    # posting lists.  The list() result is now kept (it used to be thrown
    # away), so the emptiness test below also works if intersection()
    # returns an iterator.
    relevant_document_ids = list(intersection(
        [set(postings[term].keys()) for term in query]))

    if not relevant_document_ids:
        print("No documents matched all query terms.")
        return

    scores = sorted(
        [(doc_id, similarity(query, doc_id))
         for doc_id in relevant_document_ids],
        key=lambda pair: pair[1],
        reverse=True)
    print("Score: filename")
    for doc_id, score in scores:
        print(str(score) + ": " + doc_id)
def NgramWords(csvfile, n=2, minx=2, maxx=6):
    """Collect word n-grams whose frequency lies strictly between two bounds.

    The ``Comment`` column of ``csvfile`` is tokenized with ``wt``.  For
    every comment with more than ``n + 2`` tokens, the n-grams starting at
    positions ``0 .. len(tokens) - n - 1`` are counted; the tokens of an
    n-gram are concatenated without a separator.

    Args:
        csvfile: Path of the CSV file.
        n: Number of tokens per n-gram.
        minx: Exclusive lower bound on the count.
        maxx: Exclusive upper bound on the count.

    Returns:
        List of the n-gram strings with ``minx < count < maxx``.
    """
    counts = defaultdict(int)
    frame = pandas.read_csv(csvfile, usecols=["Comment"])
    for comment in frame.iloc[:, 0].tolist():
        tokens = wt(comment)
        # because there is a useless flag at the end of text
        if len(tokens) <= (n + 2):
            continue
        for start in range(len(tokens) - n):
            counts["".join(tokens[start:start + n])] += 1

    ngramlist = [gram for gram, seen in counts.items()
                 if seen > minx and seen < maxx]
    print("there are %d %d-gram words in ngramlist from %s." %
          (len(ngramlist), n, csvfile))
    return ngramlist
def fristXpq(sDat, max_len, x=0):
    """Collect frequency counts of token groups of sizes 1..max_len.

    ``sDat`` is lower-cased and tokenized with ``wt``.  For size 1 the
    tokens themselves are counted; for larger sizes the units come from
    ``textToWordList(tokens, size)``.  Counts are taken with ``fd`` and
    listed most common first.

    Args:
        sDat: Input text.
        max_len: Largest group size; capped at ``len(sDat)``.
        x: Number of most common entries to keep per size; 0 keeps all.

    Returns:
        List of ``[unit, count]`` pairs for all sizes, size 1 first.
    """
    tokens = wt(sDat.lower())
    if len(sDat) < max_len:
        max_len = len(sDat)

    collected = []
    for size in range(1, max_len + 1):
        units = textToWordList(tokens, size) if size > 1 else tokens
        if x == 0:
            ranked = fd(units).most_common()
        else:
            ranked = fd(units).most_common(x)
        collected.extend([entry[0], entry[1]] for entry in ranked)
    return (collected)
def trees_2_toks(sentences):
    """
    Turn chunked sentences back into plain token lists.

    Each tree is rendered as a CoNLL string; the first space-separated field
    of every row (the word) is kept, the words are joined by single spaces
    and the result is tokenized with ``wt``.
    """
    plain_sentences = []
    for tree in sentences:
        rows = nltk.chunk.tree2conllstr(tree).split('\n')
        words = ' '.join(row.split(' ')[0] for row in rows)
        plain_sentences.append(wt(words))
    return plain_sentences
def sem_firstsense(line_list):
    """Replace words by the term of their first WordNet sense.

    Every line is lower-cased, runs of non-alphanumeric characters are
    replaced by a space, and the result is tokenized with ``wt``.  Tokens
    that have WordNet synsets are replaced by a compact rendering of their
    first synset; tokens without synsets are kept as they are.

    Args:
        line_list: List of strings (sentences/documents), e.g. dataset.data.

    Returns:
        One string per input line: the synset terms followed by the tokens
        that have no synset, joined by single spaces.
    """
    non_alnum = re.compile('[^A-Za-z0-9]+')
    prefix = 'Synset'

    total_synset_sentence_list = []
    for line in line_list:
        line_token = wt(non_alnum.sub(' ', line.lower()))

        synset_terms = []
        nonsynset_terms = []
        for token in line_token:
            synsets = wn.synsets(token)  # looked up once per token
            if not synsets:
                nonsynset_terms.append(token)
                continue
            term = non_alnum.sub('', str(synsets[0]))
            # Remove the literal "Synset" prefix.  str.lstrip('Synset')
            # strips any leading run of the characters S/y/n/s/e/t and so
            # also ate the start of the lemma ("share" -> "hare").
            if term.startswith(prefix):
                term = term[len(prefix):]
            synset_terms.append(term)

        total_synset_sentence_list.append(
            ' '.join(synset_terms + nonsynset_terms))
    return total_synset_sentence_list
def extract_key_words(sentence, score_function, n, *args):
    """
    Extract the highest scoring words of a sentence.

    Parameters
    ----------
    sentence : string.
        The text to tokenize with ``wt``.

    score_function : function.
        Called as ``score_function(word, *args)`` for every token.

    n : int.
        The number of keywords to return.

    *args : extra arguments forwarded to ``score_function``.

    Returns
    -------
    keywords : list of strings.
        Up to ``n`` distinct words in descending score order.
    """
    scores_by_word = {
        token: score_function(token, *args) for token in wt(sentence)
    }
    ranked = sorted(scores_by_word, key=scores_by_word.get, reverse=True)
    return ranked[:n]
def q1():
    """Print basic statistics of 'austen-sense.txt' from the Gutenberg corpus.

    Prints the number of word tokens, the number of lower-cased word types
    and the tokens of the first sentence, then tokenizes a sample string.
    """
    from nltk.corpus import gutenberg as gb

    file_id = 'austen-sense.txt'

    # 1. number of word tokens
    word_list = gb.words(file_id)
    print(len(word_list))

    # 2. number of word types (case-insensitive)
    word_types = {token.lower() for token in word_list}
    print(len(word_types))

    # 3. all tokens in the first sentence
    first_sentence = gb.sents(file_id)[0]
    print(' '.join(first_sentence))

    # tokenizing a raw string
    raw = 'i have a book.'
    from nltk import word_tokenize as wt
    word_list = wt(raw)
def pos_bagging(line_list):
    """Replace every token of each line by its POS tag.

    Each line is lower-cased and tokenized with ``wt`` (punctuation is left
    in place so the tagger sees it) and tagged with ``pos_tag``; only the
    tags are kept.

    Args:
        line_list: List of strings (sentences/documents), e.g. dataset.data.

    Returns:
        List with one space-joined string of POS tags per input line.
    """
    bagged_list = []
    for raw_line in line_list:
        tagged = pos_tag(wt(raw_line.lower()))
        bagged_list.append(' '.join(pair[1] for pair in tagged))
    return bagged_list
def sim_overlap_phrasal(sentence1, sentence2):
    """Return the phrasal overlap similarity of two sentences.

    Both sentences are lower-cased, stripped of every character other than
    ``a-z``, ``-``, ``'``, space, newline and tab, and tokenized with ``wt``.
    Shared trigrams count 9, shared bigrams 4 and shared words 1; the sum is
    divided by the combined vocabulary size of the two sentences and passed
    through ``tanh`` (normalization as defined in Ponzetto et al. 2007).

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        Similarity as a float; ``0.0`` when neither sentence has a token.
    """
    # Raw string: the non-raw original relied on the invalid escape "\-".
    unwanted = re.compile(r"[^a-z\-' \n\t]")
    line1 = wt(unwanted.sub('', sentence1.lower()))
    line2 = wt(unwanted.sub('', sentence2.lower()))

    def distinct_ngrams(finder_class, words):
        # score_ngrams yields one (ngram, score) pair per distinct n-gram.
        finder = finder_class.from_words(words)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
        return {ngram for ngram, score in scored}

    bigram_finder = collocations.BigramCollocationFinder
    trigram_finder = collocations.TrigramCollocationFinder
    # Set intersection replaces the quadratic "x in list" scans.
    common_bi = (distinct_ngrams(bigram_finder, line1)
                 & distinct_ngrams(bigram_finder, line2))
    common_tri = (distinct_ngrams(trigram_finder, line1)
                  & distinct_ngrams(trigram_finder, line2))

    words1 = set(line1)
    words2 = set(line2)
    # Note, here we only consider trigram and bigram
    overlap_score = 9 * len(common_tri) + 4 * len(common_bi) + len(words1 & words2)

    # The second term used to be computed from line1 as well.
    total_len = len(words1) + len(words2)
    if not total_len:
        return 0.0
    return tanh(float(overlap_score) / total_len)
__author__ = "Soumik"
import nltk, re, pprint
from nltk import word_tokenize as wt
import codecs

# Positive word list: tokenize and turn "_" joined multi-word entries back
# into space separated phrases.
with open("pos.wn") as f:
    raw = f.read()
tokens = wt(raw)
text = nltk.Text(tokens)
g = [x.replace("_", " ") for x in tokens]
print(g)

# Negative word list, processed the same way.
with open("neg.wn") as f1:
    raw1 = f1.read()
tokens1 = wt(raw1)
text1 = nltk.Text(tokens1)
h = [z.replace("_", " ") for z in tokens1]
print("\n", h)

# Read the tweets once instead of re-opening the file for every term.
with open("posTweets.txt", encoding="utf8") as tweets_file:
    pos_tweets = tweets_file.read()

pcount = 0
ncount = 0
# NOTE(review): this counts entries of the negative list (h) that occur in
# posTweets.txt but reports them as positive; confirm whether g was meant.
for x in h:
    if x in pos_tweets:
        pcount = pcount + 1
# Interpolate the count; it used to be passed as a second print argument,
# leaving a literal "%s" in the output.
print("The number of positive tweets are:%s" % pcount)
def main(f_1, f_2):
    """
    Extracts basic stats from 2 text files

    Reads both files, cleans them with ``cl.clean_text``, tokenizes them and
    appends one container per statistic (sentence/token counts, mean lengths,
    vocabulary size, hapaxes, POS frequencies, POS-bigram conditional
    probabilities and LMI) to the two columns of the HTML skeleton returned by
    ``OUT.init_html()``.  The result is written to 'output/index_1.html'.

    f_1, f_2 : paths of the first and second text file.

    The book titles and authors written into the report are hard-coded.
    """
    ######################################
    # Text extraction
    ######################################
    with open(f_1, 'r') as f:
        t_1 = f.read()
    with open(f_2, 'r') as f:
        t_2 = f.read()

    ######################################
    # Text cleaning
    ######################################
    t_1 = cl.clean_text(t_1)
    t_2 = cl.clean_text(t_2)

    ######################################
    # Extracting informations
    ######################################
    # Sentence tokens
    sents_1 = st.tokenize(t_1)
    sents_2 = st.tokenize(t_2)

    # Word tokens (by sentence).  Each text is tokenized on its own: zipping
    # the two sentence lists silently dropped the tail of the longer text and
    # skewed its mean sentence length.
    s_toks_1 = [wt(s_1) for s_1 in sents_1]
    s_toks_2 = [wt(s_2) for s_2 in sents_2]

    # Word tokens (unique list)
    toks_1 = wt(t_1)
    toks_2 = wt(t_2)

    # Initialize output
    output = OUT.init_html()
    soup = BeautifulSoup(output, 'html.parser')
    col_1 = soup.find('div', {'id': 'text-1-container'})
    col_2 = soup.find('div', {'id': 'text-2-container'})

    title_1 = soup.new_tag('p', attrs={'id': 'title-1'})
    title_1.insert(1, 'Alice in Wonderland')
    title_2 = soup.new_tag('p', attrs={'id': 'title-2'})
    title_2.insert(1, 'Second Variety')
    author_1 = soup.new_tag('p', attrs={'id': 'author-1'})
    author_1.insert(1, 'Lewis Carroll')
    author_2 = soup.new_tag('p', attrs={'id': 'author-2'})
    author_2.insert(1, 'Philip Kindred Dick')
    col_1.append(title_1)
    col_1.append(author_1)
    col_2.append(title_2)
    col_2.append(author_2)

    # Print number of sentences
    nc_1 = OUT.create_container('Number of sentences', str(len(sents_1)), soup)
    nc_2 = OUT.create_container('Number of sentences', str(len(sents_2)), soup)
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Number of sentences to project/index.html')

    # Print number of tokens
    nc_1 = OUT.create_container('Number of tokens', str(len(toks_1)), soup)
    nc_2 = OUT.create_container('Number of tokens', str(len(toks_2)), soup)
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Number of tokens to project/index.html')

    # Print mean sentence length
    nc_1 = OUT.create_container('Mean sentence length', str(tool.minify(LC.mean_len(s_toks_1))), soup)
    nc_2 = OUT.create_container('Mean sentence length', str(tool.minify(LC.mean_len(s_toks_2))), soup)
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Mean sentence length to project/index.html')

    # Print word length
    nc_1 = OUT.create_container('Mean word length', str(tool.minify(LC.mean_len(toks_1))), soup)
    nc_2 = OUT.create_container('Mean word length', str(tool.minify(LC.mean_len(toks_2))), soup)
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Mean word length to project/index.html')

    # Vocab
    voc_1 = LC.create_vocab(toks_1)
    voc_2 = LC.create_vocab(toks_2)

    # Print vocabulary dimension
    nc_1 = OUT.create_container('Vocabulary dimension', str(len(voc_1)), soup)
    nc_2 = OUT.create_container('Vocabulary dimension', str(len(voc_2)), soup)
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Vocabulary dimension to project/index.html')

    # Frequencies
    freq_1 = LC.freqs(toks_1, voc_1)
    freq_2 = LC.freqs(toks_2, voc_2)

    # Hapax_list
    hapax_1 = LC.get_hapax(freq_1)
    hapax_2 = LC.get_hapax(freq_2)

    # Print number of hapaxes
    nc_1 = OUT.create_container('Number of hapaxes', str(len(hapax_1)), soup)
    nc_2 = OUT.create_container('Number of hapaxes', str(len(hapax_2)), soup)
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Number of hapaxes to project/index.html')

    print('PLOTTING HAPAXES - DISTRIBUTION...')

    # Plotting Hapaxes' distribution
    LC.hapax_distr(toks_1, 1000, "Alice in Wonderland - Carrol")
    LC.hapax_distr(toks_2, 1000, "Second Variety - Dick")

    # Print plots
    nc_1 = OUT.create_img_container('Hapax distribution', '../plots/Alice in Wonderland - Carrol.svg', soup)
    nc_2 = OUT.create_img_container('Hapax distribution', '../plots/Second Variety - Dick.svg', soup)
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Hapax distribution to project/index.html')

    # POS tagging
    POS_1 = POS(toks_1)
    POS_2 = POS(toks_2)

    # POS frequency
    # Getting the frequency of every POS in the two texts
    POS_freqs_1 = LC.get_POS_freqs(POS_1)
    POS_freqs_2 = LC.get_POS_freqs(POS_2)

    NN_1 = POS_freqs_1['NN'] + POS_freqs_1['NNS'] + POS_freqs_1['NNP'] + POS_freqs_1['NNPS']
    VB_1 = (POS_freqs_1['VB'] + POS_freqs_1['VBD'] + POS_freqs_1['VBG']
            + POS_freqs_1['VBN'] + POS_freqs_1['VBP'] + POS_freqs_1['VBZ'])

    NN_2 = POS_freqs_2['NN'] + POS_freqs_2['NNS'] + POS_freqs_2['NNP'] + POS_freqs_2['NNPS']
    VB_2 = (POS_freqs_2['VB'] + POS_freqs_2['VBD'] + POS_freqs_2['VBG']
            + POS_freqs_2['VBN'] + POS_freqs_2['VBP'] + POS_freqs_2['VBZ'])

    # Print Nouns / Verbs
    nc_1 = OUT.create_container('Nouns / Verbs', str(tool.minify(NN_1 / VB_1)), soup)
    nc_2 = OUT.create_container('Nouns / Verbs', str(tool.minify(NN_2 / VB_2)), soup)
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Nouns / Verbs to project/index.html')

    POS_rank_1 = LC.rank(POS_freqs_1)
    POS_rank_2 = LC.rank(POS_freqs_2)

    # Print most frequent POS
    nc_1 = OUT.complex_container('Most frequent POS', list(POS_rank_1)[:10], soup)
    nc_2 = OUT.complex_container('Most frequent POS', list(POS_rank_2)[:10], soup)
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Most frequent POS to project/index.html')

    # POS bigrams
    # Getting the frequency of every POS-couple in the two texts
    POS_bi_1 = LC.tag_bigrams(POS_1)
    POS_bi_2 = LC.tag_bigrams(POS_2)

    POS_bi_set_1 = list(set(POS_bi_1))
    POS_bi_set_2 = list(set(POS_bi_2))

    POS_bi_freq_1 = LC.freqs(POS_bi_1, POS_bi_set_1)
    POS_bi_freq_2 = LC.freqs(POS_bi_2, POS_bi_set_2)

    # POS Conditioned probabilities
    # Now that we have the frequencies of every POS and every couple of POS,
    # we can compute the conditioned probability for each couple.
    POS_cond_prob_1 = LC.get_cond_prob(POS_bi_freq_1, POS_freqs_1)
    POS_cond_prob_2 = LC.get_cond_prob(POS_bi_freq_2, POS_freqs_2)

    # Print conditioned-probability
    nc_1 = OUT.more_complex_container('Top conditioned-probable POS', LC.rank(POS_cond_prob_1)[:10], soup, ['', 'bigrams', 'probability'])
    nc_2 = OUT.more_complex_container('Top conditioned-probable POS', LC.rank(POS_cond_prob_2)[:10], soup, ['', 'bigrams', 'probability'])
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Top conditioned-probable POS to project/index.html')

    # POS Local Mutual Information
    POS_bi_LMI_1 = LC.get_LMI(POS_bi_freq_1, len(POS_bi_1), POS_freqs_1, len(POS_1), POS_bi_set_1)
    POS_bi_LMI_2 = LC.get_LMI(POS_bi_freq_2, len(POS_bi_2), POS_freqs_2, len(POS_2), POS_bi_set_2)

    ranked_POS_bi_LMI_1 = LC.rank(POS_bi_LMI_1)
    ranked_POS_bi_LMI_2 = LC.rank(POS_bi_LMI_2)

    # Print related bigrams
    nc_1 = OUT.more_complex_container('Top LMI-related bigrams', ranked_POS_bi_LMI_1[:10], soup, ['', 'bigrams', 'LMI'])
    nc_2 = OUT.more_complex_container('Top LMI-related bigrams', ranked_POS_bi_LMI_2[:10], soup, ['', 'bigrams', 'LMI'])
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Top LMI-related bigrams to project/index.html')

    # Final prints
    title_1 = soup.new_tag('p', attrs={'id': 'book-1'})
    title_1.insert(1, 'Alice in Wonderland')
    soup.body.append(title_1)

    title_2 = soup.new_tag('p', attrs={'id': 'book-2'})
    title_2.insert(1, 'Second Variety')
    soup.body.append(title_2)

    # Exporting output
    with open('output/index_1.html', 'w') as h:
        h.write(str(soup))
def sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.jcn_similarity, ic=brown_ic, alpha=0.05):
    """Average the top-alpha WordNet similarities between two sentences.

    Each sentence is lower-cased, stripped of every character other than
    ``a-z``, ``-``, ``'`` and whitespace, and tokenized with ``wt``.  Every
    token that WordNet knows is mapped to its first sense only (stopwords are
    NOT removed).  ``metric`` is evaluated for every pair of distinct synsets
    (one from each sentence); the marks are sorted from highest to lowest and
    the best ``alpha`` fraction of them is averaged.

    Args:
        sentence1, sentence2: the sentences to compare (strings).
        metric: WordNet similarity function, called as
            ``metric(s1, s2, ic)`` when ``ic`` is given, else ``metric(s1, s2)``.
        ic: information-content dictionary, or None for metrics that take none.
        alpha: fraction of the highest marks to average.

    Returns:
        The mean of the selected marks; 0.0 when either sentence has no token
        known to WordNet.  At least one mark is always averaged, so short
        sentences (where ``alpha * pairs`` rounds to zero) no longer raise
        ZeroDivisionError.
    """
    # lowercase + remove punctuation, then tokenize
    not_allowed = r"[^a-z\-' \n\t]"
    line1 = wt(re.sub(not_allowed, "", sentence1.lower()))
    line2 = wt(re.sub(not_allowed, "", sentence2.lower()))

    # first sense of every token that has at least one synset
    # (no stopword elimination)
    synset_list1 = [senses[0] for senses in map(wn.synsets, line1) if senses]
    synset_list2 = [senses[0] for senses in map(wn.synsets, line2) if senses]

    synsets1 = set(synset_list1)
    synsets2 = set(synset_list2)

    # similarity mark for each pair created from both synset sets
    mark_list = []
    for synset1 in synsets1:
        for synset2 in synsets2:
            try:
                if ic is not None:
                    mark = metric(synset1, synset2, ic)
                else:
                    mark = metric(synset1, synset2)
            except Exception:
                # the metric is undefined for this pair; count it as no similarity
                mark = 0.0
            if mark is None:
                mark = 0.0
            # handle infinity mark for jcn measure
            if ic is not None and mark == 1e+300:
                mark = 1.0
            mark_list.append(mark)

    if not mark_list:
        return 0.0

    # highest marks first
    mark_list.sort(reverse=True)

    # number of marks to keep given alpha; never fewer than one
    threshold = max(1, int(round(alpha * len(mark_list))))
    top_alpha_mark_list = mark_list[:threshold]

    # add up individual scores, divide by number of individual scores
    return sum(top_alpha_mark_list) / len(top_alpha_mark_list)

# # Test
# list1 = load_sentences('data_not_sell')
# list2 = load_sentences('data_sell_share')
# sentence1 = list1[0]
# sentence2 = list2[1]
# brown_ic = wordnet_ic.ic('ic-brown.dat')
# semcor_ic = wordnet_ic.ic('ic-semcor.dat')
# # sim_sem_firstsense_alpha(sentence1, sentence2)
# score = sim_sem_firstsense_alpha(sentence1, sentence2)
# print 'path: ', score
# score = sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.lch_similarity)
# print 'lch : ', score
# score = sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.wup_similarity)
# print 'wup : ', score
# score = sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.res_similarity, ic=brown_ic)
# print 'res - brown : ', score
# score = sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.res_similarity, ic=semcor_ic)
# print 'res - semcor : ', score
# score = sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.jcn_similarity, ic=brown_ic)
# print 'jcn : ', score
# score = sim_sem_firstsense_alpha(sentence1, sentence2, metric=wn.lin_similarity, ic=brown_ic)
# print 'lin : ', score

# Sample results:
# sentence1 = list1[0]
# sentence2 = list2[1]
# alpha = 0.2
# path: 0.255693843194
# lch : 1.70093033207
# wup : 0.468924493692
# res - brown : 2.33328289008
# res - semcor : 2.18274083157
# jcn : 0.2375842434
# lin : 0.273913605124
def pos_negation_bigram(line_list):
    """
    Turn each line into a string of POS-annotated terms plus negation bigrams.

    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Every line is lower-cased, tokenized and POS tagged; each token is emitted
    as term+tag.  The last negating word of the line ('not', 'never',
    'neither') is then glued in front of every verb among the 10 tokens that
    follow it, and "without" in front of every noun among the 3 tokens that
    follow the last 'without'.

    Return: list of strings, one space-joined string per input line
    """
    negating_verbs = ['not', 'never', 'neither']
    negating_nouns = ['without']
    verb_span = 10
    noun_span = 3

    results = []
    for sentence in line_list:
        tagged = pos_tag(wt(sentence.lower()))

        # POS part: every token becomes term+tag
        pieces = [pair[0] + pair[1] for pair in tagged]

        # locate the flag words; a later occurrence overrides an earlier one
        verb_at = None
        verb_word = None
        noun_at = None
        for position, pair in enumerate(tagged):
            word = pair[0]
            if word in negating_verbs:
                verb_at, verb_word = position, word
            elif word in negating_nouns:
                noun_at = position

        # verbs inside the window right after the negating word
        if verb_at is not None:
            following = tagged[verb_at + 1:verb_at + 1 + verb_span]
            pieces.extend(verb_word + pair[0] for pair in following
                          if pair[1].startswith('V'))

        # nouns inside the window right after 'without'
        if noun_at is not None:
            following = tagged[noun_at + 1:noun_at + 1 + noun_span]
            pieces.extend("without" + pair[0] for pair in following
                          if pair[1].startswith('N'))

        # back to sentence as a string
        results.append(' '.join(pieces))

    return results
def main(f_1, f_2):
    """
    Extracts NER stats from 2 text files

    Reads and cleans both files, runs named-entity recognition on their
    sentences through the ``NE`` helpers, and appends to the two columns of
    the HTML skeleton returned by ``OUT.init_html()``: the ten most frequent
    PERSON entities, the most frequent links found for them, and one
    "probable sentence" (8 to 12 tokens long) per person.  The result is
    written to 'output/index_2.html'.

    f_1, f_2 : paths of the first and second text file.

    The book titles and authors written into the report are hard-coded.
    """
    ######################################
    # Text extraction
    ######################################
    with open(f_1, 'r') as f:
        t_1 = f.read()
    with open(f_2, 'r') as f:
        t_2 = f.read()

    ######################################
    # Text cleaning
    ######################################
    t_1 = cl.clean_text(t_1)
    t_2 = cl.clean_text(t_2)

    # Initialize output
    output = OUT.init_html()
    soup = BeautifulSoup(output, 'html.parser')
    col_1 = soup.find('div', {'id': 'text-1-container'})
    col_2 = soup.find('div', {'id': 'text-2-container'})

    title_1 = soup.new_tag('p', attrs={'id': 'title-1'})
    title_1.insert(1, 'Alice in Wonderland')
    title_2 = soup.new_tag('p', attrs={'id': 'title-2'})
    title_2.insert(1, 'Second Variety')
    author_1 = soup.new_tag('p', attrs={'id': 'author-1'})
    author_1.insert(1, 'Lewis Carroll')
    author_2 = soup.new_tag('p', attrs={'id': 'author-2'})
    author_2.insert(1, 'Philip Kindred Dick')
    col_1.append(title_1)
    col_1.append(author_1)
    col_2.append(title_2)
    col_2.append(author_2)

    ######################################
    # Extracting the useful elements
    ######################################
    # Sentence tokens
    sents_1 = st.tokenize(t_1)
    sents_2 = st.tokenize(t_2)

    # The per-sentence word tokens that used to be built here were never read
    # (and were truncated to the shorter text by zip), so they are gone.

    # Word tokens (unique list)
    toks_1 = wt(t_1)
    toks_2 = wt(t_2)

    # NER tagging
    POS_NE_1 = NE.ner_sents(sents_1)
    POS_NE_2 = NE.ner_sents(sents_2)

    PERSON_sents_1 = NE.get_ner_sents(POS_NE_1, ['PERSON'])
    PERSON_sents_2 = NE.get_ner_sents(POS_NE_2, ['PERSON'])

    # Get the entities of specific NER_tag
    ENTITIES_1 = NE.get_ner_entities(PERSON_sents_1, ['PERSON'])
    ENTITIES_2 = NE.get_ner_entities(PERSON_sents_2, ['PERSON'])

    PEOPLE_1 = LC.rank(ENTITIES_1['PERSON'])[:10]
    PEOPLE_2 = LC.rank(ENTITIES_2['PERSON'])[:10]

    # Print most frequent characters
    nc_1 = OUT.complex_container('Most frequent characters', PEOPLE_1, soup)
    nc_2 = OUT.complex_container('Most frequent characters', PEOPLE_2, soup)
    col_1.append(nc_1)
    col_2.append(nc_2)
    print('added Most frequent characters to project/index.html')

    # Extract only sentence in which appear the selected PEOPLE
    PEOPLE_1 = tool.tup_2_list(PEOPLE_1)
    PEOPLE_2 = tool.tup_2_list(PEOPLE_2)

    useful_sents_1 = NE.meaningful_sents(PERSON_sents_1, ['PERSON'], PEOPLE_1)
    useful_sents_2 = NE.meaningful_sents(PERSON_sents_2, ['PERSON'], PEOPLE_2)

    ######################################
    # Extracting informations
    ######################################
    # Now that we have extracted the useful sentences, we can begin to mine
    # the information inside them.

    # NER_tags to look for in the sentence
    NER_tags = ['PERSON', 'LOC', 'GPE', 'DATE', 'TIME']
    POS_tags = [
        'NN', 'NNP', 'NNPS', 'NNS',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    ]

    # For every sent: find the Person and
    # {PERSON_1 : [link1: freq, link2: freq], PERSON_2: [link1: freq, link2: freq]}
    links_1 = NE.extract_links(useful_sents_1, {'PERSON': PEOPLE_1}, NER_tags, POS_tags)
    links_2 = NE.extract_links(useful_sents_2, {'PERSON': PEOPLE_2}, NER_tags, POS_tags)

    # Extract the frequency of each link
    links_freqs_1 = NE.extract_link_freqs(links_1)
    links_freqs_2 = NE.extract_link_freqs(links_2)

    # with open('tests/links_freqs_1.txt', 'w') as output:
    #     output.write(str(links_freqs_1))

    # Extract the most frequent items for each field
    most_freq_1 = NE.rank_link_freqs(links_freqs_1, 10)
    most_freq_2 = NE.rank_link_freqs(links_freqs_2, 10)

    # Print infos TEXT_1
    for NER in most_freq_1:
        for leaf in most_freq_1[NER]:
            for link_type in most_freq_1[NER][leaf]:
                nc_1 = OUT.complex_container(leaf + ' + ' + link_type, most_freq_1[NER][leaf][link_type], soup)
                col_1.append(nc_1)
                print('added', leaf, ' + ', link_type, 'to project/index.html')

    # Print infos TEXT_2
    for NER in most_freq_2:
        for leaf in most_freq_2[NER]:
            for link_type in most_freq_2[NER][leaf]:
                nc_2 = OUT.complex_container(leaf + ' + ' + link_type, most_freq_2[NER][leaf][link_type], soup)
                col_2.append(nc_2)
                print('added', leaf, ' + ', link_type, 'to project/index.html')

    # Get the max-prob sentence of length between 8 and 12 tokens
    # Select sents of length between 8 and 12 tokens
    selected_sents_1 = tool.sents_of_len(NE.trees_2_toks(useful_sents_1), 8, 12)
    selected_sents_2 = tool.sents_of_len(NE.trees_2_toks(useful_sents_2), 8, 12)

    sents_4_person_1 = NE.assign_sent_2_person(selected_sents_1, PEOPLE_1)
    sents_4_person_2 = NE.assign_sent_2_person(selected_sents_2, PEOPLE_2)

    # Compute the probability for each sentence for each important person in the text
    # NOTE(review): len(t_1) / len(t_2) are character counts of the cleaned
    # text, not token counts -- confirm this is what get_max_markow expects.
    max_markow_1 = {}
    for person in sents_4_person_1:
        max_markow_1[person] = NE.get_max_markow(sents_4_person_1[person], fd(toks_1), len(t_1), False)

    max_markow_2 = {}
    for person in sents_4_person_2:
        max_markow_2[person] = NE.get_max_markow(sents_4_person_2[person], fd(toks_2), len(t_2), False)

    # Print Markov probabilities
    for person in max_markow_1:
        nc_1 = OUT.tok_sent_container('Probable sentence for ' + person, max_markow_1[person], soup)
        col_1.append(nc_1)
        print('added ' + 'Probable sentence for ' + person + 'to project/index.html')

    for person in max_markow_2:
        nc_2 = OUT.tok_sent_container('Probable sentence for ' + person, max_markow_2[person], soup)
        col_2.append(nc_2)
        print('added ' + 'Probable sentence for ' + person + 'to project/index.html')

    # Final prints
    title_1 = soup.new_tag('p', attrs={'id': 'book-1'})
    title_1.insert(1, 'Alice in Wonderland')
    soup.body.append(title_1)

    title_2 = soup.new_tag('p', attrs={'id': 'book-2'})
    title_2.insert(1, 'Second Variety')
    soup.body.append(title_2)

    # Exporting output
    with open('output/index_2.html', 'w') as h:
        h.write(str(soup))
def sim_sem_firstsense_pos(sentence1, sentence2, metric=wn.jcn_similarity, ic=brown_ic):
    """Average pairwise first-sense WordNet similarity of two sentences.

    Each sentence is lower-cased, stripped of every character other than
    ``a-z``, ``-``, ``'`` and whitespace, and tokenized with ``wt``.  Every
    non-stopword token known to WordNet is mapped to its first sense, and
    ``metric`` is averaged over all pairs of distinct synsets (one from each
    sentence).

    Args:
        sentence1, sentence2: the sentences to compare (strings).
        metric: WordNet similarity function, called as
            ``metric(s1, s2, ic)`` when ``ic`` is given, else ``metric(s1, s2)``.
        ic: information-content dictionary, or None for metrics that take none.

    Returns:
        The mean similarity over all synset pairs; 0.0 when either sentence
        contributes no synset.

    NOTE(review): despite the name, no POS filtering takes place.  The
    previous code POS-tagged both sentences and collected the verbs, nouns,
    adverbs and adjectives, but then built the synsets from the unfiltered
    tokens, so the tagging never influenced the score.  That dead (and
    expensive) tagging was removed; the returned values are unchanged.
    Decide whether the synsets should be restricted to POS-filtered terms.
    """
    stop_words = set(stopwords.words("english"))

    # lowercase + remove punctuation, then tokenize
    not_allowed = r"[^a-z\-' \n\t]"
    line1 = wt(re.sub(not_allowed, "", sentence1.lower()))
    line2 = wt(re.sub(not_allowed, "", sentence2.lower()))

    def first_senses(tokens):
        # first WordNet sense of every non-stopword token that has one
        senses_found = []
        for token in tokens:
            if token in stop_words:
                continue
            senses = wn.synsets(token)
            if senses:
                senses_found.append(senses[0])
        return senses_found

    synsets1 = set(first_senses(line1))
    synsets2 = set(first_senses(line2))

    runningscore = 0.0
    runningcount = 0

    # similarity mark for each pair created from both synset sets
    for synset1 in synsets1:
        for synset2 in synsets2:
            try:
                if ic is not None:
                    mark = metric(synset1, synset2, ic)
                else:
                    mark = metric(synset1, synset2)
            except Exception:
                # the metric is undefined for this pair; count it as no similarity
                mark = 0.0
            if mark is None:
                mark = 0.0
            # handle infinity mark for jcn measure
            if ic is not None and mark == 1e300:
                mark = 1.0
            runningscore += mark
            runningcount += 1

    if runningcount == 0:
        # no synset pair to score; avoid dividing by zero
        return 0.0

    # add up individual scores, divide by number of individual scores
    return runningscore / runningcount
import nltk
import sklearn
from nltk import word_tokenize as wt

# --------- optional start ------------ #
# Read the corpus and close the file handle as soon as the text is in memory.
with open('NEWS.txt', encoding='utf-8') as f:
    raw = f.read()

# WORD TOKENIZER
tokens = wt(raw)

# CREATE NLTK TEXT
txt = nltk.Text(tokens)
# ----------optional end -------------- #

# CLASSIFIER
from nltk.corpus import names


# SINGLE FEATURE GENERATOR & CLASSIFICATION #
# ------------------- start --------------- #
def feature_generator(word):
    """Return the feature dict of *word*: its last letter under 'last_letter'."""
    return {'last_letter': word[-1]}


print(feature_generator('praveen'))

import random

# From here on `names` is the labelled list of (name, gender) pairs; it
# replaces the corpus reader imported above.
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
def preprocess_marco(input_file="../data/dev_v1.1.json.gz", vocab_file="../data/vocab_marco.json", data_file="../data/marco_dev.txt", max_sent_len=20, max_doc_len=100, vocab_size=10000):
    """Convert a gzipped MS MARCO JSON-lines file into a text file and a vocabulary.

    Every input line is a JSON object with the keys "query", "passages"
    (each with "passage_text" and "is_selected") and "answers".  All queries,
    passage texts and answers are lower-cased, tokenized with ``wt`` and
    counted for the vocabulary.  Only examples with exactly one answer are
    written to ``data_file``, as one tab-separated line: the query (first
    ``max_sent_len`` tokens), each selected passage (first ``max_doc_len``
    tokens), then the answer (first ``max_sent_len`` tokens).

    ``vocab_file`` receives a JSON object with "index_to_token" and
    "token_to_index" for the ``vocab_size - 1`` most frequent tokens (indices
    start at 1, ties keep first-seen order) plus "UNK" at index ``vocab_size``.

    Args:
        input_file: path of the gzipped JSON-lines input.
        vocab_file: path of the vocabulary JSON to write.
        data_file: path of the tab-separated text file to write.
        max_sent_len: maximum number of query / answer tokens written.
        max_doc_len: maximum number of passage tokens written.
        vocab_size: vocabulary size including the "UNK" entry.
    """
    from collections import Counter

    vocab = Counter()

    # The context managers close all three files even if a line fails to
    # parse; the input is streamed line by line instead of loaded at once.
    with gzip.open(input_file, 'r') as reader, \
            codecs.open(vocab_file, 'w', "utf-8") as vocab_writer, \
            codecs.open(data_file, "w", "utf-8") as data_writer:
        for line in reader:
            content = json.loads(line.decode("utf-8"))
            passages = content["passages"]
            answers = content["answers"]
            # only single-answer examples are exported
            single_answer = len(answers) == 1

            query_token = [t.lower() for t in wt(content["query"].lower())]
            vocab.update(query_token)
            if single_answer:
                data_writer.write(' '.join(query_token[:max_sent_len]) + '\t')

            for passage in passages:
                tokens = wt(passage["passage_text"].lower())
                vocab.update(tokens)
                if passage["is_selected"] and single_answer:
                    data_writer.write(' '.join(tokens[:max_doc_len]) + '\t')

            for answer in answers:
                answer_token = wt(answer.lower())
                vocab.update(answer_token)
                if single_answer:
                    data_writer.write(' '.join(answer_token[:max_sent_len]) + '\n')

        # most frequent first; the stable sort keeps first-seen order for ties
        sorted_count = sorted(vocab.items(), key=lambda t: t[1], reverse=True)
        top_tokens = [token for token, _count in sorted_count[:vocab_size - 1]]

        json_out = {
            "index_to_token": {i + 1: t for i, t in enumerate(top_tokens)},
            "token_to_index": {t: i + 1 for i, t in enumerate(top_tokens)},
        }
        json_out["index_to_token"][vocab_size] = "UNK"
        json_out["token_to_index"]["UNK"] = vocab_size
        json.dump(json_out, vocab_writer)
def ExtractFeature1(processedfile, wordlist1=False, wordlist2=False, wordlist3=False):
    """
    Build one feature vector per comment of a preprocessed CSV file.

    processedfile : csvfile, (output of 'PreprocessCSV') with the columns
                    "Insult" (label) and "Comment" (text)
    wordlist1/2/3 : list of unigrams / bigrams / trigrams to count; bigrams
                    and trigrams are the tokens concatenated without a
                    separator.  A false value disables that feature group.

    The vector of a comment is the concatenation of:
      * the count of every wordlist1 entry among its tokens (if wordlist1),
      * one counter that grows by 1 for every token found in BW and by 1 for
        every upper-case token,
      * one flag set to 1 when that counter equals 1 and "you" occurs in the
        comment text,
      * the bigram counts for wordlist2 (if wordlist2),
      * the trigram counts for wordlist3 (if wordlist3).

    Returns (labels, features): two parallel lists.
    """
    dataframe = pandas.read_csv(processedfile, usecols=["Insult", "Comment"])
    # Select by name: usecols does not reorder columns, so positional access
    # would swap labels and comments if "Comment" came first in the file.
    labels = dataframe["Insult"].tolist()
    sents = dataframe["Comment"].tolist()

    def first_positions(wordlist):
        # entry -> index of its first occurrence (what list.index returns)
        positions = {}
        if wordlist:
            for position, entry in enumerate(wordlist):
                positions.setdefault(entry, position)
        return positions

    # built once so every lookup is O(1) instead of a list scan
    unigram_pos = first_positions(wordlist1)
    bigram_pos = first_positions(wordlist2)
    trigram_pos = first_positions(wordlist3)

    features = []
    for sent in sents:  # every sent generates a feature vector
        words = wt(sent)
        n_words = len(words)
        sent_fea = []

        # first part feature: unigram counts
        if wordlist1:
            cur_fea1 = [0] * len(wordlist1)
            for word in words:
                fea_ind = unigram_pos.get(word)
                if fea_ind is not None:
                    cur_fea1[fea_ind] += 1
            sent_fea += cur_fea1

        # tokens found in BW plus upper-case tokens
        flagged = 0
        for word in words:
            if word in BW:
                flagged += 1
            if word.isupper():
                flagged += 1
        sent_fea.append(flagged)

        # exactly one flagged token together with "you" in the raw comment
        sent_fea.append(1 if flagged == 1 and "you" in sent else 0)

        # bigram feature
        # NOTE(review): the previous code repeated the bigram scan once per
        # token, so every match was counted len(words) times, and it skipped
        # comments of exactly two tokens.  Both are kept here so the feature
        # values stay unchanged; confirm whether plain counts were intended.
        if wordlist2:
            cur_bgram = [0] * len(wordlist2)
            if n_words > 2:
                for first, second in zip(words, words[1:]):
                    fea_ind = bigram_pos.get(first + second)
                    if fea_ind is not None:
                        cur_bgram[fea_ind] += n_words
            sent_fea += cur_bgram

        # trigram feature
        # NOTE(review): same len(words) scaling as the bigrams; comments of
        # exactly three tokens are skipped.
        if wordlist3:
            cur_trigram = [0] * len(wordlist3)
            if n_words > 3:
                for first, second, third in zip(words, words[1:], words[2:]):
                    fea_ind = trigram_pos.get(first + second + third)
                    if fea_ind is not None:
                        cur_trigram[fea_ind] += n_words
            sent_fea += cur_trigram

        features.append(sent_fea)

    print("labels and features are extracted from file %s." % processedfile)
    return labels, features
def sim_sem_intermax(sentence1, sentence2, metric=wn.jcn_similarity, ic=brown_ic):
    """Symmetric inter-sentence max similarity based on WordNet.

    Each sentence is lower-cased, stripped of every character other than
    ``a-z``, ``-``, ``'`` and whitespace, tokenized and POS tagged.  Only
    verbs, nouns, adverbs and adjectives that WordNet knows are kept (no
    stopword elimination).  ``inter_sentence_max`` is then called with the
    kept terms of one sentence and all synsets of the other sentence's kept
    terms, in both directions, and the two returned score lists are averaged
    together.

    Args:
        sentence1, sentence2: the sentences to compare (strings).
        metric: WordNet similarity function forwarded to ``inter_sentence_max``.
        ic: information-content dictionary forwarded to ``inter_sentence_max``.

    Returns:
        The mean of all scores from both directions; 0.0 when either sentence
        has no usable term or no score is produced.
    """
    content_tags = ('V', 'N', 'R', 'J')  # verbs, nouns, adverbs, adjectives
    not_allowed = r"[^a-z\-' \n\t]"

    def content_terms(sentence):
        # kept terms of one sentence and the synsets of all their senses
        tagged = pos_tag(wt(re.sub(not_allowed, "", sentence.lower())))
        terms = []
        synsets = []
        for term, tag in tagged:
            if not tag.startswith(content_tags):
                continue
            senses = wn.synsets(term)
            # get rid of terms that do not appear in WordNet
            if senses:
                terms.append(term)
                synsets.extend(senses)
        return terms, synsets

    filtered_line1, synset_list1 = content_terms(sentence1)
    filtered_line2, synset_list2 = content_terms(sentence2)

    if not filtered_line1 or not filtered_line2:
        # nothing to compare against (this used to raise inside reduce())
        return 0.0

    # max score lists from the inter max function
    max_score_list1 = inter_sentence_max(filtered_line1, synset_list2, metric=metric, ic=ic)
    max_score_list2 = inter_sentence_max(filtered_line2, synset_list1, metric=metric, ic=ic)

    score_count = len(max_score_list1) + len(max_score_list2)
    if score_count == 0:
        return 0.0
    return (sum(max_score_list1) + sum(max_score_list2)) / score_count

# # Test
# list1 = load_sentences('data_not_sell')
# list2 = load_sentences('data_sell_share')
# sentence1 = list1[0]
# sentence2 = list2[1]
# brown_ic = wordnet_ic.ic('ic-brown.dat')
# semcor_ic = wordnet_ic.ic('ic-semcor.dat')
# # sim_sem_intermax(sentence1, sentence2)
# score = sim_sem_intermax(sentence1, sentence2)
# print 'path: ', score
# score = sim_sem_intermax(sentence1, sentence2, metric=wn.lch_similarity)
# print 'lch : ', score
# score = sim_sem_intermax(sentence1, sentence2, metric=wn.wup_similarity)
# print 'wup : ', score
# score = sim_sem_intermax(sentence1, sentence2, metric=wn.res_similarity, ic=brown_ic)
# print 'res - brown : ', score
# score = sim_sem_intermax(sentence1, sentence2, metric=wn.res_similarity, ic=semcor_ic)
# print 'res - semcor : ', score
# score = sim_sem_intermax(sentence1, sentence2, metric=wn.jcn_similarity, ic=brown_ic)
# print 'jcn : ', score
# score = sim_sem_intermax(sentence1, sentence2, metric=wn.lin_similarity, ic=brown_ic)
# print 'lin : ', score

# Sample results:
# sentence1 = list1[0]
# sentence2 = list2[1]
# path: 0.511742424242
# lch : 2.37924751823
# wup : 0.715648844878
# res - brown : 5.9252699315
# res - semcor : 6.82379313536
# jcn : 0.693656881745
# lin : 0.662626674403
def sim_wordorder(sentence1, sentence2, threshold=0.3, metric=wn.path_similarity, ic=None):
    """Word-order similarity of two sentences.

    Each sentence is lower-cased, stripped of every character other than
    ``a-z``, ``-``, ``'`` and whitespace, and tokenized with ``wt``.  The
    joint word list J holds the distinct tokens of both sentences in order of
    first appearance.  ``calculate_word_order_vector`` produces one
    word-order vector per sentence over J, and the similarity is

        1 - |r1 - r2| / |r1 + r2|

    Args:
        sentence1, sentence2: the sentences to compare (strings).
        threshold, metric, ic: forwarded unchanged to
            ``calculate_word_order_vector``.

    Returns:
        The word-order similarity; 0.0 when ``r1 + r2`` has zero length
        (for example when both sentences are empty), which previously
        produced NaN.
    """
    # lowercase + remove punctuation, then tokenize
    not_allowed = r"[^a-z\-' \n\t]"
    line1 = wt(re.sub(not_allowed, "", sentence1.lower()))
    line2 = wt(re.sub(not_allowed, "", sentence2.lower()))

    # joint list: distinct tokens in order of first appearance
    # (set(line1).union(line2) would lose that order)
    J = []
    seen = set()
    for word in line1 + line2:
        if word not in seen:
            seen.add(word)
            J.append(word)

    r1 = calculate_word_order_vector(J, line1, threshold, metric, ic)
    r2 = calculate_word_order_vector(J, line2, threshold, metric, ic)

    # Similarity calculation given word order vector r1 and r2
    x = np.array(r1)
    y = np.array(r2)

    # difference and sum
    diff = x - y
    summ = x + y

    # modulus
    diff_modulus = np.sqrt((diff * diff).sum())
    summ_modulus = np.sqrt((summ * summ).sum())

    if summ_modulus == 0:
        # no word-order information at all; avoid 0/0
        return 0.0

    # final similarity
    return 1 - (diff_modulus / summ_modulus)

# # Test
# # T1 = 'A quick brown dog jumps over the lazy fox.'
# # T2 = 'A quick blue fox jumps over the lazy dog.'
# # score = sim_wordorder(T1,T2)
# # print score

# # Test
# list1 = load_sentences('data_not_sell')
# list2 = load_sentences('data_sell_share')
# sentence1 = list1[0]
# sentence2 = list2[1]
# brown_ic = wordnet_ic.ic('ic-brown.dat')
# semcor_ic = wordnet_ic.ic('ic-semcor.dat')
# # sim_wordorder(sentence1, sentence2)
# score = sim_wordorder(sentence1, sentence2)
# print 'path: ', score
# score = sim_wordorder(sentence1, sentence2, metric=wn.lch_similarity)
# print 'lch : ', score
# score = sim_wordorder(sentence1, sentence2, metric=wn.wup_similarity)
# print 'wup : ', score
# score = sim_wordorder(sentence1, sentence2, metric=wn.res_similarity, ic=brown_ic)
# print 'res - brown : ', score
# score = sim_wordorder(sentence1, sentence2, metric=wn.res_similarity, ic=semcor_ic)
# print 'res - semcor : ', score
# score = sim_wordorder(sentence1, sentence2, metric=wn.jcn_similarity, ic=brown_ic)
# print 'jcn : ', score
# score = sim_wordorder(sentence1, sentence2, metric=wn.lin_similarity, ic=brown_ic)
# print 'lin : ', score

# # Sample results:
# # sentence1 = list1[0]
# # sentence2 = list2[1]
# # path: 0.19306769657
# # lch : 0.250567652287
# # wup : 0.256855951338
# # res - brown : 0.252420961067
# # res - semcor : 0.252420961067
# # jcn : 0.180138258853
# # lin : 0.310539825618
# Demo: run every preprocessing / term-extraction step on a sample sentence.
text2 = "I am reading that privacy policy"

texts = ["We do not rent, sell, or share any of this information with third party companies.",
         "We do not rent, sell, or share any information about the user with any third-parties. ",
         "We do not, under any circumstances, share, sell or rent your information to anyone. ",
         "We never share or sell your personal information",
         "We neither rent nor sell your Personal Information to anyone",
         "As a general rule, Blizzard will not forward your information to a third party without your permission."]

# print extractor(text)
# tokens = wt(text)
# print tokens
# pos_line = pos_tag(tokens)
# print pos_line
# tagged = pos_tagging([text])
# print tagged

# NOTE(review): `text` is not assigned in this part of the file (only `text2`
# and `texts` are) -- confirm it is defined earlier, or point these calls at
# the intended sample.
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print(wt(text))
print(pos_tag(wt(text)))
print(stemming([text]))
print(pos_lemmatizing([text]))
print(select_by_pos([text]))
print(negation_bigram([text]))
print(term_extraction([text]))
print(pos_bagging([text]))
print(pos_tagging([text]))
print(sem_firstsense([text]))
print(sem_wsd_sentence([text]))

# stemmer = PorterStemmer()
# lemmatizer = WordNetLemmatizer()
#
# terms = ["best", "better", "goods"]
def negation_bigram(line_list):
    """
    Append negation bigrams to the tokens of each line.

    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Every line is lower-cased, tokenized and POS tagged.  The last negating
    word of the line ('not', 'never', 'neither') is glued in front of every
    verb among the 10 tokens that follow it, and "without" in front of every
    noun among the 3 tokens that follow the last 'without'.  These bigram
    terms are appended after the line's own tokens.

    Return: list of strings, one space-joined string per input line
    """
    verb_negators = ['not', 'never', 'neither']
    noun_negators = ['without']
    verb_reach = 10
    noun_reach = 3

    augmented = []
    for sentence in line_list:
        # Having punctuation removal before POS seems to be a bad idea, so the
        # text is only lower-cased.  Options that were tried:
        #   ''.join([c for c in line if re.match("[a-z\-\' \n\t]", c)])
        #       (simply removes punctuation, but amazon.com => amazoncom)
        #   re.sub('[^A-Za-z0-9]+', ' ', line)
        #       (solves the problem above)
        words = wt(sentence.lower())

        # tag first: the bigram terms are appended to `words` afterwards
        tagged = pos_tag(words)

        # locate the flag words; a later occurrence overrides an earlier one
        verb_at = None
        verb_word = None
        noun_at = None
        for position, pair in enumerate(tagged):
            word = pair[0]
            if word in verb_negators:
                verb_at, verb_word = position, word
            elif word in noun_negators:
                noun_at = position

        # verbs inside the window right after the negating word
        if verb_at is not None:
            following = tagged[verb_at + 1:verb_at + 1 + verb_reach]
            words.extend(verb_word + pair[0] for pair in following
                         if pair[1].startswith('V'))

        # nouns inside the window right after 'without'
        if noun_at is not None:
            following = tagged[noun_at + 1:noun_at + 1 + noun_reach]
            words.extend("without" + pair[0] for pair in following
                         if pair[1].startswith('N'))

        # back to sentence as a string
        augmented.append(' '.join(words))

    return augmented