def jac():
    """Add a position-wise Jaccard-style score for every question pair.

    Starts from the frame returned by ``word_shear()``. Both questions of a
    row are tokenised with ``nltk.word_tokenize`` and lemmatised with a
    ``WordNetLemmatizer``. Lemmas are compared position by position (index 0
    against index 0, and so on), and the score is
    ``matches / (len1 + len2 - matches)`` rounded to two decimals.

    Returns:
        The frame, with a 'jakard' column inserted at position 14 and the
        scores handed to ``dt.setjakard``.
    """
    df = word_shear()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    lemmatizer = WordNetLemmatizer()

    scores = []
    for i in range(len(q1)):
        tokens1 = nltk.word_tokenize(q1[i])
        tokens2 = nltk.word_tokenize(q2[i])
        lemmas1 = [lemmatizer.lemmatize(tok) for tok in tokens1]
        lemmas2 = [lemmatizer.lemmatize(tok) for tok in tokens2]

        # zip() stops at the shorter list, so only positions present in
        # both questions are compared.
        matches = sum(1 for a, b in zip(lemmas1, lemmas2) if a == b)
        denominator = len(lemmas1) + len(lemmas2) - matches
        scores.append(round(float(matches) / denominator, 2))

    df.insert(14, 'jakard', " ")
    dt.setjakard(df, scores)
    return df
def compareDiffWords():
    """Count, per question pair, the word positions that do not agree.

    Starts from the frame returned by ``compareSameWords()``. Each question
    is split into ``\\w+`` tokens. A position counts as different when the
    two tokens at that index differ; every surplus token of the longer
    question also counts as one difference.

    Returns:
        The frame, with a 'diffwords' column inserted at position 10 and the
        counts handed to ``dt.setcdw``.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    df = compareSameWords()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)

    diff_counts = []
    for i in range(len(q1)):
        words1 = tokenizer.tokenize(q1[i])
        words2 = tokenizer.tokenize(q2[i])
        # Mismatches over the shared prefix length ...
        mismatches = sum(1 for a, b in zip(words1, words2) if a != b)
        # ... plus the tokens that have no counterpart at all.
        surplus = abs(len(words1) - len(words2))
        diff_counts.append(mismatches + surplus)

    df.insert(10, 'diffwords', " ")
    dt.setcdw(df, diff_counts)
    return df
def punc():
    """Strip punctuation from both questions and drop rows left empty.

    Starts from the frame returned by ``lower()``. Each question is split
    into ``\\w+`` tokens and re-joined with single spaces, which removes
    punctuation. A row is discarded when either of its questions has no
    tokens at all.

    The discarded rows are removed from the frame and from BOTH question
    lists, so the values passed to ``dt.setQues1`` and ``dt.setQues2`` have
    one entry per remaining row. (Previously only the first question list
    was filtered, leaving the second one longer than the frame.)

    Returns:
        The filtered frame with the cleaned questions written back through
        ``dt.setQues1`` / ``dt.setQues2``.
    """
    df = lower()
    x = dt.getQues1(df)
    y = dt.getQues2(df)
    tokenizer = RegexpTokenizer(r'\w+')

    empty_rows = []
    for i in range(len(x)):
        tokens1 = tokenizer.tokenize(x[i])
        tokens2 = tokenizer.tokenize(y[i])
        if not tokens1 or not tokens2:
            empty_rows.append(i)
        x[i] = " ".join(tokens1)
        y[i] = " ".join(tokens2)

    # Work on plain lists so positional deletion is well defined, and keep
    # the two question lists aligned with each other and with the frame.
    x = list(x)
    y = list(y)
    # empty_rows is ascending; delete from the back so earlier positions
    # stay valid.
    for index in reversed(empty_rows):
        del x[index]
        del y[index]

    df = df.drop(df.index[empty_rows])
    dt.setQues1(df, x)
    dt.setQues2(df, y)
    print('punc')
    return df
def lower():
    """Load the data through ``dt.readfile()`` and lower-case both questions.

    Every entry obtained from ``dt.getQues1`` / ``dt.getQues2`` is replaced
    in place by its ``str.lower()`` form, then written back with
    ``dt.setQues1`` / ``dt.setQues2``.

    Returns:
        The frame produced by ``dt.readfile()`` after the write-back.
    """
    df = dt.readfile()
    ques1 = dt.getQues1(df)
    ques2 = dt.getQues2(df)

    for row in range(len(ques1)):
        ques1[row] = ques1[row].lower()
        ques2[row] = ques2[row].lower()

    dt.setQues1(df, ques1)
    dt.setQues2(df, ques2)
    print('lower')
    return df
def Fuz_sort():
    """Add the ``fuzz.token_sort_ratio`` score of each question pair.

    Starts from the frame returned by ``Fuz()``. The ratio reported by
    ``fuzz.token_sort_ratio`` is divided by 100 before being stored.

    Returns:
        The frame, with a 'sortedfuzzy' column inserted at position 12 and
        the scores handed to ``dt.setfuzzsort``.
    """
    df = Fuz()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)

    scores = [
        fuzz.token_sort_ratio(q1[i], q2[i]) / 100
        for i in range(len(q1))
    ]

    df.insert(12, 'sortedfuzzy', " ")
    dt.setfuzzsort(df, scores)
    return df
def Fuz():
    """Add the ``fuzz.ratio`` score of each question pair.

    Starts from the frame returned by ``bf.compareDiffWords()``. The ratio
    reported by ``fuzz.ratio`` is divided by 100 before being stored.

    Returns:
        The frame, with a 'fuzzywuzzy' column inserted at position 11 and
        the scores handed to ``dt.setfuzz``.
    """
    df = bf.compareDiffWords()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)

    scores = [fuzz.ratio(q1[i], q2[i]) / 100 for i in range(len(q1))]

    df.insert(11, 'fuzzywuzzy', " ")
    dt.setfuzz(df, scores)
    return df
def leng():
    """Add the ``len()`` of each question as two new columns.

    Starts from the frame returned by ``pre.punc()``.

    Returns:
        The frame, with 'len q1' inserted at position 5 and 'len q2' at
        position 6; the lengths are handed to ``dt.setlen1`` and
        ``dt.setlen2`` respectively.
    """
    df = pre.punc()
    ques1 = dt.getQues1(df)
    ques2 = dt.getQues2(df)

    rows = range(len(ques1))
    lengths1 = [len(ques1[i]) for i in rows]
    lengths2 = [len(ques2[i]) for i in rows]

    df.insert(5, 'len q1', " ")
    df.insert(6, 'len q2', " ")
    dt.setlen1(df, lengths1)
    dt.setlen2(df, lengths2)
    return df
def levention():
    """Add the Levenshtein distance between the two questions of each row.

    Starts from the frame returned by ``jac()``. Distances come from
    ``Levenshtein().distance`` applied to the raw question values.

    Returns:
        The frame, with a 'levenshtein' column inserted at position 15 and
        the distances handed to ``dt.setleven``.
    """
    df = jac()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    metric = Levenshtein()

    distances = [metric.distance(q1[i], q2[i]) for i in range(len(q1))]

    df.insert(15, 'levenshtein', " ")
    dt.setleven(df, distances)
    return df
def word_shear():
    """Add the normalised word-share score of each question pair.

    Starts from the frame returned by ``Fuz_sort()``. Each question is split
    into ``\\w+`` tokens and reduced to its set of unique words. The score is

        len(words1 & words2) / (len(words1) + len(words2))

    rounded to two decimals, so identical questions score 0.5 and questions
    with no word in common score 0.0.

    The previous expression ``len(w1 and w2)`` applied the boolean ``and``
    to two lists, which yields the second list (or the empty first list)
    rather than the shared words. A pair with no tokens on either side
    scores 0.0 instead of dividing by zero.

    Returns:
        The frame, with a 'word_shear' column inserted at position 13 and
        the scores handed to ``dt.setwordshear``.
    """
    df = Fuz_sort()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    tokenizer = RegexpTokenizer(r'\w+')

    shares = []
    for i in range(len(q1)):
        words1 = set(tokenizer.tokenize(q1[i]))
        words2 = set(tokenizer.tokenize(q2[i]))
        total = len(words1) + len(words2)
        if total:
            share = float(len(words1 & words2)) / total
        else:
            # Neither question produced a token; nothing can be shared.
            share = 0.0
        shares.append(round(share, 2))

    df.insert(13, 'word_shear', " ")
    dt.setwordshear(df, shares)
    return df
def compare():
    """Flag question pairs whose first word is the same.

    Starts from the frame returned by ``leng_diff()``. Each question is
    split into ``\\w+`` tokens; the flag is 1 when the first token of both
    questions is equal and 0 otherwise. A question with no tokens raises
    ``IndexError``.

    Returns:
        The frame, with an 'fcw' column inserted at position 8 and the
        flags handed to ``dt.setfcw``.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    df = leng_diff()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)

    flags = []
    for i in range(len(q1)):
        first1 = tokenizer.tokenize(q1[i])
        first2 = tokenizer.tokenize(q2[i])
        flags.append(1 if first1[0] == first2[0] else 0)

    df.insert(8, 'fcw', " ")
    dt.setfcw(df, flags)
    return df
def tf_idf():
    """Add six TF-IDF based columns computed from the question vectors.

    Starts from the frame returned by ``levention()``. A ``TfidfVectorizer``
    is fitted on every question (question 1 and question 2 of each row,
    interleaved). Each question is then transformed on its own, giving two
    parallel lists of vectors. Those lists are passed to the helpers ``cs``,
    ``eq``, ``pwm``, ``pwl``, ``mink`` and ``canb``, and each result is
    stored in a column of the same name (positions 16 to 21) through the
    matching ``dt.set*`` function.

    Returns:
        The frame with the six columns added.
    """
    df = levention()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    rows = range(len(q1))

    # Fit on the questions of both sides so they share one vocabulary.
    corpus = []
    for i in rows:
        corpus.extend((q1[i], q2[i]))
    vectorizer = TfidfVectorizer()
    vectorizer.fit(corpus)

    vectors1 = [vectorizer.transform([q1[i]]) for i in rows]
    vectors2 = [vectorizer.transform([q2[i]]) for i in rows]

    cs_values = cs(vectors1, vectors2)
    eq_values = eq(vectors1, vectors2)
    pwm_values = pwm(vectors1, vectors2)
    pwl_values = pwl(vectors1, vectors2)
    mink_values = mink(vectors1, vectors2)
    canb_values = canb(vectors1, vectors2)

    new_columns = ('cs', 'eq', 'pwm', 'pwl', 'mink', 'canb')
    for position, column in enumerate(new_columns, 16):
        df.insert(position, column, " ")

    dt.setcs(df, cs_values)
    dt.seteq(df, eq_values)
    dt.setpwm(df, pwm_values)
    dt.setpwl(df, pwl_values)
    dt.setmink(df, mink_values)
    dt.setcanb(df, canb_values)

    # NOTE(review): the result of this selection is discarded, so the
    # returned frame is not narrowed or reordered by it. Possibly
    # ``df = df[[...]]`` was intended -- confirm before changing, since that
    # would drop any column not listed here.
    df[[
        'id', 'qid1', 'qid2', 'question1', 'question2',
        'len q1', 'len q2', 'len_diff', 'fcw', 'samewords', 'diffwords',
        'fuzzywuzzy', 'sortedfuzzy', 'word_shear', 'jakard', 'levenshtein',
        'cs', 'eq', 'pwm', 'pwl', 'mink', 'canb',
    ]]
    return df