# Library imports assumed by these snippets; dt, bf and pre are
# project-specific modules (dataframe getters/setters and the earlier
# preprocessing steps) whose source is not shown here.
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer

def jac():
    l = []
    df = word_shear()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    lemmatizer = WordNetLemmatizer()
    for i in range(len(q1)):
        # Lemmatize both questions so inflected forms compare equal.
        words1 = [lemmatizer.lemmatize(w) for w in nltk.word_tokenize(q1[i])]
        words2 = [lemmatizer.lemmatize(w) for w in nltk.word_tokenize(q2[i])]
        # Jaccard similarity |A ∩ B| / |A ∪ B|, with the union obtained by
        # inclusion-exclusion over the two set sizes.
        s1, s2 = set(words1), set(words2)
        count = len(s1 & s2)
        z = round(float(count) / (len(s1) + len(s2) - count), 2)
        l.append(z)
    df.insert(14, 'jakard', " ")
    dt.setjakard(df, l)
    return df
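For reference, the Jaccard measure above can be checked standalone on a toy pair of questions (plain split(), no lemmatization): shared tokens over distinct tokens.

a = set("how do i learn python".split())
b = set("how can i learn python fast".split())
print(round(len(a & b) / len(a | b), 2))  # 4 shared / 7 distinct -> 0.57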
Code example #2
def compareDiffWords():
    l = []
    tokenizer = RegexpTokenizer(r'\w+')
    df = compareSameWords()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    for i in range(len(q1)):
        x = tokenizer.tokenize(q1[i])
        y = tokenizer.tokenize(q2[i])
        # Count positions where the questions disagree, then treat every
        # unmatched word in the longer question as a difference too.
        count = sum(1 for a, b in zip(x, y) if a != b)
        count += abs(len(x) - len(y))
        l.append(count)
    df.insert(10, 'diffwords', " ")
    dt.setcdw(df, l)
    return df
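A quick standalone check of the difference count on a toy pair (plain split(), no tokenizer):

x = "how do i learn python".split()
y = "how can i learn java quickly".split()
# 2 mismatched positions ("do"/"can", "python"/"java") plus 1 extra word
print(sum(1 for a, b in zip(x, y) if a != b) + abs(len(x) - len(y)))  # -> 3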
Code example #3
def punc():
    m = []
    df = lower()
    x = dt.getQues1(df)
    y = dt.getQues2(df)
    tokenizer = RegexpTokenizer(r'\w+')
    for i in range(len(x)):
        # Tokenizing on \w+ drops all punctuation; re-join into plain text.
        x[i] = " ".join(tokenizer.tokenize(x[i]))
        y[i] = " ".join(tokenizer.tokenize(y[i]))
        if not x[i] or not y[i]:
            # Remember rows where a question became empty after cleaning.
            m.append(i)
    x, y = list(x), list(y)
    # Delete from both question lists in reverse so indices stay valid,
    # and drop the same rows from the dataframe.
    for index in sorted(m, reverse=True):
        del x[index]
        del y[index]
    df = df.drop(df.index[m])
    dt.setQues1(df, x)
    dt.setQues2(df, y)
    print('punc')
    return df
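For reference, this is what the r'\w+' pattern does to punctuation: anything that is not a word character acts as a separator, so apostrophes and symbols disappear (nltk assumed installed).

from nltk.tokenize import RegexpTokenizer

tok = RegexpTokenizer(r'\w+')
print(" ".join(tok.tokenize("What's the best way to learn C++?")))
# -> "What s the best way to learn C"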
Code example #4
def lower():
    df = dt.readfile()
    x = dt.getQues1(df)
    y = dt.getQues2(df)
    # Lowercase every question so later comparisons are case-insensitive.
    for i in range(len(x)):
        x[i] = x[i].lower()
        y[i] = y[i].lower()
    dt.setQues1(df, x)
    dt.setQues2(df, y)
    print('lower')
    return df
def Fuz_sort():
    l = []
    df = Fuz()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    for i in range(len(q1)):
        # token_sort_ratio sorts the words before matching, so word order
        # does not matter; rescale the 0-100 score to 0-1.
        l.append(fuzz.token_sort_ratio(q1[i], q2[i]) / 100.0)
    df.insert(12, 'sortedfuzzy', " ")
    dt.setfuzzsort(df, l)
    return df
def Fuz():
    df = bf.compareDiffWords()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    l = []
    for i in range(len(q1)):
        # Plain edit-distance similarity on the raw strings, rescaled to 0-1.
        l.append(fuzz.ratio(q1[i], q2[i]) / 100.0)
    df.insert(11, 'fuzzywuzzy', " ")
    dt.setfuzz(df, l)
    return df
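A quick illustration of the two fuzzy scores (fuzzywuzzy assumed installed): fuzz.ratio compares the raw strings, while token_sort_ratio sorts the words first, so reordered questions still score 100.

from fuzzywuzzy import fuzz

print(fuzz.ratio("new york vs boston", "boston vs new york"))             # well below 100
print(fuzz.token_sort_ratio("new york vs boston", "boston vs new york"))  # 100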
Code example #7
def leng():
    l = []
    m = []
    df = pre.punc()
    x = dt.getQues1(df)
    y = dt.getQues2(df)
    # Character length of each question, stored as two new feature columns.
    for i in range(len(x)):
        l.append(len(x[i]))
        m.append(len(y[i]))
    df.insert(5, 'len q1', " ")
    df.insert(6, 'len q2', " ")
    dt.setlen1(df, l)
    dt.setlen2(df, m)
    return df
def levention():
    l = []
    df = jac()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    levenshtein = Levenshtein()
    for i in range(len(q1)):
        # Raw edit distance between the two full question strings.
        l.append(levenshtein.distance(q1[i], q2[i]))
    df.insert(15, 'levenshtein', " ")
    dt.setleven(df, l)
    return df
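The Levenshtein feature is the number of single-character edits between the two questions. The class-based distance(w1, w2) call above matches the strsimpy package's API, which is an assumption here since the snippet's own imports are not shown.

from strsimpy.levenshtein import Levenshtein  # assumed source of the class

lev = Levenshtein()
print(lev.distance("kitten", "sitting"))  # 3 edits: k->s, e->i, append g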
def word_shear():
    l = []
    df = Fuz_sort()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    tokenizer = RegexpTokenizer(r'\w+')
    for i in range(len(q1)):
        w1 = tokenizer.tokenize(q1[i])
        w2 = tokenizer.tokenize(q2[i])
        # Word share: words common to both questions over the total word
        # count. Note that `len(w1 and w2)` would only measure w2, because
        # `and` returns its second operand, so intersect explicit sets.
        common = len(set(w1) & set(w2))
        f = 1.0 * common / (len(w1) + len(w2))
        l.append(round(f, 2))
    df.insert(13, 'word_shear', " ")
    dt.setwordshear(df, l)
    return df
Code example #10
def compare():
    l = []
    tokenizer = RegexpTokenizer(r'\w+')
    df = leng_diff()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    for i in range(len(q1)):
        x = tokenizer.tokenize(q1[i])
        y = tokenizer.tokenize(q2[i])
        # fcw = "first common word": 1 if both questions open with the
        # same word, else 0.
        l.append(1 if x[0] == y[0] else 0)
    df.insert(8, 'fcw', " ")
    dt.setfcw(df, l)
    return df
def tf_idf():
    df = levention()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    # Fit one TF-IDF vocabulary over every question from both columns.
    corpus = []
    for i in range(len(q1)):
        corpus.append(q1[i])
        corpus.append(q2[i])
    vectorizer = TfidfVectorizer()
    vectorizer.fit(corpus)
    # Transform each column once into a sparse matrix, one row per question.
    m = vectorizer.transform(q1)
    n = vectorizer.transform(q2)
    # Distance/similarity features between the paired TF-IDF vectors
    # (cs, eq, pwm, pwl, mink and canb are the project's metric helpers).
    p = cs(m, n)
    q = eq(m, n)
    r = pwm(m, n)
    s = pwl(m, n)
    t = mink(m, n)
    u = canb(m, n)
    df.insert(16, 'cs', " ")
    df.insert(17, 'eq', " ")
    df.insert(18, 'pwm', " ")
    df.insert(19, 'pwl', " ")
    df.insert(20, 'mink', " ")
    df.insert(21, 'canb', " ")
    dt.setcs(df, p)
    dt.seteq(df, q)
    dt.setpwm(df, r)
    dt.setpwl(df, s)
    dt.setmink(df, t)
    dt.setcanb(df, u)
    # Reorder the feature columns; without the assignment the selection
    # would be computed and discarded.
    df = df[[
        'id', 'qid1', 'qid2', 'question1', 'question2', 'len q1', 'len q2',
        'len_diff', 'fcw', 'samewords', 'diffwords', 'fuzzywuzzy',
        'sortedfuzzy', 'word_shear', 'jakard', 'levenshtein', 'cs', 'eq',
        'pwm', 'pwl', 'mink', 'canb'
    ]]
    return df
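The TF-IDF step can be sketched on its own: fit one vocabulary over all questions, transform each column once, and compare the paired rows, here with scikit-learn's cosine_similarity standing in for the project's cs helper.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

q1 = ["how do i learn python", "what is ai"]
q2 = ["best way to learn python", "what is machine learning"]
vec = TfidfVectorizer().fit(q1 + q2)
m, n = vec.transform(q1), vec.transform(q2)
for i in range(len(q1)):
    # Row-wise similarity between question i of each column.
    print(round(cosine_similarity(m[i], n[i])[0, 0], 2))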