def calcMyScoreQP(sq, sp, w2z, all_questions2mystem, all_paragraphs2mystem, filter_pos={'S', 'A', 'V'}, agg_type='min', join_type='mul', matrixReturn=False):
    """Score a question against a paragraph from the overlap of ``w2z`` sets.

    The words of both texts are obtained with ``getMystemText`` (restricted
    to ``filter_pos``), de-duplicated with ``uniq_words`` and sorted.  For
    every (question word, paragraph word) pair, with ``set1``/``set2`` the
    sets stored in ``w2z`` (an empty set for unknown words), ``n1``/``n2``
    their sizes and ``n12`` the size of their intersection, the smoothed
    ratios are::

        p1  = (n12 + 0.1) / (n1 + 0.2)
        p2  = (n12 + 0.1) / (n2 + 0.2)
        p12 = (n12 + 0.1) / (n1 + n2 - n12 + 0.2)

    and the matrix cell is the square root of

    * ``p1 * p2``      for ``join_type='mul'`` and ``agg_type='min'``,
    * ``p12``          for ``join_type='mul'`` and ``agg_type='minimax'``,
    * ``max(p1, p2)``  for ``join_type='max'``.

    Args:
        sq: Question text, forwarded to ``getMystemText``.
        sp: Paragraph text, forwarded to ``getMystemText``.
        w2z: Mapping from a word to a set; sets are intersected pairwise.
        all_questions2mystem: Forwarded to ``getMystemText`` for ``sq``.
        all_paragraphs2mystem: Forwarded to ``getMystemText`` for ``sp``.
        filter_pos: Forwarded to ``getMystemText`` as ``filter_pos``.
            The default set is shared between calls and is never mutated
            here.
        agg_type: ``'min'`` (minimum over the whole matrix) or ``'minimax'``
            (minimum over question words of the row maximum).
        join_type: ``'mul'`` or ``'max'``, see above.
        matrixReturn: Only the literal ``False`` selects the scalar return.

    Returns:
        ``0`` when either text yields no words (regardless of
        ``matrixReturn``).  Otherwise the score, or, when ``matrixReturn``
        is not ``False``, the tuple ``(score, a, ttp, ttq)`` where rows of
        ``a`` follow ``ttq`` and columns follow ``ttp``.

    Raises:
        ValueError: If ``agg_type`` or ``join_type`` is not supported.
    """
    ttq = sorted(uniq_words(getMystemText(sq, all_questions2mystem, filter_pos=filter_pos)))
    ttp = sorted(uniq_words(getMystemText(sp, all_paragraphs2mystem, filter_pos=filter_pos)))

    # With no paragraph words the matrix would have zero columns and the
    # numpy reductions below would raise ValueError; treat it like an empty
    # question instead.
    if not ttq or not ttp:
        return 0

    # Validate before the O(len(ttq) * len(ttp)) loop so that a bad option
    # fails fast with a readable message.  A bare ``raise`` here would either
    # produce an opaque RuntimeError or re-raise an unrelated exception that a
    # caller happens to be handling.
    if join_type not in ('mul', 'max'):
        raise ValueError('unsupported join_type: %r' % (join_type,))
    if agg_type not in ('min', 'minimax'):
        raise ValueError('unsupported agg_type: %r' % (agg_type,))

    # Look every word up once instead of once per pair.
    sets_q = [w2z.get(t, set()) for t in ttq]
    sets_p = [w2z.get(t, set()) for t in ttp]

    a = np.zeros((len(ttq), len(ttp)))
    for i1, set1 in enumerate(sets_q):
        n1 = len(set1)
        for i2, set2 in enumerate(sets_p):
            n2 = len(set2)
            n12 = len(set1 & set2)
            p1 = (n12 + 0.1) / (n1 + 0.2)
            p2 = (n12 + 0.1) / (n2 + 0.2)
            p12 = (n12 + 0.1) / (n1 + n2 - n12 + 0.2)
            if join_type == 'max':
                a[i1, i2] = max(p1, p2)
            elif agg_type == 'min':
                a[i1, i2] = p1 * p2
            else:  # join_type == 'mul' and agg_type == 'minimax'
                a[i1, i2] = p12
            a[i1, i2] = a[i1, i2]**0.5

    if agg_type == 'min':
        score = np.min(a)
    else:  # 'minimax'
        score = np.min(np.max(a, axis=1))

    if matrixReturn is False:
        return score
    return score, a, ttp, ttq
def calcW2Spell(spel_cor, questions):
    """Build (or load from cache) the spelling-correction table for questions.

    If ``data/spell/w2spell.txt`` does not exist, every unique word of every
    question (via ``uniq_words``) is passed to ``spel_cor.correct`` once,
    corrections that differ from the input are printed, and only those
    corrected words are written to the cache file, one per line::

        word<TAB>corrected<TAB>cand;...<TAB>cand;...

    where each candidate is the ``';'``-joined ``str()`` of its items.

    The result is always read back from the cache file, so it contains only
    words whose correction differs from the original.

    Args:
        spel_cor: Object with ``correct(word) -> (corrected, candidates)``.
            Not used when the cache file already exists.
        questions: Sized iterable of question texts.  Not used when the cache
            file already exists.

    Returns:
        dict mapping ``word`` to ``[corrected, candidates]``, where each
        candidate is ``[str, int, int]`` parsed from the first three
        ``';'``-separated fields of a cached candidate.
    """
    cache_dir = 'data/spell'
    cache_path = 'data/spell/w2spell.txt'

    # makedirs also creates a missing parent ``data`` directory and does not
    # fail if the directory appears between the check and the creation.
    os.makedirs(cache_dir, exist_ok=True)

    if not os.path.exists(cache_path):
        w2spel = dict()
        for q in tqdm.tqdm_notebook(questions, total=len(questions)):
            for w in uniq_words(q):
                if w in w2spel:
                    continue
                wcor, cands = spel_cor.correct(w)
                w2spel[w] = [wcor, cands]
                if wcor != w:
                    print(w, wcor, cands[:5])
        with open(cache_path, 'w', encoding='utf-8') as fout:
            for w in sorted(w2spel):
                wcor, cands = w2spel[w]
                if wcor != w:
                    fout.write(w + '\t' + wcor + '\t' + '\t'.join(
                        [';'.join([str(ee) for ee in e]) for e in cands]) + '\n')

    w2spel = dict()
    with open(cache_path, encoding='utf-8') as fin:
        for line in fin:
            # Drop the line terminator first: otherwise it ends up inside the
            # last field, and a line without candidates ("w\twcor\t\n") would
            # yield a bogus "\n" candidate and crash with IndexError.
            line = line.rstrip('\n')
            if not line:
                continue
            tokens = line.split('\t')
            w = tokens[0]
            wcor = tokens[1]
            cands = []
            for e in tokens[2:]:
                if not e:
                    # Empty field left by the trailing tab of a
                    # candidate-less entry.
                    continue
                arr = e.split(';')
                cands.append([arr[0], int(arr[1]), int(arr[2])])
            w2spel[w] = [wcor, cands]
    return w2spel
def calcMyScoreQP(self, ssq, ssp, all_questions2mystem, all_paragraphs2mystem, filter_pos={'S', 'A'}, agg_type='min', matrixReturn=False):
    """Score a question against a paragraph with ``self.w2v`` similarities.

    The words of both texts are obtained with ``getMystemText`` (restricted
    to ``filter_pos``), de-duplicated with ``uniq_words``, sorted, and then
    reduced to the words contained in ``self.w2v``.  Each matrix cell is
    ``1`` for identical words and ``1 - cosine_distance`` of the two
    ``self.w2v`` vectors otherwise.

    Args:
        ssq: Question text, forwarded to ``getMystemText``.
        ssp: Paragraph text, forwarded to ``getMystemText``.
        all_questions2mystem: Forwarded to ``getMystemText`` for ``ssq``.
        all_paragraphs2mystem: Forwarded to ``getMystemText`` for ``ssp``.
        filter_pos: Forwarded to ``getMystemText`` as ``filter_pos``.
            The default set is shared between calls and is never mutated
            here.
        agg_type: ``'min'`` (minimum of the matrix), ``'minimax'`` (minimum
            over question words of the row maximum) or ``'max'`` (maximum of
            the matrix).
        matrixReturn: Only the literal ``False`` selects the scalar return.

    Returns:
        ``0`` when either side has no word in ``self.w2v`` (regardless of
        ``matrixReturn``).  Otherwise the score, or, when ``matrixReturn``
        is not ``False``, the tuple ``(score, a, ttp, ttq)`` where rows of
        ``a`` follow ``ttq`` and columns follow ``ttp``.

    Raises:
        ValueError: If ``agg_type`` is not supported.
    """
    ttq = sorted(uniq_words(getMystemText(ssq, all_questions2mystem, filter_pos=filter_pos)))
    ttp = sorted(uniq_words(getMystemText(ssp, all_paragraphs2mystem, filter_pos=filter_pos)))

    # Only words that have a vector can take part in the comparison.
    ttq = [e for e in ttq if e in self.w2v]
    ttp = [e for e in ttp if e in self.w2v]
    if not ttq or not ttp:
        return 0

    # Fail before the pairwise cosine loop, and with a readable message; a
    # bare ``raise`` would give an opaque RuntimeError or re-raise whatever
    # exception a caller is currently handling.
    if agg_type not in ('min', 'minimax', 'max'):
        raise ValueError('unsupported agg_type: %r' % (agg_type,))

    twq = [self.w2v[e] for e in ttq]
    twp = [self.w2v[e] for e in ttp]

    a = np.zeros((len(ttq), len(ttp)))
    for i1, t1 in enumerate(ttq):
        for i2, t2 in enumerate(ttp):
            if t1 == t2:
                a[i1, i2] = 1
            else:
                a[i1, i2] = 1 - sp.spatial.distance.cosine(twq[i1], twp[i2])

    if agg_type == 'min':
        score = np.min(a)
    elif agg_type == 'minimax':
        score = np.min(np.max(a, axis=1))
    else:  # 'max'
        score = np.max(a)

    if matrixReturn is False:
        return score
    return score, a, ttp, ttq
def calcMyScore(ss, w2z, all_questions2mystem, filter_pos={'S', 'A', 'V'}, agg_type='min', matrixReturn=False):
    """Score how strongly the words of one text are tied together via ``w2z``.

    The words are obtained with ``getMystemText`` (restricted to
    ``filter_pos``), de-duplicated with ``uniq_words`` and sorted.  A
    symmetric matrix is filled with, for every pair of distinct words whose
    ``w2z`` sets are ``set1`` and ``set2`` (an empty set for unknown words)::

        sqrt((|set1 & set2| + 0.1)**2 / ((|set1| + 0.2) * (|set2| + 0.2)))

    The diagonal is ``1``; for ``agg_type='minimax'`` it is reset to ``0``
    before aggregation so a word cannot be its own best match.

    Args:
        ss: Text, forwarded to ``getMystemText``.
        w2z: Mapping from a word to a set; sets are intersected pairwise.
        all_questions2mystem: Forwarded to ``getMystemText``.
        filter_pos: Forwarded to ``getMystemText`` as ``filter_pos``.
            The default set is shared between calls and is never mutated
            here.
        agg_type: ``'min'`` (minimum of the matrix) or ``'minimax'``
            (minimum over words of the row maximum, diagonal excluded).
        matrixReturn: Only the literal ``False`` selects the scalar return.

    Returns:
        ``0`` when the text yields no words (regardless of ``matrixReturn``).
        Otherwise the score, or, when ``matrixReturn`` is not ``False``,
        the tuple ``(score, a, tt)``.

    Raises:
        ValueError: If ``agg_type`` is not supported.
    """
    tt = sorted(uniq_words(getMystemText(ss, all_questions2mystem, filter_pos=filter_pos)))
    if not tt:
        return 0

    # Fail before building the matrix, and with a readable message; a bare
    # ``raise`` would give an opaque RuntimeError or re-raise whatever
    # exception a caller is currently handling.
    if agg_type not in ('min', 'minimax'):
        raise ValueError('unsupported agg_type: %r' % (agg_type,))

    # Look every word up once instead of once per pair.
    sets = [w2z.get(t, set()) for t in tt]
    size = len(tt)

    a = np.zeros((size, size))
    for i1 in range(size):
        a[i1, i1] = 1
        set1 = sets[i1]
        # The matrix is symmetric: compute the upper triangle and mirror it.
        for i2 in range(i1 + 1, size):
            set2 = sets[i2]
            a[i1, i2] = ((len(set1 & set2) + 0.1)**2) / ((len(set1) + 0.2) * (len(set2) + 0.2))
            a[i1, i2] = a[i1, i2]**0.5
            a[i2, i1] = a[i1, i2]

    if agg_type == 'min':
        score = np.min(a)
    else:  # 'minimax'
        for i in range(size):
            a[i, i] = 0
        score = np.min(np.max(a, axis=1))

    if matrixReturn is False:
        return score
    return score, a, tt
def calcW2vScore(self, ss, all_questions2mystem, filter_pos={'S', 'A'}, agg_type='max', matrixReturn=False):
    """Score the pairwise ``self.w2v`` similarity of the words of one text.

    The words are obtained with ``getMystemText`` (restricted to
    ``filter_pos``), de-duplicated with ``uniq_words``, sorted, and reduced
    to the words contained in ``self.w2v``.  A symmetric matrix is filled
    with ``1 - cosine_distance`` of the two word vectors for every pair of
    distinct words.  The diagonal is ``0``, except for ``agg_type='min'``
    where it is set to ``1`` so it cannot win the minimum.

    Args:
        ss: Text, forwarded to ``getMystemText``.
        all_questions2mystem: Forwarded to ``getMystemText``.
        filter_pos: Forwarded to ``getMystemText`` as ``filter_pos``.
            The default set is shared between calls and is never mutated
            here.
        agg_type: ``'min'`` (minimum of the matrix), ``'minimax'`` (minimum
            over words of the row maximum) or ``'max'`` (maximum of the
            matrix).
        matrixReturn: Only the literal ``False`` selects the scalar return.

    Returns:
        ``0`` when no word of the text is in ``self.w2v`` (regardless of
        ``matrixReturn``).  Otherwise the score, or, when ``matrixReturn``
        is not ``False``, the tuple ``(score, a, tt)``.

    Raises:
        ValueError: If ``agg_type`` is not supported.
    """
    tt = sorted(uniq_words(getMystemText(ss, all_questions2mystem, filter_pos=filter_pos)))
    tt = [e for e in tt if e in self.w2v]
    if not tt:
        return 0

    # Fail before the pairwise cosine loop, and with a readable message; a
    # bare ``raise`` would give an opaque RuntimeError or re-raise whatever
    # exception a caller is currently handling.
    if agg_type not in ('min', 'minimax', 'max'):
        raise ValueError('unsupported agg_type: %r' % (agg_type,))

    # Fetch every vector once instead of once per pair.
    vectors = [self.w2v[t] for t in tt]
    size = len(tt)

    a = np.zeros((size, size))
    for i1 in range(size):
        # The matrix is symmetric: compute the upper triangle and mirror it.
        for i2 in range(i1 + 1, size):
            a[i1, i2] = 1 - sp.spatial.distance.cosine(vectors[i1], vectors[i2])
            a[i2, i1] = a[i1, i2]

    if agg_type == 'min':
        for i in range(size):
            a[i, i] = 1
        score = np.min(a)
    elif agg_type == 'minimax':
        score = np.min(np.max(a, axis=1))
    else:  # 'max'
        score = np.max(a)

    if matrixReturn is False:
        return score
    return score, a, tt
def calcMyScore2(ss, w2z, all_questions2mystem, filter_pos={'S'}, agg_type='min', w2z_sum_idfs=None, idfs_words=None, join_type='mul', matrixReturn=False):
    """Score how strongly the words of one text are tied together via ``w2z``.

    The words are obtained with ``getMystemText`` (restricted to
    ``filter_pos``), de-duplicated with ``uniq_words`` and sorted.  Each word
    is represented by the key set of its ``w2z`` entry (empty for unknown
    words).  For every pair of distinct words:

    * without ``idfs_words``: ``n1``/``n2`` are the key-set sizes and ``n12``
      the size of their intersection;
    * with ``idfs_words``: ``n1``/``n2`` come from ``w2z_sum_idfs`` (``0.0``
      if missing) and ``n12`` is the sum of ``idfs_words`` values over the
      shared keys (``0.0`` for keys missing from ``idfs_words``).

    From these the smoothed ratios are::

        p1  = (n12 + 0.1) / (n1 + 0.2)
        p2  = (n12 + 0.1) / (n2 + 0.2)
        p12 = (n12 + 0.1) / (n1 + n2 - n12 + 0.2)

    and the symmetric matrix cell is the square root of

    * ``p1 * p2``      for ``join_type='mul'`` and ``agg_type='min'``,
    * ``p12``          for ``join_type='mul'`` and ``agg_type='minimax'``,
    * ``max(p1, p2)``  for ``join_type='max'``.

    The diagonal is ``1``; for ``agg_type='minimax'`` it is reset to ``0``
    before aggregation so a word cannot be its own best match.

    Args:
        ss: Text, forwarded to ``getMystemText``.
        w2z: Mapping from a word to a mapping; only the inner keys are used.
        all_questions2mystem: Forwarded to ``getMystemText``.
        filter_pos: Forwarded to ``getMystemText`` as ``filter_pos``.
            The default set is shared between calls and is never mutated
            here.
        agg_type: ``'min'`` (minimum of the matrix) or ``'minimax'``
            (minimum over words of the row maximum, diagonal excluded).
        w2z_sum_idfs: Mapping from a word to its weight; it is read with
            ``.get`` whenever ``idfs_words`` is given and the text has at
            least two words, so it must be a mapping in that case.
        idfs_words: Optional mapping from a ``w2z`` inner key to its weight;
            ``None`` selects plain counting.
        join_type: ``'mul'`` or ``'max'``, see above.
        matrixReturn: Only the literal ``False`` selects the scalar return.

    Returns:
        ``0`` when the text yields no words (regardless of ``matrixReturn``).
        Otherwise the score, or, when ``matrixReturn`` is not ``False``,
        the tuple ``(score, a, tt)``.

    Raises:
        ValueError: If ``agg_type`` or ``join_type`` is not supported.
    """
    tt = sorted(uniq_words(getMystemText(ss, all_questions2mystem, filter_pos=filter_pos)))
    if not tt:
        return 0

    # Validate up front so that a bad option fails fast with a readable
    # message, also for single-word texts where the pair loop never runs.  A
    # bare ``raise`` would give an opaque RuntimeError or re-raise whatever
    # exception a caller is currently handling.
    if join_type not in ('mul', 'max'):
        raise ValueError('unsupported join_type: %r' % (join_type,))
    if agg_type not in ('min', 'minimax'):
        raise ValueError('unsupported agg_type: %r' % (agg_type,))

    sets = [set(w2z.get(t, dict()).keys()) for t in tt]
    size = len(tt)

    a = np.zeros((size, size))
    for i1 in range(size):
        a[i1, i1] = 1
        set1 = sets[i1]
        # The matrix is symmetric: compute the upper triangle and mirror it.
        for i2 in range(i1 + 1, size):
            set2 = sets[i2]
            shared = set1 & set2
            if idfs_words is None:
                n1 = len(set1)
                n2 = len(set2)
                n12 = len(shared)
            else:
                n1 = w2z_sum_idfs.get(tt[i1], 0.0)
                n2 = w2z_sum_idfs.get(tt[i2], 0.0)
                # Plain left-to-right accumulation, kept as an explicit loop
                # so the floating-point result does not depend on how the
                # built-in sum() adds floats.
                n12 = 0.0
                for key in shared:
                    n12 += idfs_words.get(key, 0.0)
            p1 = (n12 + 0.1) / (n1 + 0.2)
            p2 = (n12 + 0.1) / (n2 + 0.2)
            p12 = (n12 + 0.1) / (n1 + n2 - n12 + 0.2)
            if join_type == 'max':
                a[i1, i2] = max(p1, p2)
            elif agg_type == 'min':
                a[i1, i2] = p1 * p2
            else:  # join_type == 'mul' and agg_type == 'minimax'
                a[i1, i2] = p12
            a[i1, i2] = a[i1, i2]**0.5
            a[i2, i1] = a[i1, i2]

    if agg_type == 'min':
        score = np.min(a)
    else:  # 'minimax'
        for i in range(size):
            a[i, i] = 0
        score = np.min(np.max(a, axis=1))

    if matrixReturn is False:
        return score
    return score, a, tt