Exemple #1
0
def calcMyScoreQP(sq,
                  sp,
                  w2z,
                  all_questions2mystem,
                  all_paragraphs2mystem,
                  filter_pos={'S', 'A', 'V'},
                  agg_type='min',
                  join_type='mul',
                  matrixReturn=False):
    """Score how well the words of a question match the words of a paragraph.

    The sorted unique words of ``getMystemText(sq, all_questions2mystem)``
    (rows) are compared with the sorted unique words of
    ``getMystemText(sp, all_paragraphs2mystem)`` (columns). For every word
    pair the sets ``w2z[word]`` are compared with smoothed overlap ratios::

        p1  = (n12 + 0.1) / (n1 + 0.2)
        p2  = (n12 + 0.1) / (n2 + 0.2)
        p12 = (n12 + 0.1) / (n1 + n2 - n12 + 0.2)

    where ``n1``/``n2`` are the set sizes and ``n12`` the intersection size.
    The cell value is the square root of ``p1 * p2`` (``join_type='mul'``,
    ``agg_type='min'``), of ``p12`` (``join_type='mul'``,
    ``agg_type='minimax'``) or of ``max(p1, p2)`` (``join_type='max'``).

    Args:
        sq: Question key/text passed through to ``getMystemText``.
        sp: Paragraph key/text passed through to ``getMystemText``.
        w2z: Mapping word -> set; words missing from it count as an empty set.
        all_questions2mystem: Lookup handed to ``getMystemText`` for ``sq``.
        all_paragraphs2mystem: Lookup handed to ``getMystemText`` for ``sp``.
        filter_pos: Forwarded to ``getMystemText`` (presumably the
            part-of-speech tags to keep -- confirm against that helper).
        agg_type: ``'min'`` -> smallest cell; ``'minimax'`` -> smallest of the
            per-row maxima.
        join_type: ``'mul'`` or ``'max'``, see above.
        matrixReturn: When exactly ``False`` only the score is returned.

    Returns:
        ``0`` when either word list is empty (regardless of
        ``matrixReturn``); otherwise ``score`` or, when ``matrixReturn`` is
        not ``False``, the tuple ``(score, a, ttp, ttq)``.

    Raises:
        ValueError: Unknown ``agg_type`` or ``join_type``.
    """
    ttq = sorted(
        uniq_words(
            getMystemText(sq, all_questions2mystem, filter_pos=filter_pos)))
    ttp = sorted(
        uniq_words(
            getMystemText(sp, all_paragraphs2mystem, filter_pos=filter_pos)))

    # An empty side would give a zero-sized matrix on which np.min raises;
    # treat it as "no match", same as the empty-question case.
    if not ttq or not ttp:
        return 0

    # Fail early with a descriptive error instead of a bare ``raise``.
    if agg_type not in ('min', 'minimax'):
        raise ValueError('unknown agg_type: %r' % (agg_type,))
    if join_type not in ('mul', 'max'):
        raise ValueError('unknown join_type: %r' % (join_type,))

    # Look every word up once instead of once per matrix cell.
    q_sets = [w2z.get(word, set()) for word in ttq]
    p_sets = [w2z.get(word, set()) for word in ttp]

    a = np.zeros((len(ttq), len(ttp)))
    for i1, set1 in enumerate(q_sets):
        n1 = len(set1)
        for i2, set2 in enumerate(p_sets):
            n2 = len(set2)
            n12 = len(set1 & set2)
            p1 = (n12 + 0.1) / (n1 + 0.2)
            p2 = (n12 + 0.1) / (n2 + 0.2)
            if join_type == 'max':
                value = max(p1, p2)
            elif agg_type == 'min':
                value = p1 * p2
            else:  # join_type == 'mul' and agg_type == 'minimax'
                value = (n12 + 0.1) / (n1 + n2 - n12 + 0.2)
            a[i1, i2] = value**0.5

    if agg_type == 'min':
        score = np.min(a)
    else:
        # Best paragraph match per question word, then the weakest of those.
        score = np.min(np.max(a, axis=1))

    if matrixReturn is False:
        return score
    return score, a, ttp, ttq
Exemple #2
0
def calcW2Spell(spel_cor, questions):
    """Build (if missing) and load the on-disk spelling-correction table.

    If ``data/spell/w2spell.txt`` does not exist, every unique word of every
    question is passed to ``spel_cor.correct`` and the words whose correction
    differs from the original are written to that file, one per line::

        word<TAB>correction<TAB>cand;int;int<TAB>cand;int;int...

    The file is then (re)read and returned, so the result only ever contains
    words that were actually corrected.

    Args:
        spel_cor: Object with ``correct(word) -> (corrected, candidates)``;
            only used when the cache file has to be built.
        questions: Sized iterable of question texts; only used when the cache
            file has to be built.

    Returns:
        dict mapping ``word -> [corrected_word, candidates]`` where each
        candidate is ``[text, int, int]``.
    """
    cache_dir = 'data/spell'
    cache_path = 'data/spell/w2spell.txt'

    # makedirs also creates a missing 'data' parent and tolerates an
    # already existing directory (os.mkdir did neither).
    os.makedirs(cache_dir, exist_ok=True)

    if not os.path.exists(cache_path):
        w2spel = dict()
        for q in tqdm.tqdm_notebook(questions, total=len(questions)):
            for w in uniq_words(q):
                if w in w2spel:
                    continue
                wcor, cands = spel_cor.correct(w)
                w2spel[w] = [wcor, cands]
                if wcor != w:
                    # Progress output: show at most five candidates.
                    print(w, wcor, cands[:5])

        with open(cache_path, 'w', encoding='utf-8') as fout:
            for w in sorted(w2spel):
                wcor, cands = w2spel[w]
                if wcor != w:
                    fout.write(w + '\t' + wcor + '\t' + '\t'.join(
                        ';'.join(str(ee) for ee in e) for e in cands) + '\n')

    w2spel = dict()
    with open(cache_path, encoding='utf-8') as fin:
        for line in fin:
            # Drop the line terminator first: otherwise it stays glued to
            # the last field, and a line without candidates yields a bogus
            # '\n' candidate that crashes the parser below.
            tokens = line.rstrip('\n').split('\t')
            if len(tokens) < 2:
                continue  # blank or malformed line
            w = tokens[0]
            wcor = tokens[1]
            cands = []
            for e in tokens[2:]:
                if not e:
                    continue  # trailing tab written for an empty candidate list
                arr = e.split(';')
                cands.append([arr[0], int(arr[1]), int(arr[2])])
            w2spel[w] = [wcor, cands]
    return w2spel
Exemple #3
0
    def calcMyScoreQP(self,
                      ssq,
                      ssp,
                      all_questions2mystem,
                      all_paragraphs2mystem,
                      filter_pos={'S', 'A'},
                      agg_type='min',
                      matrixReturn=False):
        """Score a question against a paragraph using ``self.w2v`` vectors.

        The sorted unique words of ``getMystemText`` for the question (rows)
        and for the paragraph (columns) are restricted to words present in
        ``self.w2v``. Each cell is ``1`` for identical words and otherwise
        ``1 - cosine_distance`` of the two word vectors.

        Args:
            ssq: Question key/text passed through to ``getMystemText``.
            ssp: Paragraph key/text passed through to ``getMystemText``.
            all_questions2mystem: Lookup handed to ``getMystemText`` for
                ``ssq``.
            all_paragraphs2mystem: Lookup handed to ``getMystemText`` for
                ``ssp``.
            filter_pos: Forwarded to ``getMystemText`` (presumably the
                part-of-speech tags to keep -- confirm against that helper).
            agg_type: ``'min'`` -> smallest cell, ``'max'`` -> largest cell,
                ``'minimax'`` -> smallest of the per-row maxima.
            matrixReturn: When exactly ``False`` only the score is returned.

        Returns:
            ``0`` when either filtered word list is empty (regardless of
            ``matrixReturn``); otherwise ``score`` or, when ``matrixReturn``
            is not ``False``, the tuple ``(score, a, ttp, ttq)``.

        Raises:
            ValueError: Unknown ``agg_type``.
        """
        q_words = sorted(
            uniq_words(
                getMystemText(ssq,
                              all_questions2mystem,
                              filter_pos=filter_pos)))
        p_words = sorted(
            uniq_words(
                getMystemText(ssp,
                              all_paragraphs2mystem,
                              filter_pos=filter_pos)))
        # Only words with a known vector can take part in the comparison.
        ttq = [e for e in q_words if e in self.w2v]
        ttp = [e for e in p_words if e in self.w2v]
        if not ttq or not ttp:
            return 0

        # Validate before the (comparatively expensive) cosine computations,
        # and with a descriptive error instead of a bare ``raise``.
        if agg_type not in ('min', 'minimax', 'max'):
            raise ValueError('unknown agg_type: %r' % (agg_type,))

        twq = [self.w2v[e] for e in ttq]
        twp = [self.w2v[e] for e in ttp]
        a = np.zeros((len(ttq), len(ttp)))
        for i1, t1 in enumerate(ttq):
            for i2, t2 in enumerate(ttp):
                if t1 == t2:
                    # Identical words are a perfect match by definition.
                    a[i1, i2] = 1
                else:
                    a[i1, i2] = 1 - sp.spatial.distance.cosine(
                        twq[i1], twp[i2])

        if agg_type == 'min':
            score = np.min(a)
        elif agg_type == 'minimax':
            # Best paragraph match per question word, then the weakest one.
            score = np.min(np.max(a, axis=1))
        else:
            score = np.max(a)

        if matrixReturn is False:
            return score
        return score, a, ttp, ttq
Exemple #4
0
def calcMyScore(ss,
                w2z,
                all_questions2mystem,
                filter_pos={'S', 'A', 'V'},
                agg_type='min',
                matrixReturn=False):
    """Score how strongly the words of one text are related to each other.

    Builds a symmetric matrix over the sorted unique words of
    ``getMystemText(ss, all_questions2mystem)``. For two different words the
    cell is::

        sqrt((n12 + 0.1)**2 / ((n1 + 0.2) * (n2 + 0.2)))

    with ``n1``/``n2`` the sizes of ``w2z[word]`` and ``n12`` the size of
    their intersection. The diagonal is ``1``.

    Args:
        ss: Text key/text passed through to ``getMystemText``.
        w2z: Mapping word -> set; words missing from it count as an empty set.
        all_questions2mystem: Lookup handed to ``getMystemText``.
        filter_pos: Forwarded to ``getMystemText`` (presumably the
            part-of-speech tags to keep -- confirm against that helper).
        agg_type: ``'min'`` -> smallest cell; ``'minimax'`` -> the diagonal is
            zeroed (also in the returned matrix) and the smallest of the
            per-row maxima is taken.
        matrixReturn: When exactly ``False`` only the score is returned.

    Returns:
        ``0`` when there are no words (regardless of ``matrixReturn``);
        otherwise ``score`` or, when ``matrixReturn`` is not ``False``, the
        tuple ``(score, a, tt)``.

    Raises:
        ValueError: Unknown ``agg_type``.
    """
    tt = sorted(
        uniq_words(
            getMystemText(ss, all_questions2mystem, filter_pos=filter_pos)))
    if not tt:
        return 0

    # Descriptive error instead of a bare ``raise``, raised before any work.
    if agg_type not in ('min', 'minimax'):
        raise ValueError('unknown agg_type: %r' % (agg_type,))

    # Look every word up once instead of once per matrix cell.
    sets = [w2z.get(word, set()) for word in tt]
    size = len(tt)
    a = np.zeros((size, size))
    for i1 in range(size):
        a[i1, i1] = 1
        set1 = sets[i1]
        # Only the upper triangle is computed; the value is mirrored.
        for i2 in range(i1 + 1, size):
            set2 = sets[i2]
            value = (((len(set1 & set2) + 0.1)**2) /
                     ((len(set1) + 0.2) * (len(set2) + 0.2)))**0.5
            a[i1, i2] = value
            a[i2, i1] = value

    if agg_type == 'min':
        score = np.min(a)
    else:
        # A word must not count as its own best match.
        np.fill_diagonal(a, 0)
        score = np.min(np.max(a, axis=1))

    if matrixReturn is False:
        return score
    return score, a, tt
Exemple #5
0
 def calcW2vScore(self,
                  ss,
                  all_questions2mystem,
                  filter_pos={'S', 'A'},
                  agg_type='max',
                  matrixReturn=False):
     """Score the mutual similarity of one text's words via ``self.w2v``.

     Builds a symmetric matrix over the sorted unique words of
     ``getMystemText(ss, all_questions2mystem)`` that are present in
     ``self.w2v``. Off-diagonal cells are ``1 - cosine_distance`` of the two
     word vectors; the diagonal is ``0`` except for ``agg_type='min'``,
     where it is set to ``1`` (also in the returned matrix).

     Args:
         ss: Text key/text passed through to ``getMystemText``.
         all_questions2mystem: Lookup handed to ``getMystemText``.
         filter_pos: Forwarded to ``getMystemText`` (presumably the
             part-of-speech tags to keep -- confirm against that helper).
         agg_type: ``'min'`` -> smallest cell, ``'max'`` -> largest cell,
             ``'minimax'`` -> smallest of the per-row maxima.
         matrixReturn: When exactly ``False`` only the score is returned.

     Returns:
         ``0`` when no word has a vector (regardless of ``matrixReturn``);
         otherwise ``score`` or, when ``matrixReturn`` is not ``False``, the
         tuple ``(score, a, tt)``.

     Raises:
         ValueError: Unknown ``agg_type``.
     """
     words = sorted(
         uniq_words(
             getMystemText(ss,
                           all_questions2mystem,
                           filter_pos=filter_pos)))
     # Only words with a known vector can take part in the comparison.
     tt = [e for e in words if e in self.w2v]
     if not tt:
         return 0

     # Validate before the cosine computations, and with a descriptive
     # error instead of a bare ``raise``.
     if agg_type not in ('min', 'minimax', 'max'):
         raise ValueError('unknown agg_type: %r' % (agg_type,))

     vectors = [self.w2v[e] for e in tt]
     size = len(tt)
     # The diagonal stays 0 so a word is never its own best match.
     a = np.zeros((size, size))
     for i1 in range(size):
         # Only the upper triangle is computed; the value is mirrored.
         for i2 in range(i1 + 1, size):
             a[i1, i2] = 1 - sp.spatial.distance.cosine(
                 vectors[i1], vectors[i2])
             a[i2, i1] = a[i1, i2]

     if agg_type == 'min':
         # Self-similarity must not drag the minimum down to 0.
         np.fill_diagonal(a, 1)
         score = np.min(a)
     elif agg_type == 'minimax':
         score = np.min(np.max(a, axis=1))
     else:
         score = np.max(a)

     if matrixReturn is False:
         return score
     return score, a, tt
Exemple #6
0
def calcMyScore2(ss,
                 w2z,
                 all_questions2mystem,
                 filter_pos={'S'},
                 agg_type='min',
                 w2z_sum_idfs=None,
                 idfs_words=None,
                 join_type='mul',
                 matrixReturn=False):
    """Score the mutual relatedness of one text's words, optionally IDF-weighted.

    Builds a symmetric matrix over the sorted unique words of
    ``getMystemText(ss, all_questions2mystem)``. Each word is represented by
    the key set of ``w2z[word]``. For two different words::

        p1  = (n12 + 0.1) / (n1 + 0.2)
        p2  = (n12 + 0.1) / (n2 + 0.2)
        p12 = (n12 + 0.1) / (n1 + n2 - n12 + 0.2)

    Without ``idfs_words``, ``n1``/``n2`` are the set sizes and ``n12`` the
    intersection size. With ``idfs_words``, ``n1``/``n2`` come from
    ``w2z_sum_idfs`` and ``n12`` is the sum of ``idfs_words`` over the
    intersection (missing entries count as ``0.0``). The cell is the square
    root of ``p1 * p2`` (``join_type='mul'``, ``agg_type='min'``), of ``p12``
    (``join_type='mul'``, ``agg_type='minimax'``) or of ``max(p1, p2)``
    (``join_type='max'``). The diagonal is ``1``.

    Args:
        ss: Text key/text passed through to ``getMystemText``.
        w2z: Mapping word -> dict; only the keys of the inner dict are used.
        all_questions2mystem: Lookup handed to ``getMystemText``.
        filter_pos: Forwarded to ``getMystemText`` (presumably the
            part-of-speech tags to keep -- confirm against that helper).
        agg_type: ``'min'`` -> smallest cell; ``'minimax'`` -> the diagonal is
            zeroed (also in the returned matrix) and the smallest of the
            per-row maxima is taken.
        w2z_sum_idfs: Mapping word -> total weight; read only when
            ``idfs_words`` is given.
            NOTE(review): it must not be ``None`` when ``idfs_words`` is set
            and the text has two or more words.
        idfs_words: Optional mapping key -> weight; enables weighted mode.
        join_type: ``'mul'`` or ``'max'``, see above.
        matrixReturn: When exactly ``False`` only the score is returned.

    Returns:
        ``0`` when there are no words (regardless of ``matrixReturn``);
        otherwise ``score`` or, when ``matrixReturn`` is not ``False``, the
        tuple ``(score, a, tt)``.

    Raises:
        ValueError: Unknown ``agg_type`` or ``join_type``.
    """
    tt = sorted(
        uniq_words(
            getMystemText(ss, all_questions2mystem, filter_pos=filter_pos)))
    if not tt:
        return 0

    # Descriptive errors instead of bare ``raise``, raised before any work.
    if agg_type not in ('min', 'minimax'):
        raise ValueError('unknown agg_type: %r' % (agg_type,))
    if join_type not in ('mul', 'max'):
        raise ValueError('unknown join_type: %r' % (join_type,))

    sets = [set(w2z.get(word, dict()).keys()) for word in tt]
    weighted = idfs_words is not None

    size = len(tt)
    a = np.zeros((size, size))
    for i1 in range(size):
        a[i1, i1] = 1
        set1 = sets[i1]
        # Only the upper triangle is computed; the value is mirrored.
        for i2 in range(i1 + 1, size):
            set2 = sets[i2]
            common = set1 & set2
            if weighted:
                n1 = w2z_sum_idfs.get(tt[i1], 0.0)
                n2 = w2z_sum_idfs.get(tt[i2], 0.0)
                n12 = 0.0
                for key in common:
                    n12 += idfs_words.get(key, 0.0)
            else:
                n1 = len(set1)
                n2 = len(set2)
                n12 = len(common)

            p1 = (n12 + 0.1) / (n1 + 0.2)
            p2 = (n12 + 0.1) / (n2 + 0.2)
            if join_type == 'max':
                value = max(p1, p2)
            elif agg_type == 'min':
                value = p1 * p2
            else:  # join_type == 'mul' and agg_type == 'minimax'
                value = (n12 + 0.1) / (n1 + n2 - n12 + 0.2)

            a[i1, i2] = value**0.5
            a[i2, i1] = a[i1, i2]

    if agg_type == 'min':
        score = np.min(a)
    else:
        # A word must not count as its own best match.
        np.fill_diagonal(a, 0)
        score = np.min(np.max(a, axis=1))

    if matrixReturn is False:
        return score
    return score, a, tt