Esempio n. 1
0
def cut_query(fn, fn_out):
    """Rewrite a JSON-lines file, replacing each record's query by its cut form.

    Every line of ``fn`` is parsed as a JSON object.  Its ``'query'`` value is
    passed to ``text_cutter.process`` under the ``'title'`` key, the returned
    ``'cut_title'`` is stored as ``'cut_query'``, the ``'query'`` key is
    removed, and the record is written to ``fn_out`` as one UTF-8 encoded
    JSON line.

    :param fn: path of the input file, one JSON object per line.
    :param fn_out: path of the output file; opened in ``'w'`` mode.
    """
    with open(fn_out, 'w') as fout, open(fn) as fin:
        for raw_line in fin:
            record = json.loads(raw_line, encoding='utf8')
            # The cutter is handed UTF-8 bytes and its result is decoded back
            # to unicode before being stored on the record.
            query_bytes = record['query'].encode('utf8')
            cut_result = text_cutter.process({'title': query_bytes})
            record['cut_query'] = cut_result['cut_title'].decode('utf8')
            del record['query']
            serialized = json.dumps(record, ensure_ascii=False)
            fout.write(serialized.encode('utf8') + '\n')
Esempio n. 2
0
    def get_k_nearest_title(self, title, k):
        """Look up the indexed titles closest to ``title``.

        The title is cut with ``text_cutter.process``, embedded through
        ``self.model.get_title_vec``, scaled to unit L2 norm and used to query
        ``self.title_ann`` for ``k`` neighbours.

        :param title: title text; a ``unicode`` value is encoded to UTF-8
            before cutting.
        :param k: number of neighbours requested from the index.
        :return: list of ``(title, distance)`` pairs sorted by ascending
            distance, or ``[]`` when the model reports no valid title.
        """
        raw_title = title.encode('utf8') if isinstance(title, unicode) else title

        cut_result = text_cutter.process({'title': raw_title})
        cut_title = cut_result['cut_title'].decode('utf8')
        vecs, valid_titles = self.model.get_title_vec([cut_title])
        if len(valid_titles) == 0:
            return []

        # Divide each row by its L2 norm before querying the index.
        row_norms = np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
        query_vec = list(vecs / row_norms)[0]
        indices, distances = self.title_ann.get_nns_by_vector(
            query_vec, n=k, include_distances=True)
        matched_titles = [self.titles[idx] for idx in indices]
        pairs = zip(matched_titles, distances)
        return sorted(pairs, key=lambda pair: pair[-1])
Esempio n. 3
0
def get_news_from_gid_file(fn_id, fn_out):
    """Write id, title and cut title for every group id listed in a file.

    Each non-blank line of ``fn_id`` is parsed as an integer group id.  For
    every id the ``ss_article_group`` table is queried through
    ``groupdb_dal``; rows that are missing or have an empty/NULL title are
    skipped.  The title is passed to ``text_cutter.process`` and the returned
    ``'cut_title'`` is added to the row, which is then written to ``fn_out``
    as one UTF-8 encoded JSON line.

    :param fn_id: path of a text file with one integer group id per line.
    :param fn_out: path of the output file; opened in ``'w'`` mode.
    :raises ValueError: if a non-blank line of ``fn_id`` is not an integer.
    """
    gids = []
    with open(fn_id) as fin:
        for line in fin:
            line = line.strip()
            # Blank lines would otherwise abort the whole run in int('').
            if not line:
                continue
            gids.append(int(line))

    sql = "select id, title from ss_article_group where id=%s;"
    with open(fn_out, 'w') as fout:
        for gid in gids:
            groupdb_dal.execute(sql, gid)
            row = groupdb_dal.cursor.fetchone()
            # No such group, NULL title or empty title: nothing to cut.
            if row is None or not row['title']:
                continue
            cut_data = text_cutter.process(
                {'title': row['title'].encode('utf8')})
            row['cut_title'] = cut_data['cut_title'].decode('utf8')
            serialized = json.dumps(row, ensure_ascii=False)
            fout.write(serialized.encode('utf8') + '\n')
Esempio n. 4
0
def get_title_for_query_file(fn_combine, fn_exist_titles_list, fn_out):
    """Fetch and cut titles referenced by a query file that are not known yet.

    Ids already present in the files of ``fn_exist_titles_list`` are
    collected first (first tab-separated field of each line, parsed as an
    integer; unparsable lines are printed and skipped).  Then every line of
    ``fn_combine`` is parsed as JSON and each ``(id, _)`` pair in its
    ``'title'`` list whose id has not been seen is looked up in
    ``ss_article_group``.  Rows with a non-empty title are cut with
    ``text_cutter.process`` and written to ``fn_out`` as
    ``"<id>\\t<cut title>"`` lines, UTF-8 encoded, with CR/LF characters
    removed from the cut title.

    :param fn_combine: path of a JSON-lines file whose records carry a
        ``'title'`` list of two-element entries, the first being a group id.
    :param fn_exist_titles_list: iterable of paths to tab-separated files
        whose first column is a group id.
    :param fn_out: path of the output file; opened in ``'w'`` mode.
    """
    gids = set()
    for fn_title in fn_exist_titles_list:
        with open(fn_title) as fin:
            for line in fin:
                try:
                    gids.add(int(line.split('\t')[0]))
                except ValueError as e:
                    print(e)
                    print(line.strip())

    sql = "select id, title from ss_article_group where id=%s;"
    with open(fn_out, 'w') as fout, open(fn_combine) as fin:
        for lno, line in enumerate(fin):
            if lno % 1000 == 0:
                sys.stdout.write('process to %d\r' % lno)
                sys.stdout.flush()
            data = json.loads(line, encoding='utf8')
            for c, _ in data['title']:
                if c in gids:
                    continue
                # Mark the id as seen before the lookup so that ids without a
                # usable title are not queried again on later occurrences.
                gids.add(c)
                groupdb_dal.execute(sql, c)
                row = groupdb_dal.cursor.fetchone()
                # No such group, NULL title or empty title: nothing to cut.
                if row is None or not row['title']:
                    continue
                cut_data = text_cutter.process(
                    {'title': row['title'].encode('utf8')})
                cut_title = cut_data['cut_title'].decode('utf8')
                # Keep each record on a single output line.
                cut_title = cut_title.replace('\n', '').replace('\r', '')
                fout.write(("%s\t%s" % (c, cut_title)).encode('utf8') + '\n')
Esempio n. 5
0
def get_title_for_query_file(fn_combine, fn_exist_titles_list, fn_out):
    """Fetch and cut titles referenced by a query file that are not known yet.

    Ids already present in the files of ``fn_exist_titles_list`` are
    collected first (the ``'id'`` field of each JSON line; lines that cannot
    be parsed or lack an ``'id'`` are printed and skipped).  Then every line
    of ``fn_combine`` is parsed as JSON and each ``(id, _)`` pair in its
    ``'title'`` list whose id has not been seen is looked up in
    ``ss_article_group``.  Rows with a non-empty title are cut with
    ``text_cutter.process``; the cut title, stripped of CR/LF characters, is
    added to the row as ``'cut_title'`` and the row is written to ``fn_out``
    as one UTF-8 encoded JSON line.

    :param fn_combine: path of a JSON-lines file whose records carry a
        ``'title'`` list of two-element entries, the first being a group id.
    :param fn_exist_titles_list: iterable of paths to JSON-lines files whose
        records carry an ``'id'`` field.
    :param fn_out: path of the output file; opened in ``'w'`` mode.
    """
    gids = set()
    for fn_title in fn_exist_titles_list:
        with open(fn_title) as fin:
            for line in fin:
                try:
                    gids.add(json.loads(line, encoding='utf8')['id'])
                except (ValueError, KeyError, TypeError) as e:
                    # Malformed JSON, a record without 'id', or a non-object
                    # record: report the line and carry on.
                    print(e)
                    print(line.strip())

    sql = "select id, title from ss_article_group where id=%s;"
    with open(fn_out, 'w') as fout, open(fn_combine) as fin:
        for lno, line in enumerate(fin):
            if lno % 1000 == 0:
                sys.stdout.write('process to %d\r' % lno)
                sys.stdout.flush()
            data = json.loads(line, encoding='utf8')
            for c, _ in data['title']:
                if c in gids:
                    continue
                # Mark the id as seen before the lookup so that ids without a
                # usable title are not queried again on later occurrences.
                gids.add(c)
                groupdb_dal.execute(sql, c)
                row = groupdb_dal.cursor.fetchone()
                # No such group, NULL title or empty title: nothing to cut.
                if row is None or not row['title']:
                    continue
                cut_data = text_cutter.process(
                    {'title': row['title'].encode('utf8')})
                cut_title = cut_data['cut_title'].decode('utf8')
                # Keep the cut title free of line breaks.
                row['cut_title'] = cut_title.replace('\n', '').replace('\r', '')
                serialized = json.dumps(row, ensure_ascii=False)
                fout.write(serialized.encode('utf8') + '\n')
Esempio n. 6
0
    def get_answers(self, query, k):
        """Return the model's top ``k`` ranked titles for ``query``.

        The query is cut with ``text_cutter.process`` and embedded through
        ``self.model.get_query_vec``.  The unit-normalised vector is used to
        pull ``k * 10`` candidates from ``self.title_ann``, which are then
        re-ranked by ``self.model.rank_titles``.

        :param query: query text; a ``unicode`` value is encoded to UTF-8
            before cutting.
        :param k: number of ranked results to keep.
        :return: the first ``k`` items of ``self.model.rank_titles``'s result,
            or ``[]`` when the model reports no valid query.
        """
        raw_query = query.encode('utf8') if isinstance(query, unicode) else query

        cut_result = text_cutter.process({'title': raw_query})
        cut_query = cut_result['cut_title'].decode('utf8')
        vecs, valid_queries = self.model.get_query_vec([cut_query])
        if len(valid_queries) == 0:
            return []

        # Divide each row by its L2 norm before querying the index.
        row_norms = np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
        query_vec = list(vecs / row_norms)[0]

        # Recall stage: over-fetch candidates from the nearest-neighbour index
        # (cosine similarity, per the original author's note).
        candidate_indices, _ = self.title_ann.get_nns_by_vector(
            query_vec, n=k * 10, include_distances=True)
        candidate_titles = [self.titles[idx] for idx in candidate_indices]

        # Ranking stage: let the model order the candidates, keep the top k.
        ranked = self.model.rank_titles(cut_query, candidate_titles)
        return ranked[:k]