Example #1
File: main.py Project: filip-halt/bootcamp
async def do_get_question_api(question: str):
    if not question:
        return {'status': False, 'msg': 'Please enter the query.'}
    conn = cursor = bc = None
    try:
        # user_id = 'qa_' + user_id
        conn = connect_postgres_server()
        cursor = conn.cursor()
        client = milvus_client()

        bc = BertClient(ip=BERT_HOST, port=BERT_PORT, check_length=False)

        output = get_similar_question(question, client, conn, cursor, bc)
        if output:
            return {'status': True, 'msg': output}
        return {
            'status': False,
            'msg': 'No similar questions in the database'
        }
    except Exception as e:
        print('search failed: ', e)
        return {
            'status': False,
            'msg': 'Failed to search, please try again.'
        }
    finally:
        # Close only what was actually created, so a failed connection attempt
        # does not raise a second error here.
        if cursor:
            cursor.close()
        if conn:
            conn.close()
        if bc:
            bc.close()

Example #2
class FeatureExtractor:
    """Uses Bert-as-a-Server to set up a BertClient and embed text in a Document.

        Attributes:
            document (Document): This object encompasses the extracted text from one of the
                PDF documents. There is an encoding field on each Line which is where the
                embedding from BERT will be included, and where the text that gets encoded will
                be provided.
            _bc (BertClient): Connection to the BertServer which can be used for encoding.

    """
    def __init__(self, document):
        self._document = document
        self._bc = BertClient()

    def encode(self):
        """ encodes the text in the Document object, and then adds it to the encoding attribute """
        text_lines = [line.text for line in self._document.lines]
        encodings = self._bc.encode(text_lines)
        for (line, encoding) in zip(self._document.lines, encodings):
            line.encoding = encoding
        return self._document

    def end(self):
        """ Closes the BertClient connection to BertServer """
        self._bc.close()
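
A minimal usage sketch for FeatureExtractor. The Document and Line classes are assumed to look roughly like the docstring describes; the field names below are illustrative, and a bert-serving-server must be running on the default local ports for BertClient() to connect.

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Line:
    text: str
    encoding: Optional[object] = None  # filled in by FeatureExtractor.encode()

@dataclass
class Document:
    lines: List[Line] = field(default_factory=list)

doc = Document(lines=[Line("First sentence."), Line("Second sentence.")])
extractor = FeatureExtractor(doc)
try:
    doc = extractor.encode()  # each line.encoding is now a 768-dim vector
finally:
    extractor.end()           # close the BertClient connection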
Example #3
File: main.py Project: filip-halt/bootcamp
async def do_load_api(file: UploadFile = File(...)):
    try:
        text = await file.read()
        fname = file.filename
        dirs = "QA_data/"
        if not os.path.exists(dirs):
            os.makedirs(dirs)
        fname_path = os.path.join(dirs, fname)
        with open(fname_path, 'wb') as f:
            f.write(text)
    except Exception:
        return {'status': False, 'msg': 'Failed to load data.'}
    conn = cursor = bc = None
    try:
        conn = connect_postgres_server()
        cursor = conn.cursor()
        client = milvus_client()
        bc = BertClient(ip=BERT_HOST, port=BERT_PORT, check_length=False)
        status, message = load_data(fname_path, client, conn, cursor, bc)
        return {'status': status, 'msg': message}
    except Exception as e:
        print("load data failed: ", e)
        return {'status': False, 'msg': 'Failed to load data.'}
    finally:
        # Close only what was actually created.
        if cursor:
            cursor.close()
        if conn:
            conn.close()
        if bc:
            bc.close()

Example #4
def save_emb():

    common = [
        '-model_dir',
        '/home/ydu/BERT/uncased_L-12_H-768_A-12/',
        '-num_worker',
        '2',
        '-port',
        '5555',
        '-port_out',
        '5556',
        '-max_seq_len',
        '128',
        '-max_batch_size',
        '256',
        # '-tuned_model_dir', '/home/ydu/BERT/bert_mgpu/pretrain_output/10k-32b-all4data/',
        # '-ckpt_name', 'model.ckpt-2500',
    ]

    args = get_args_parser().parse_args(common)

    # folder = ['books', 'dvd', 'electronics', 'kitchen']
    data_path = '/home/ydu/BERT/DATA/'
    data_folder = ['metacritic', 'imdb', 'amazon', 'reddit']

    # model_path = 'home/ydu/BERT/bert_mgpu/results/'
    # model_folder = 'amazon-balanced/'
    # model_type = 'bert-tune'
    data = {}

    # setattr(args, 'tuned_model_dir', '/home/ydu/BERT/bert_mgpu/pretrain_output/reddit-pretrain')
    # setattr(args, 'ckpt_name', 'model.ckpt-2500')
    setattr(args, 'tuned_model_dir',
            '/home/ydu/BERT/bert_mgpu/pretrain_output/10k-32b-all4data')
    setattr(args, 'ckpt_name', 'model.ckpt-2500')

    for d in data_folder:
        fn = data_path + d + '/all.tsv'
        print("===========", fn, "================")
        text = read_tsv(fn)
        server = BertServer(args)
        server.start()
        print('wait until server is ready...')
        time.sleep(20)
        print('encoding...')
        bc = BertClient()
        data[d] = bc.encode(text)
        bc.close()
        server.close()

    pickle_name = data_path + 'EMB/allpre_emb.pickle'
    with open(pickle_name, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return pickle_name
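
The pickle written by save_emb can be read back with the standard library. A short sketch; the path simply mirrors the one assembled above:

import pickle

with open('/home/ydu/BERT/DATA/EMB/allpre_emb.pickle', 'rb') as handle:
    data = pickle.load(handle)

# one entry per corpus folder, each a [num_sentences, 768] embedding matrix
for name, emb in data.items():
    print(name, emb.shape)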
Example #5
async def do_insert_api(data_path: str):
    conn = cursor = bc = None
    try:
        conn, cursor = init_conn()
        bc = BertClient(ip=BERT_HOST, port=BERT_PORT, check_length=False)
        status = do_insert(data_path, index_client, conn, cursor, bc)
        return "{0}".format(status)
    except Exception as e:
        return "{0}".format(e)
    finally:
        # Close only what was actually created.
        if cursor:
            cursor.close()
        if conn:
            conn.close()
        if bc:
            bc.close()
Example #6
class BertEmbedd:
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        self.connection = None

    def get_connection(self, inport=8000, outport=8010):
        '''
        Sets up a connection to the specified BERT server.
        :param inport: port for pushing data from the client to the server (defaults to 8000)
        :param outport: port for publishing results from the server to the client (defaults to 8010)
        :return: BertClient object connected to a BertServer
        '''
        try:
            self.connection = BertClient(port=inport, port_out=outport)
            self.logger.info("Connection to BERT server was successful.")
        except Exception as e:
            self.logger.error(f'Connection to BERT server failed: {str(e)}')

    def close_connection(self):
        if self.connection:
            self.connection.close()

    def get_encode(self, data, istokenized=True, isblocked=True):
        '''
        :param data: list of sentences (preprocessed; tokenized and re-joined when istokenized is True)
        :param istokenized: whether the input sentences are already tokenized
        :param isblocked: whether to block until the server returns the result
        :return: encoded sentence/token-level embeddings, rows correspond to sentences
        :rtype: numpy.ndarray or list[list[float]]
        '''
        self.logger.info('sending new request...')
        try:
            # encode the (optionally tokenized) sentences
            result = self.connection.encode(data,
                                            blocking=isblocked,
                                            is_tokenized=istokenized)
            self.logger.info('encoding job done')
        except Exception as e:
            self.logger.error(f'getting encodes from BERT failed: {str(e)}')
            return []
        return result
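
A minimal usage sketch for BertEmbedd, assuming a bert-serving-server was started with -port 8000 -port_out 8010; the sentences are illustrative:

embedder = BertEmbedd()
embedder.get_connection(inport=8000, outport=8010)
try:
    # plain sentences are sent here, so istokenized is set to False
    vectors = embedder.get_encode(["the service is running", "send me the embeddings"],
                                  istokenized=False)
    print(len(vectors), "sentences encoded")
finally:
    embedder.close_connection()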
Example #7
class BertEncoder(BaseTextEncoder):
    store_args_kwargs = True
    is_trained = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._bc_encoder_args = args
        self._bc_encoder_kwargs = kwargs

    def post_init(self):
        from bert_serving.client import BertClient
        self.bc_encoder = BertClient(*self._bc_encoder_args, **self._bc_encoder_kwargs)

    @batching
    def encode(self, text: List[str], *args, **kwargs) -> np.ndarray:
        return self.bc_encoder.encode(text, *args, **kwargs)  # type: np.ndarray

    def close(self):
        self.bc_encoder.close()
Example #8
def get_vec(question_list, process_config):
    question_vec = []

    if prcess_conifg.vec_type == "bert":
        bc = BertClient(ip="127.0.0.1")
        for question in question_list:
            question_vec.append(bert_vec(bc, question)[0])
        bc.close()

    if prcess_conifg.vec_type == "word":
        model_file = '/data/dataset/news_12g_baidubaike_20g_novel_90g_embedding_64.bin'
        model = gensim.models.KeyedVectors.load_word2vec_format(model_file,
                                                                binary=True)
        print("load 模型完成")
        for question in question_list:
            # strip English letters, digits, and punctuation; this step is optional
            r_s = u'[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
            question_t = re.sub(r_s, '', question)
            question_vec.append(word_vec(model, question_t))

    return question_vec
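
get_vec relies on a bert_vec helper that is not shown in this listing. A minimal, hypothetical sketch of what such a helper might look like, assuming it simply wraps BertClient.encode (the real helper may differ):

import numpy as np
from bert_serving.client import BertClient

def bert_vec(bc: BertClient, question: str) -> np.ndarray:
    # hypothetical helper: encode a single question and return the [1, 768] array,
    # so bert_vec(bc, question)[0] yields the 768-dim sentence vector
    return bc.encode([question])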
Example #9
class BertSupport:
    def __init__(self, use_timeout=True):
        if use_timeout:
            # The BertClient sends requests to the BertServer; the ip parameter is the server address.
            self.bc = BertClient(ip=DEV_BERT_SERVER_IP,
                                 timeout=15000,
                                 check_version=False)
        else:
            self.bc = BertClient(ip=DEV_BERT_SERVER_IP, check_version=False)

    def compute_cosine(self, word1, word2):
        # encode the pair of texts into vectors with BERT
        if word1 == "" or word2 == "":
            return 0
        a = self.bc.encode([word1, word2])
        vector_a = np.mat(a[0])
        vector_b = np.mat(a[1])
        num = float(vector_a * vector_b.T)
        denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
        # cosine of the angle between the two vectors
        cos = num / denom
        # map the cosine from [-1, 1] to a similarity in [0, 1]
        sim = 0.5 + 0.5 * cos
        return sim

    def word_list_vector(self, wordList):
        a = self.bc.encode(wordList)
        d = {}
        for i in range(len(a)):
            d[wordList[i]] = np.mat(a[i])
        return d

    def compute_distance(self, word1, word2):
        a = self.bc.encode([word1, word2])
        distance = a[0] - a[1]
        distance_sum = np.sum(distance)
        return np.abs(distance_sum)

    def close(self):
        self.bc.close()
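
A minimal usage sketch for BertSupport, assuming a bert-serving-server instance is reachable at DEV_BERT_SERVER_IP; the two sentences are illustrative:

support = BertSupport(use_timeout=True)
try:
    sim = support.compute_cosine("How do I reset my password?",
                                 "What is the procedure for resetting a password?")
    print("similarity:", sim)  # value in [0, 1]
finally:
    support.close()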
Example #10
args = namedtuple('args_namedtuple', ','.join(common.keys()))
for k, v in common.items():
    setattr(args, k, v)

for pool_layer in range(1, 13):
    setattr(args, 'pooling_layer', [-pool_layer])
    server = BertServer(args)
    server.start()
    print('wait until server is ready...')
    time.sleep(15)
    print('encoding...')
    bc = BertClient(port=common['port'],
                    port_out=common['port_out'],
                    show_server_config=True)
    subset_vec_all_layers.append(bc.encode(subset_text))
    bc.close()
    server.close()
    print('done at layer -%d' % pool_layer)
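
The arrays collected in subset_vec_all_layers are full 768-dimensional BERT embeddings, while the vis helper below plots only the first two columns of each array, so a 2-D projection is expected in between. A minimal sketch of that step, assuming scikit-learn's PCA (the original reduction code is not part of this listing):

from sklearn.decomposition import PCA

# project each layer's [num_samples, 768] embedding matrix down to 2-D for plotting
subset_vec_2d = [PCA(n_components=2).fit_transform(v) for v in subset_vec_all_layers]
# vis(subset_vec_2d)  # then hand the projected points to vis() below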


def vis(embed, vis_alg='PCA', pool_alg='REDUCE_MEAN'):
    plt.close()
    fig = plt.figure()
    plt.rcParams['figure.figsize'] = [21, 7]
    for idx, ebd in enumerate(embed):
        ax = plt.subplot(2, 6, idx + 1)
        vis_x = ebd[:, 0]
        vis_y = ebd[:, 1]
        plt.scatter(vis_x,
                    vis_y,
                    c=subset_label,
                    # the original snippet is truncated at this point; the remaining
                    # arguments and plot decoration are a minimal, assumed completion
                    marker='.',
                    alpha=0.7,
                    s=2)
        ax.set_title('pool_layer=-%d' % (idx + 1))
    fig.suptitle('%s visualization (pooling: %s)' % (vis_alg, pool_alg), fontsize=14)
    plt.show()

Example #11
    def page_rank_texts(self, texts: list):
        #each txt in texts is tokenized
        from bert_serving.client import BertClient
        import networkx as nx
        #begin re-ranking

        if self.is_tokenized:
            for i in range(len(texts)):
                texts[i] = self.tokenizer.tokenize(texts[i])

        retry = 5
        encoder = None
        encoded_texts = None
        while retry > 0:
            try:
                encoder = BertClient(ip="ring-gpu-3",
                                     port=5555,
                                     check_length=False,
                                     timeout=3000)
                encoded_texts = encoder.encode(texts,
                                               is_tokenized=self.is_tokenized)
                #print("encoded",len(encoded_texts))
                break

            except Exception:
                # only close the client if it was actually created
                if encoder is not None:
                    encoder.close()
                retry -= 1
                if retry < 1:
                    print("error")
                    break
                print("left try", retry)

        if encoded_texts is None:
            # every retry failed; fail loudly instead of crashing on the graph build below
            raise RuntimeError("failed to get encodings from the BERT server")

        G = nx.Graph()
        for i in range(len(texts)):
            for j in range(i):
                w = np.dot(encoded_texts[i], encoded_texts[j])
                G.add_edge(i, j, weight=w)
        rank_scores = nx.pagerank_numpy(G)  # removed in networkx >= 3.0; use nx.pagerank(G) there
        ranks = sorted(rank_scores.items(), key=lambda x: x[1], reverse=True)
        #print(rank_scores)
        #print(ranks)

        ranks = list(map(lambda x: x[0], ranks))
        selected = []

        sumTokens = 0
        while ranks:
            #print("left ranks",ranks)

            txt = texts[ranks[0]]

            if self.is_tokenized:
                txt = " ".join(txt)

            curTokens = txt.split()

            sumTokens += len(curTokens)

            selected.append((txt, ranks[0]))

            if sumTokens > self.maxClip:
                break

            del ranks[0]

        selected = sorted(selected, key=lambda x: x[1])
        selected = map(lambda x: x[0], selected)
        selected = list(selected)

        return selected
Example #12
File: main.py Project: ws719547997/LNB-att
def main1(args):
    log_file = 'logs_' + args.name + str(args.start)  # start is appended to tell runs apart
    model_dir = 'models_' + args.name + str(args.start)
    exp = common.Experiment(log_file,
                            model_dir)  # each run first deletes any existing directory with the same name as model_dir
    # we = ph.utils.WordEmbedding()  # the original function was modified; after initialization embeddings can be queried directly
    bert_client = BertClient(ip='202.201.242.38')
    trainer = Main(args.name + str(args.start), 768)  # pass in the model name and the embedding dimension
    ph.initialize_global_variables()
    #
    test_list = []

    for i in range(domain_num):
        print('***********************')
        print('domain:' + str(i) + dom_list[i])
        train_data, dev_data, test_data = build_dataset_LL(
            i, args, bert_client)  # rewritten data loading; reads the data for one domain
        train_ds = common.TrainSource(train_data,
                                      i)  # implements DataSource from the original code as a DataLoader
        dev_ds = common.TrainSource(dev_data, i)
        test_ds = common.TrainSource(test_data, i)
        test_list.append(test_ds)
        #
        exp.load_model(trainer)  # load the previous model and keep training; the first pass trains from scratch
        seq_stat = trainer.stat.read_stat(trainer.flat_seq)
        states_stat = trainer.stat.read_stat(trainer.flat_states)
        trainer._optimizer.update_mask(trainer.shared.cell.wz, seq_stat, i)
        trainer._optimizer.update_mask(trainer.shared.cell.wr, seq_stat, i)
        trainer._optimizer.update_mask(trainer.shared.cell.wh, seq_stat, i)
        trainer._optimizer.update_mask(trainer.shared.cell.uz, states_stat, i)
        trainer._optimizer.update_mask(trainer.shared.cell.ur, states_stat, i)
        trainer._optimizer.update_mask(trainer.shared.cell.uh, states_stat, i)
        trainer.add_data_trainer(train_ds, 64)  # batch_size
        # trainer.add_screen_logger('train', ('Loss', 'Norm'), interval=1)  # logs the training progress
        trainer.add_data_validator(test_ds, 64, interval=20)  # equivalent to model.eval()
        # trainer.add_screen_logger(  # evaluation results
        #     "validate",
        #     ('hit_pos', 'hit_neg', 'pred_pos', 'pred_neg', 'Error'),
        #     message='[%d]' % i,
        #     interval=20
        # )
        trainer.add_fitter(common.DevFitter(dev_ds, 64, 20))
        trainer.fit(args.num_loops)
        trainer.clear_fitters()  # 训练过程到此结束
        #
        exp.dump_model(trainer)  # save the model

        # test turn
        for test_data in test_list:
            trainer.add_data_validator(test_data, 64,
                                       interval=1)  # model.eval()
            trainer.add_screen_logger(  # print the test results
                "validate",
                ('hit_pos', 'hit_neg', 'pred_pos', 'pred_neg', 'Error'),
                message='[%d]' % i,
                interval=1)
            trainer.fit(1)
            trainer.clear_fitters()
        trainer.stat.update_stats()
    bert_client.close()
    return 0
Example #13
class CorpusSearcher(object):
    def __init__(self, model_file='spacy-2.2/data/embedded_corpus.pkl'):
        from os.path import expanduser
        self.bc = BertClient()
        self.model_file=expanduser(model_file)

    def train(self, quotes, source_col='text'):
        embeddings = self.bc.encode(quotes[source_col].to_list())
        quotes['EMBEDDINGS'] = embeddings.tolist()

        # Persist to pickle
        quotes.to_pickle(self.model_file)

    def train_corpus(self, data_file, source_col='text'):
        # f'{cf.conf_dir}/stack/crawlers/langcrs/all_{lang}.json'
        dfjson = pd.read_json(data_file)
        self.train(dfjson, source_col=source_col)

    def load_quotes_and_embeddings(self, file):
        quotes = pd.read_pickle(file)

        # change dtype in place for memory efficiency
        quotes['EMBEDDINGS'] = quotes['EMBEDDINGS'].apply(
            lambda arr: np.array(arr, dtype='float32')
        )

        quote_embeddings = np.stack(quotes.EMBEDDINGS.values)

        # reduce memory footprint by dropping the column
        quotes = quotes.drop('EMBEDDINGS', axis='columns')

        # L2-normalize the embeddings so nearest neighbours under L2 distance
        # match nearest neighbours under cosine similarity
        embedding_norms = np.linalg.norm(quote_embeddings, axis=1)
        normed_embeddings = quote_embeddings / embedding_norms[:, np.newaxis]
        return quotes, normed_embeddings

    def create_index(self, embeddings):
        """
        Create an index over the quote embeddings for fast similarity search.
        """
        import faiss
        dim = embeddings.shape[1]
        index = faiss.IndexFlatL2(dim)
        index.add(embeddings)
        return index

    def search(self, text, cols, top_result=5):
        text_embedding = self.bc.encode([text])
        # L2-normalize the query to match the normalization of the stored embeddings
        normalized_text_embedding = text_embedding / np.linalg.norm(text_embedding)
        quotes, embeddings = self.load_quotes_and_embeddings(self.model_file)
        index = self.create_index(embeddings)

        _, idx = index.search(normalized_text_embedding, top_result)

        # relevant_quotes = quotes.iloc[idx.flatten()].text.values
        # relevant_chapters = quotes.iloc[idx.flatten()].chapter.values
        rs=[]
        for col in cols:
            rs.append(quotes.iloc[idx.flatten()][col].values)
            # relevant_chapters = quotes.iloc[idx.flatten()]['chapter'].values
        return rs

    @staticmethod
    def parse_controls(results):
        rs = []
        for lang, v in results.items():
            for sent in v:
                rs.append((sent['translate'], lang, sent['translit']))
        return rs

    def run(self, text, langs=None, top_result=5, summary=False, verbose=True):
        """
        $ python -m sagas.corpus.searcher run 'I read a letter.'
        $ python -m sagas.corpus.searcher run 'I read a letter.' ja,id
        $ python -m sagas.corpus.searcher run 'I read a letter.' ja,id,fa 2 True False
        :param text:
        :return:
        """
        # first retrieve the English sentences most similar to the given text
        relevant_quotes, relevant_chapters = self.search(text, ['text', 'chapter'], top_result)
        summary_info=[]
        for q in range(top_result):
            tc.emp('magenta', '>' + relevant_quotes[q])
            tc.emp('green', relevant_chapters[q])

            if langs is not None:
                # the corpus is aligned against English, so sentences in other languages
                # can be looked up directly from the matching English sentence
                # search_in_list('I write a letter.', ['ja', 'fa', 'id'])
                results=search_in_list(relevant_quotes[q], langs)
                if verbose:
                    tc.emp('blue', json.dumps(results, indent=2, ensure_ascii=False))

                if summary:
                    all_types = []
                    rs_c=CorpusSearcher.parse_controls(results)
                    for r in rs_c:
                        if r[2]!='':
                            tc.emp('red', f".. {r[2]}")
                        types=sents_summary(r[0], r[1])
                        all_types.extend(types)
                    summary_info.append((relevant_quotes[q], all_types))

            tc.emp('cyan', '✁', '-' * 30)

        for s in summary_info:
            tc.info(s)

    def end(self):
        self.bc.close()
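
A minimal end-to-end sketch of how CorpusSearcher might be used, assuming a running bert-serving-server and a JSON corpus file with 'text' and 'chapter' columns; the file names are placeholders:

searcher = CorpusSearcher(model_file='~/data/embedded_corpus.pkl')
try:
    # one-off step: embed the corpus and persist it to the pickle file
    searcher.train_corpus('corpus.json', source_col='text')

    # query: returns the values of the requested columns for the top matches
    texts, chapters = searcher.search('I read a letter.', ['text', 'chapter'], top_result=3)
    for t, c in zip(texts, chapters):
        print(c, '->', t)
finally:
    searcher.end()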