Example #1
def launch_bert_as_service_server(model_name, layer, encoding_level=None, pooling_strategy=None):
    """
    Launches a BERT-as-service server used to encode sentences with the designated BERT model
    https://github.com/hanxiao/bert-as-service

    Args:
        - :param: `model_name` (str): the specific BERT model to use
        - :param: `layer` (int): the layer of representation to use
        - :param: `encoding_level` (int): n-gram encoding level - designates how many word vectors to combine for each final embedding vector;
                                        if None, embeddings default to one vector per individual sentence
        - :param: `pooling_strategy` (str): the vector combination strategy - used when `encoding_level` is None (sentence level)

    """

    model_path = bert_model_directories[model_name]
    pooling_layer = layers_base[layer]

    server_parameters = ""
        
    if encoding_level is None:

        if pooling_strategy not in pooling_strategies:
            print('"pooling_strategy" must be defined as one of the following:', pooling_strategies)
            return

        server_parameters = get_args_parser().parse_args(['-model_dir', model_path,
                                        '-port', '5555',
                                        '-port_out', '5556',
                                        '-max_seq_len', 'NONE',
                                        '-pooling_layer', pooling_layer,
                                        '-pooling_strategy', pooling_strategy,
                                        '-num_worker', '1'])
    
    elif encoding_level >= 1:
        server_parameters = get_args_parser().parse_args(['-model_dir', model_path,
                                        '-port', '5555',
                                        '-port_out', '5556',
                                        '-max_seq_len', 'NONE',
                                        '-pooling_layer', pooling_layer,
                                        '-pooling_strategy', 'NONE',
                                        '-num_worker', '1'])
    else:
        print('"encoding_level" must be >=1 or None, see README for descriptions')
        return

    server = BertServer(server_parameters)
    print("LAUNCHING SERVER, PLEASE HOLD", '\n')
    server.start()
    # Include a check here that ensures that the server is running before printing the below statement
    print("SERVER RUNNING, BEGGINING ENCODING...")
Example #2
def get_model(TUNED_FLAG=False):
    args = [
        '-model_dir',
        'english_L-12_H-768_A-12/',
        '-port',
        '5555',
        '-port_out',
        '5556',
        '-max_seq_len',
        'NONE',
        '-mask_cls_sep',
        '-num_worker',
        '4',
        '-cpu',
    ]
    if TUNED_FLAG:
        args.extend([
            '-tuned_model_dir',
            '/tmp/mrpc_output/',
            '-ckpt_name',
            'model.ckpt-343',
        ])

    bert_args = get_args_parser().parse_args(args)
    server = BertServer(bert_args)
    server.start()
    BertServer.shutdown(port=5555)
Example #3
    def _init_bert_client(model_dir, max_seq_len, device_map,
                          num_worker) -> BertClient:
        """Initialize bert client for sentence embeddings and avoid restarting bert-server if already running.

        For more information, see: https://github.com/hanxiao/bert-as-service
        Bert-server can take a long time to start, take over stdout during training, and create many temp log files.
        It's highly recommended to run bert-server beforehand from command-line in a dedicated folder:
        e.g:
        ~/gym-summarizer/data/bert $
            bert-serving-start -model_dir uncased_L-12_H-768_A-12/ -max_seq_len 40 -device_map 1 2 3 4 -num_worker 4

        :param model_dir: directory containing bert model
        :param max_seq_len: max sequence length for bert
        :return bc: bert-client
        """

        try:
            bc = BertClient()
        except Exception:
            from bert_serving.server.helper import get_args_parser
            from bert_serving.server import BertServer
            # parse_args expects string values
            args = get_args_parser().parse_args([
                '-model_dir', model_dir, '-max_seq_len', str(max_seq_len),
                '-device_map', str(device_map), '-num_worker', str(num_worker)
            ])
            server = BertServer(args)
            server.start()
            bc = BertClient()

        return bc
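A minimal call sketch for the helper above (hypothetical values; every argument ends up in parse_args, so they should be passed as strings):

bc = _init_bert_client(model_dir='uncased_L-12_H-768_A-12/',
                       max_seq_len='40',
                       device_map='1',
                       num_worker='1')
vectors = bc.encode(['First do it', 'then do it right', 'then do it better'])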
Example #4
 def __init__(self, model_path):
     args = get_args_parser().parse_args([
         '-num_worker', '4', '-model_dir', model_path, '-port', '5555',
         '-port_out', '5556', '-max_seq_len', 'NONE', '-mask_cls_sep',
         '-cpu'
     ])
     # For details, see: https://github.com/hanxiao/bert-as-service
     self._server = BertServer(args)
def main():
    args = get_args_parser().parse_args([
        '-model_dir', r'../data/chinese_L-12_H-768_A-12', '-port', '86500',
        '-port_out', '86501', '-max_seq_len', '512', '-mask_cls_sep', '-cpu'
    ])

    bs = BertServer(args)
    bs.start()
Example #6
def start_server(max_seq_len, pretrained_model):
    args = get_args_parser().parse_args([
        '-model_dir', pretrained_model, '-port', '5555', '-port_out', '5556',
        '-pooling_strategy', 'NONE', '-show_tokens_to_client', '-max_seq_len',
        str(max_seq_len), '-mask_cls_sep', '-cpu'
    ])
    server = BertServer(args)
    server.start()
Example #7
def main():
    args = get_args_parser().parse_args([
        '-model_dir', './uncased_L-12_H-768_A-12', '-port', '5555',
        '-port_out', '5556', '-max_seq_len', '25', '-num_worker', '1',
        '-mask_cls_sep', '-cpu'
    ])
    server = BertServer(args)
    server.start()
Example #8
def main():
    args = get_args_parser().parse_args([
        '-model_dir', './biobert', '-ckpt_name', 'model.ckpt-1000000', '-port',
        '5555', '-port_out', '5556', '-max_seq_len', '30', '-num_worker', '1',
        '-mask_cls_sep', '-cpu'
    ])
    server = BertServer(args)
    server.start()
Example #9
def start_bert_server():
    from bert_serving.server.helper import get_args_parser
    from bert_serving.server import BertServer
    args = get_args_parser().parse_args(['-model_dir', 'YOUR_MODEL_PATH_HERE',
                                         '-port', '5555',
                                         '-port_out', '5556',
                                         '-num_worker', '1',
                                         '-cpu'])
    server = BertServer(args)
    server.start()
def main():
    args = get_args_parser().parse_args(
        ['-model_dir', 'uncased_L-12_H-768_A-12'])
    #                                      ,'-port', '5555',
    #                                      '-port_out', '5556',
    #                                      '-max_seq_len', 'NONE',
    #                                      '-mask_cls_sep',
    #                                      '-cpu'])
    server = BertServer(args)
    server.start()
Example #11
 def __init__(self):
     args = get_args_parser().parse_args([
         '-model_dir',
         '/Data_HDD/zhipengye/projects/bert/multi_cased_L-12_H-768_A-12',
         '-port', '5555', '-port_out', '5556', '-max_seq_len', 'NONE',
         '-mask_cls_sep', '-cpu'
     ])
     self.server = BertServer(args)
     self.server.start()
     print('bert server has started')
def save_emb():

    common = [
        '-model_dir',
        '/home/ydu/BERT/uncased_L-12_H-768_A-12/',
        '-num_worker',
        '2',
        '-port',
        '5555',
        '-port_out',
        '5556',
        '-max_seq_len',
        '128',
        '-max_batch_size',
        '256',
        # '-tuned_model_dir', '/home/ydu/BERT/bert_mgpu/pretrain_output/10k-32b-all4data/',
        # '-ckpt_name', 'model.ckpt-2500',
    ]

    args = get_args_parser().parse_args(common)

    # folder = ['books', 'dvd', 'electronics', 'kitchen']
    data_path = '/home/ydu/BERT/DATA/'
    data_folder = ['metacritic', 'imdb', 'amazon', 'reddit']

    # model_path = 'home/ydu/BERT/bert_mgpu/results/'
    # model_folder = 'amazon-balanced/'
    # model_type = 'bert-tune'
    data = {}

    # setattr(args, 'tuned_model_dir', '/home/ydu/BERT/bert_mgpu/pretrain_output/reddit-pretrain')
    # setattr(args, 'ckpt_name', 'model.ckpt-2500')
    setattr(args, 'tuned_model_dir',
            '/home/ydu/BERT/bert_mgpu/pretrain_output/10k-32b-all4data')
    setattr(args, 'ckpt_name', 'model.ckpt-2500')

    for d in data_folder:
        fn = data_path + d + '/all.tsv'
        print("===========", fn, "================")
        text = read_tsv(fn)
        server = BertServer(args)
        server.start()
        print('wait until server is ready...')
        time.sleep(20)
        print('encoding...')
        bc = BertClient()
        data[d] = bc.encode(text)
        bc.close()
        server.close()

    pickle_name = data_path + 'EMB/allpre_emb.pickle'
    with open(pickle_name, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return pickle_name
Example #13
def bert_server_start():
    # Thanks to the Harbin Institute of Technology AI team for providing the BERT service
    args = get_args_parser().parse_args(['-num_worker', '1',
                                         '-model_dir', BERT_MODEL_PATH,
                                         '-port', '5555',
                                         '-port_out', '5556',
                                         '-max_seq_len', 'NONE',
                                         '-mask_cls_sep',
                                         '-cpu'])
    bert_server = BertServer(args)
    bert_server.start()
def natural_language_to_embeddings(dataset_file):  # dataset in a CSV file format
    args = get_args_parser().parse_args(['-model_dir', 'C:\\Users\\Ronak\\Desktop\\uncased_L-12_H-768_A-12',
                                         '-port', '5555', '-port_out', '5556',
                                         '-max_seq_len', 'NONE', '-mask_cls_sep', '-cpu'])
    server = BertServer(args)
    server.start()
    bc = BertClient()
    df = pd.read_csv(dataset_file)
    rows = df.values.tolist()
    nat_lan_sen = []
    for row in rows:
        nat_lan_sen.append(row[0])
    sen_encodings = bc.encode(nat_lan_sen)
    return sen_encodings
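A hypothetical call for the function above (placeholder file name; the first CSV column is assumed to hold the sentences):

embeddings = natural_language_to_embeddings('sentences.csv')
print(embeddings.shape)  # (number_of_sentences, 768) for a BERT-base model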
def bert_service_start(switch=True):  # start the BERT service (and its monitor); the parameter toggles the service on or off
    from bert_serving.server.helper import get_args_parser
    from bert_serving.server import BertServer
    from bert_serving.client import BertClient

    args = get_args_parser().parse_args(
        ['-model_dir', r'models\chinese_L-12_H-768_A-12'])
    server = BertServer(args)
    if switch:
        server.start()
    else:
        pass
Example #16
def extract_topics_all(issues_path, model_dir, topic_file, n_topics):
    """Extract topics for all issues with top n_topics topics"""
    topic_all = []
    text_all, divide_list = combine_issues(issues_path)
    topics = tp.get_topic_list(topic_file)
    topic_embedding = tp.get_topic_embedding(topics, port=3500, port_out=3501, model_path=model_dir)
    #topic_embedding = np.load('../output/topic_embedding.npy')
    print('topic embedding shape = ', topic_embedding.shape)
    stop_words = tp.expand_stopwords()
    print(len(stop_words))
    text_flat_tokenized, text_article_tokenized = tp.bert_tokens(text_all)
    tfidf_biglist = tp.tfidf_vec(text_flat_tokenized, stop_words)
    port_in = 6550
    port_out = 6551
    tmp_dir = './output/tmp'
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)
    os.environ['ZEROMQ_SOCK_TMP_DIR'] = tmp_dir
    common = [
        '-model_dir', model_dir,
        '-num_worker', '2',
        '-port', str(port_in),
        '-port_out', str(port_out),
        '-max_seq_len', '20',
        '-max_batch_size', '256',
        '-pooling_strategy', 'NONE',
        '-pooling_layer', '-2',
        '-graph_tmp_dir', tmp_dir,
        '-cpu',
        '-show_tokens_to_client',
    ]
    args = get_args_parser().parse_args(common)
    server = BertServer(args)
    server.start()
    print('wait until server is ready...')
    time.sleep(20)
    print('encoding...')        
    for issue_num in range(len(text_all)):  
        #issue_num = 0
        divide_list_each = divide_list[issue_num]
        text_one_issue = text_all[issue_num]
        vec = tp.get_word_embedding_server_on(text_one_issue, port=port_in, port_out=port_out)
        topics_issue, sort_topic_sim = tp.get_topics_one_issue(vec,topic_embedding,topics, divide_list_each, 
                                               tfidf_biglist, issue_num, n_topics)
        topic_all.append(topics_issue)       
    server.close()
    topic_folder = './output/topic'
    if not os.path.isdir(topic_folder):
        os.makedirs(topic_folder)
    with open(topic_folder + '/topic.pkl', 'wb') as f:
        pickle.dump(topic_all, f)
    return topic_all
Example #17
def start():

    args = get_args_parser().parse_args([
        '-model_dir',
        '/Users/henry/Documents/application/multi-label-bert/data/chinese_L-12_H-768_A-12/',
        '-tuned_model_dir',
        '/Users/henry/Documents/application/nlp_assignments/data/KnowledgeLabel/corpus2/output/',
        '-port', '12544', '-ckpt_name', 'model.ckpt-1000', '-port_out',
        '12546', '-http_port', '12547', '-max_seq_len', '128', '-mask_cls_sep',
        '-show_tokens_to_client', '-pooling_strategy', 'NONE', '-cpu'
    ])
    server = BertServer(args)
    server.start()
Example #18
def start_bert_server():
    args = get_args_parser().parse_args([
        '-model_dir',
        BERT_MODEL_PATH,
        '-max_seq_len',
        str(MAX_TEXTLEN),
        '-max_batch_size',
        str(MAX_SEQLEN),
        #'-pooling_strategy', 'NONE',
        '-num_worker',
        str(multiprocessing.cpu_count()),
        '-port',
        '5555',
        '-port_out',
        '5556',
        '-cased_tokenization',
        '-cpu'
    ])
    server = BertServer(args)
    server.start()
Example #19
 def __init__(self, path,  port='5555', port_out='5556', pooling_strategy='REDUCE_MEAN'):
     """
     Word vocabulary initialization
     
     Args:
         path (str): BERT pretrained vector path
         port (str, optional, defaults to '5555'): server port for receiving data from client
         port_out(str, optional, defaults to '5556'): server port for sending result to client
         pooling_strategy(str, optional, defaults to `REDUCE_MEAN`): {NONE, REDUCE_MAX, REDUCE_MEAN, REDUCE_MEAN_MAX, FIRST_TOKEN, LAST_TOKEN}
     """
     self.__port = port
     self.__port_out = port_out
     args = get_args_parser().parse_args(['-model_dir', path,
                                  '-port', self.__port,
                                  '-port_out', self.__port_out,
                                  '-max_seq_len', 'NONE',
                                  '-mask_cls_sep',
                                  '-cpu',
                                  '-pooling_strategy', pooling_strategy])
     self.__server = BertServer(args)
     self.__server.start()
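A minimal usage sketch for the wrapper above (the class name `BertEmbeddingServer` and the model path are hypothetical); once the server is up, sentences are encoded through a BertClient pointed at the same ports:

embedder = BertEmbeddingServer('/path/to/uncased_L-12_H-768_A-12',
                               pooling_strategy='REDUCE_MEAN')
bc = BertClient(port=5555, port_out=5556)
sentence_vectors = bc.encode(['First do it', 'then do it right'])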
Example #20
def get_word_embedding(rpath, wpath):
    args = get_args_parser().parse_args([
        '-model_dir', BERT_MODEL_PATH, '-max_seq_len',
        str(MAX_TEXTLEN), '-max_batch_size',
        str(MAX_SEQLEN), '-pooling_strategy', 'NONE', '-num_worker', '8',
        '-port', '5555', '-port_out', '5556', '-cased_tokenization', '-cpu'
    ])
    server = BertServer(args)
    server.start()
    bc = BertClient()
    with open(wpath, 'w') as wf:
        with open(rpath, 'r') as rf:
            lines = rf.readlines()
            for line in tqdm(lines, total=len(lines)):
                user = json.loads(line.strip())
                tips = [
                    t['text']
                    for t in user['fsq']['tips']['tips content'][:MAX_SEQLEN]
                ]
                emb_tips = bc.encode(tips)
                user['fsq']['tips']['tips embedding'] = emb_tips.tolist()
                wf.write(json.dumps(user) + '\n')
    BertServer.shutdown(args)
Example #21
from bert_serving.server.helper import get_args_parser
from bert_serving.server import BertServer
args = get_args_parser().parse_args(
    ['-model_dir', r'F:\learn\ngnlab\KG\chinese_L-12_H-768_A-1'])
server = BertServer(args)
server.start()
Example #22
import sys

from bert_serving.server.helper import get_args_parser
from bert_serving.server import BertServer
from bert_serving.client import BertClient
from bert_serving.server.bert import tokenization
import time

if len(sys.argv) < 4:
    print('please provide embeddings, CoNLL file and port')
    exit(0)

port1 = int(sys.argv[3])
port2 = port1 + 1

args = get_args_parser().parse_args([
    '-model_dir', sys.argv[1], '-port',
    str(port1), '-port_out',
    str(port2), '-max_seq_len', 'NONE', '-pooling_strategy', 'NONE',
    '-mask_cls_sep', '-cpu'
])
print('starting bert')
server = BertServer(args)
server.start()
print('started')

#os.system('bert-serving-start -pooling_strategy NONE -model_dir ' + sys.argv[1] + '  -num_worker=1 > /dev/null 2> /dev/null &')

time.sleep(30)  #is this necessary?

print('starting client')
bc = BertClient(port=port1, port_out=port2)
print('done')
time.sleep(30)  # is this necessary?
Example #23
from bert_serving.server.helper import get_args_parser
from bert_serving.server import BertServer

args = get_args_parser().parse_args([
    '-model_dir', 'cased', '-port', '7010', '-port_out', '7011',
    '-max_seq_len', 'NONE', '-mask_cls_sep', '-num_worker', '1', '-cpu'
])
server = BertServer(args)
server.start()
Example #24
from bert_serving.server.helper import get_args_parser
from bert_serving.server import BertServer
import pandas as pd

from bert_serving.client import BertClient

file_path = '/Users/yuchk/PycharmProjects/IMDB/0_dataset/orign/train_imdb.tsv'
model_path = '/Users/yuchk/PycharmProjects/IMDB/0_dataset/bert_model'
res_file_path = '/Users/yuchk/PycharmProjects/IMDB/0_dataset/orign/encode_train_imdb.tsv'

args = get_args_parser().parse_args([
    '-model_dir', model_path, '-port', '5558', '-port_out', '5559',
    '-max_seq_len', 'NONE', '-mask_cls_sep', '-cpu'
])
server = BertServer(args)
server.start()
bc = BertClient(port=5558, port_out=5559)

df = pd.read_csv(file_path, usecols=['sen', 'tag'], sep='\t')
df['encode'] = df['sen'].apply(lambda x: bc.encode([x])[0])
df.to_csv(res_file_path,
          sep="\t",
          encoding="utf-8",
          columns=['sen', "tag", "encode"],
          header=True,
          index=False)
from bert_serving.server import BertServer
from bert_serving.server.helper import get_args_parser
from bert_serving.client import BertClient
import spacy
from spacymoji import Emoji

# in the following, "tweets" refers to Facebook posts

line_done = 0
#multi_cased_L-12_H-768_A-12
#uncased_L-12_H-768_A-12
bert_model_dir = 'pretrained_bert/multi_cased_L-12_H-768_A-12'

check_empty = ""
output_file = open("train_whole_lines.csv", "a")

args = get_args_parser().parse_args(['-model_dir', bert_model_dir,
                                     '-port', '5555', '-port_out', '5556',
                                     '-max_seq_len', 'NONE', '-mask_cls_sep',
                                     '-cpu', '-num_worker=1',
                                     '-pooling_strategy', 'CLS_TOKEN'])


server = BertServer(args)
server.start()
bc = BertClient(ip = 'localhost')

#large vocabulary used in final solution
#nlp = spacy.load("en_core_web_lg")
#small vocabulary is used for testing purpose
print("spaCy en_core_web_sm loading...")
nlp = spacy.load("en_core_web_sm")
print("spaCy loaded")
# we use this library to translate image unicode of emoji ":)" into words "smiling face"
emoji = Emoji(nlp) 
Example #26
File: bert.py  Project: qiekub/ideas
# on another CPU machine
# from bert_serving.client import BertClient
# bc = BertClient(ip='127.0.0.1')  # ip address of the GPU machine
# bc.encode(['First do it', 'then do it right', 'then do it better'])


from bert_serving.server.helper import get_args_parser
from bert_serving.server import BertServer
args = get_args_parser().parse_args(['-model_dir', './multi_cased_L-12_H-768_A-12/',
                                     '-port', '5555',
                                     '-port_out', '5556',
                                     '-max_seq_len', 'NONE',
                                     '-mask_cls_sep',
                                     '-cpu'])
server = BertServer(args)
server.start()

Example #27
import os

from bert_serving.server import BertServer
from bert_serving.server.helper import get_args_parser

# Avoid conflict
USE_CPU = True
arg_list = [
    '-model_dir',
    os.path.join(os.getcwd(), 'uncased_L-12_H-768_A-12'), '-port', '23333',
    '-num_worker=1'
]
if USE_CPU:
    arg_list.append('-cpu')
args = get_args_parser().parse_args(arg_list)

if __name__ == '__main__':
    server = BertServer(args)
    server.start()
Example #28
File: data_loader.py  Project: lucas0/Lux
def load_data(emb_type='w2v', collapse_classes=False, fold=None, num_folds=1, random_state=None, force_reload=False, drop_feat_idx=[]):
    print('Loading data from',dataset_dir)
    data = pd.read_csv(dataset_dir+"/dataset.csv", sep=',')

    if force_reload: reset_hash()

    print("size of initial \"dataset\":",len(data))
    data = data.drop_duplicates(subset='o_url', keep='first')
    print("after dropping duplicates:",len(data))
    data.o_body = data.o_body.astype('str')
    data.verdict = data.verdict.astype('str')
    data['verdict'] = data['verdict'].str.lower()
    #data = data[data['o_body'].map(len) > MIN_BODY_LEN]
    #print("after dropping origins with less than "+str(MIN_BODY_LEN)+" chars:",len(data))
    data = data.reset_index()

    if(collapse_classes):
        print("labels before collapse classes:", data.verdict.unique())
        data.loc[data['verdict'] == "mfalse", 'verdict'] = 'false'
        data.loc[data['verdict'] == "mtrue", 'verdict'] = 'true'

    labels = ['true', 'false']
    print(data['verdict'].value_counts())
    data = data.loc[data.verdict.isin(labels)]
    print("considered labels:", data.verdict.unique())
    print("after dropping invalid labels:",len(data))

    #creating hash
    json_data = data.to_json().encode()
    data = data.sample(frac=1, random_state=random_state)
    df_hash = hashlib.sha256(json_data).hexdigest()

    labels_idx = [labels.index(label) for label in labels]
    labels_one_hot = np.eye(len(labels))[labels_idx]
    label_to_oh = {label:labels_one_hot[labels.index(label)] for label in labels}

    print("MEMORY: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    assert (num_folds > 2), "Needs at least three folds for Dev/Train/Test to be different from each other"
    #generate and save the folds:
    for fold in range(num_folds):
        bucket_size = int(len(data.index)/num_folds)
        fold_dev = fold+1
        if fold == num_folds-1:
            fold_dev = 0

    if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx):
        #TODO modify these two lines back!!!
        df = data[['o_body','verdict']].copy()
        #df = data[['claim','verdict']].copy()
        df = df.rename(columns={"o_body": "body"})
        #df = df.rename(columns={"claim": "body"})
        df.body.apply(clean_text)

        lens = np.asarray([len(e.split(" ")) for e in df['body'].values])
        #df = df[lens < MAX_SENT_LEN]
        df.reset_index(drop = True, inplace = True)
        df.to_csv(data_dir+'/data.csv', sep="\t", index=False)
        num_entries = len(df)

        #plots the data distribution by number of words
        print("Number of entries: ", num_entries)
        print("True/False: ",df.groupby('verdict').count())
        print("Mean and Std of number of words per document: ",np.mean(lens),np.std(lens), "\n")
        #sns.distplot(lens)
        #plt.show()

        ###################################
        ############# FEATURES ############
        ###################################
        #check if new linguistic features should be generated
        flag_concat = False
        if not check_hash(df_hash, num_folds, stage="complexity"):
            flag_concat = True
            #Generate the features ndarray and save it to a pickle
            try:
                feat.generate_complexity()
            except Exception as e:
                print(traceback.format_exc())
                input("Error occured while GENERATING COMPLEXITY. Press any key to exit.")
                sys.exit(1)
            savehash("complexity", hashcode=df_hash)
        if not check_hash(df_hash, num_folds, stage="specificity"):
            flag_concat = True
            try:
                feat.generate_specificity()
            except Exception as e:
                print(traceback.format_exc())
                input("Error occured while GENERATING SPECIFICITY. Press any key to exit.")
                sys.exit(1)
            savehash("specificity", hashcode=df_hash)

        if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx, stage="features"):
            flag_concat = True
            try:
                features = feat.generateFeats()
            except Exception as e:
                print(traceback.format_exc())
                input("Error occured while GENERATING FEATURES. Press any key to exit.")
                sys.exit(1)
            save_p(data_dir+"/features", features)
            print("Generated Features. Saved to pickle.")
            print("Features Shape:", features.shape)
            savehash("features", hashcode=df_hash, drop_feat_idx=drop_feat_idx)

        #check if drop_features is NOT the same
        if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx, stage="drop_feat"):
            flag_concat = True
            savehash("drop_feat", hashcode=df_hash, drop_feat_idx=drop_feat_idx)

        print("MEMORY AFTER FEATURES: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

        ####################################
        ############### BERT ###############
        ####################################
        #check if new bert should be generated
        if not check_hash(df_hash, num_folds, stage="bert"):
            try:
                #creates the shuffle order (not random)
                index_shuf = list(range(len(df)))

                #creates a list of N=folds lists, each inner list contains the index of the elements of each fold
                bert_folds = np.array_split(index_shuf, num_folds)
                bert_folds = [a.tolist() for a in bert_folds]

                #creates an ordered list of N=entries of integers(:folds) indicating the fold idx of each entry
                fold_idx = [bert_folds.index(list(sl)) for e in index_shuf for sl in bert_folds if e in list(sl)]

                #I think this should start as True
                flag = {idx:True for idx in range(len(bert_folds))}

                #get the starting time:
                start_time = time.time()

                #start the bert-as-a-service server
                bert_dir = os.environ.get("BERT_BASE_DIR")
                print(bert_dir)
                args = get_args_parser().parse_args(['-model_dir', bert_dir, '-port', '5555', '-port_out', '5556', '-max_seq_len', '512', '-mask_cls_sep'])
                server = BertServer(args)
                server.start()

                print(num_folds)
                #delete the bert.csv files inside the folds
                for i in range(num_folds):
                    filename = data_dir+"/folds/"+str(i)+"/bert.csv"
                    if os.path.exists(filename):
                        subprocess.call("rm -rf "+filename, shell=True, cwd=data_dir)

                #TODO make this process read only one fold at a time
                for fold, idx in zip(fold_idx, index_shuf):

                    #generates the encodings for the texts
                    bc = BertClient(check_version=False)
                    b = bc.encode([df.body[idx]])[0]

                    bert_df = pd.DataFrame([b], columns=['f'+str(e) for e in range(len(b))])
                    bert_df.to_csv(data_dir+"/folds/"+str(fold)+"/bert.csv", mode='a+', index=False, header=flag[fold])
                    flag[fold] = False

                #stops the bert-as-a-service server
                shut_args = get_shutdown_parser().parse_args(['-ip','localhost','-port','5555','-timeout','5000'])
                server.shutdown(shut_args)

                #print total time
                delta_time = time.time() - start_time
                print('Time Taken: for BERT generation:', time.strftime("%H:%M:%S",time.gmtime(delta_time)))


            except Exception as e:
                print(traceback.format_exc())
                input("Error occured while fine training BERT. Press any key to exit.")
                sys.exit(1)

            print("BERT Embeddings Saved")
            savehash("bert", df_hash)

        #########################################
        ## CONCATENATION, SHUFFLING AND SAVING ##
        #########################################

        #if not check_hash(df_hash, num_folds, stage="concat"):
        if flag_concat:
            features = read_p(data_dir+"/features")
            features = np.delete(features,drop_feat_idx,axis=1)

            #normalize features
            features = np.nan_to_num(features)
            features_t = features.T
            for c in range(features_t.shape[0]):
                row = features_t[c]
                features_t[c] = np.interp(row, (np.min(row), np.max(row)), (-2, +2))
            features = features_t.T
            #delete labels and folds folders
            for i in range(num_folds):
                subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/labels", shell=True, cwd=data_dir)
                subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/features+bert.csv", shell=True, cwd=data_dir)
                subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/bert", shell=True, cwd=data_dir)
                subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/only_bert", shell=True, cwd=data_dir)

            #creates the shuffle order (not random)
            index_shuf = list(range(len(df)))

            #LABELS
            labels = [label_to_oh[label].tolist() for label in df['verdict'].values.tolist()]
            labels = [labels[i] for i in index_shuf]
            label_folds = np.array_split(labels, num_folds)

            for i in range(num_folds):
                fold_dir = data_dir+"/folds/"+str(i)
                if not os.path.exists(fold_dir):
                    os.mkdir(fold_dir)
                save_p(fold_dir+"/labels", label_folds[i])

            #creates a list of N=folds lists, each inner list contains the index of the elements of each fold
            bert_folds = np.array_split(index_shuf, num_folds)
            bert_folds = [a.tolist() for a in bert_folds]

            #creates an ordered list of N=entries of integers(:folds) indicating the fold idx of each entry
            fold_idx = [bert_folds.index(list(sl)) for e in index_shuf for sl in bert_folds if e in list(sl)]

            #TODO make this process read only one fold at a time
            for fold in range(num_folds):
                b_fold_csv = pd.read_csv(data_dir+"/folds/"+str(fold)+"/bert.csv")
                #gets only the indexes
                count = sum([1 for fidx,_ in zip(fold_idx, index_shuf) if fold == fidx])
                for idx in range(count):
                    #print("csv:",b_fold_csv)
                    #print("len",len(b_fold_csv))
                    #print("count: ", count)
                    #print("range(count): ",range(count))
                    b = b_fold_csv.iloc[idx]
                    entry = np.concatenate((features[idx,:],b))

                    feat_df = pd.DataFrame([entry], columns=['f'+str(e) for e in range(len(entry))])
                    feat_df.to_csv(data_dir+"/folds/"+str(fold)+"/features+bert.csv", mode='a+', index=False, header=False)

            for i in range(num_folds):
                fold_dir = data_dir+"/folds/"+str(i)
                bert = np.genfromtxt(fold_dir+"/features+bert.csv", delimiter=',')
                only_bert = np.genfromtxt(fold_dir+"/bert.csv", delimiter=',')
                print("saving bert fold ",str(i), bert.shape)
                save_p(fold_dir+"/bert", bert)
                save_p(fold_dir+"/only_bert", only_bert)

            print("MEMORY AFTER FOLDS SAVING: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

            savehash("concat", hashcode=df_hash)

        checks = ["bert", "features", "concat", "complexity", "specificity"]

        for e in checks:
            print(e)
            print(check_hash(df_hash,num_folds,stage=e))
            if not (check_hash(df_hash,num_folds,stage=e, drop_feat_idx=drop_feat_idx)):
                print('Problem at Generation of data!')
                print("Stage: "+e)
                return

        print('Generation of data successfully done!')
        savehash("data", hashcode=df_hash)
        savehash("folds", hashcode=str(num_folds))

        return load_data(emb_type=emb_type, collapse_classes=collapse_classes, fold=fold, num_folds=num_folds, random_state=random_state, drop_feat_idx=drop_feat_idx)


    else:
        print("Reading already processed data")
        #returns the selected emb type (bert/w2v)
        test_data = read_p(data_dir+"/folds/"+str(fold)+"/"+emb_type)
        test_target = read_p(data_dir+"/folds/"+str(fold)+"/labels")

        dev_data = read_p(data_dir+"/folds/"+str(fold_dev)+"/"+emb_type)
        #dev_data = np.ndarray(dev_data)
        dev_target = read_p(data_dir+"/folds/"+str(fold_dev)+"/labels")

        train_data_filenames = [data_dir+"/folds/"+str(i)+"/"+emb_type for i in range(num_folds) if i not in [fold,fold_dev]]
        train_data = np.concatenate([read_p(fn) for fn in train_data_filenames], axis=0)
        train_target_filenames = [data_dir+"/folds/"+str(i)+"/labels" for i in range(num_folds) if i not in [fold,fold_dev]]
        train_target = np.concatenate([read_p(fn) for fn in train_target_filenames], axis=0)

        return train_data, train_target, dev_data, dev_target, test_data, test_target, label_to_oh
Example #29
# NOTE: this fragment assumes `port`, `port_out`, `subset_text` (a list of sentences) and
# `subset_vec_all_layers` (an accumulator list) are defined earlier in the original script.
from bert_serving.server.helper import get_args_parser
from bert_serving.server import BertServer
from bert_serving.client import BertClient
import time
common = [
    '-model_dir', '/bert_model/chinese_L-12_H-768_A-12/',
    '-num_worker', '2',
    '-port', str(port),
    '-port_out', str(port_out),
    '-max_seq_len', '20',
    # '-client_batch_size', '2048',
    '-max_batch_size', '256',
    # '-num_client', '1',
    '-pooling_strategy', 'REDUCE_MEAN',
    '-pooling_layer', '-2',
    '-gpu_memory_fraction', '0.2',
    '-device','3',
]
args = get_args_parser().parse_args(common)

for pool_layer in range(1, 13):
    setattr(args, 'pooling_layer', [-pool_layer])
    server = BertServer(args)
    server.start()
    print('wait until server is ready...')
    time.sleep(20)
    print('encoding...')
    bc = BertClient(port=port, port_out=port_out, show_server_config=True)
    subset_vec_all_layers.append(bc.encode(subset_text))
    bc.close()
    server.close()
    print('done at layer -%d' % pool_layer)

#save bert vectors and labels
Example #30
    def precompute_embeddings(path_to_binary: Path, path_to_batches: Path,
                              batch_size: int = 100,
                              bert_model_dir: str = "bert/uncased_L-12_H-768_A-12"):
        """Precompute and store sentence embeddings, along with articles and summaries, in batches of 100.


        :param path_to_binary:  Path to pre-tokenized binaries (train/valid/test.bin,
                                from https://github.com/JafferWilson/Process-Data-of-CNN-DailyMail)
        :param path_to_batches: Path prefix for storing batches of embeddings/articles/summaries
        :param batch_size:      Number of articles per batch.
        :param bert_model_dir:  Directory of bert model (https://github.com/hanxiao/bert-as-service)
        """
        from bert_serving.client import BertClient
        from tensorflow.core.example import example_pb2
        import struct
        import nltk
        nltk.download('punkt')

        # load bert client (and bert server if not already running)
        try:
            bc = BertClient()
        except Exception:
            from bert_serving.server.helper import get_args_parser
            from bert_serving.server import BertServer

            # parse_args expects string tokens; device IDs are passed as separate values
            args = get_args_parser().parse_args(['-model_dir', bert_model_dir,
                                                 '-max_seq_len', '40',
                                                 '-num_worker', '4',
                                                 '-device_map', '1', '2', '3', '4'])
            server = BertServer(args)
            server.start()
            bc = BertClient()
        print("Bert client loaded...")

        # load articles and summaries
        articles: List[List[str]] = []
        summaries: List[str] = []
        reader = open(path_to_binary, 'rb')
        i = 0
        while True:
            len_bytes = reader.read(8)
            if not len_bytes: break  # finished reading this file
            str_len = struct.unpack('q', len_bytes)[0]
            example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
            example = example_pb2.Example.FromString(example_str)
            try:
                article = example.features.feature['article'].bytes_list.value[0].decode('utf-8')
                summary = example.features.feature['abstract'].bytes_list.value[0].decode('utf-8')
                if len(article) != 0:
                    articles.append(nltk.sent_tokenize(article))
                    summaries.append(summary.replace("<s>", "").replace("</s>", ""))
                    i += 1
                    if not i % 1000: print(f"loaded {i} articles...")
            except ValueError:
                print("Failed retrieving an article or abstract.")
        print(f"Articles and summaries read from path: {path_to_binary}...")

        # precompute embeddings, and store batches of embeddings/articles/summaries
        for i in tqdm(range(0, len(articles), batch_size)):
            j = min(len(articles), i + batch_size)
            print(f"embedding articles {i}-{j}...")
            a = articles[i:j]
            s = summaries[i:j]
            articles_tensor = bc.encode(sum(a, []))

            np.savez_compressed(f"{path_to_batches}.article_tensors.{i}.npz", articles_tensor)

            with open(f"{path_to_batches}.sentencized_articles.{i}.pkl", 'wb') as f:
                pickle.dump(a, f)

            with open(f"{path_to_batches}.summaries.{i}.pkl", 'wb') as f:
                pickle.dump(s, f)

            i += batch_size
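A hypothetical invocation of the method above (paths are placeholders; the .bin file is one of the pre-tokenized CNN/DailyMail binaries referenced in the docstring, and the method is assumed to be callable as a static helper):

from pathlib import Path

precompute_embeddings(path_to_binary=Path('data/cnn_dm/train.bin'),
                      path_to_batches=Path('data/cnn_dm/batches/train'),
                      batch_size=100,
                      bert_model_dir='bert/uncased_L-12_H-768_A-12')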