Code Example #1
File: bert_as_service.py Project: rkhood/bert
from bert_serving.server import BertServer
from bert_serving.server.helper import get_args_parser


def get_model(TUNED_FLAG=False):
    args = [
        '-model_dir',
        'english_L-12_H-768_A-12/',
        '-port',
        '5555',
        '-port_out',
        '5556',
        '-max_seq_len',
        'NONE',
        '-mask_cls_sep',
        '-num_worker',
        '4',
        '-cpu',
    ]
    if TUNED_FLAG:
        args.extend([
            '-tuned_model_dir',
            '/tmp/mrpc_output/',
            '-ckpt_name',
            'model.ckpt-343',
        ])

    bert_args = get_args_parser().parse_args(args)
    server = BertServer(bert_args)
    server.start()
    BertServer.shutdown(port=5555)
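For reference, a client would connect using the same ports configured above. This is only a minimal sketch, assuming the companion bert-serving-client package; the sample sentences are placeholders:

from bert_serving.client import BertClient

# Ports must match the '-port' / '-port_out' values passed to the server (5555/5556 above).
bc = BertClient(port=5555, port_out=5556)
vectors = bc.encode(['hello world', 'encoding sentences with bert-as-service'])
print(vectors.shape)  # e.g. (2, 768) for the 12-layer, 768-hidden English base model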
Code Example #2
def get_sentence_embedding(rpath, wpath):
    bc = BertClient()
    with open(wpath, 'w') as wf:
        with open(rpath, 'r') as rf:
            lines = rf.readlines()
            for line in tqdm(lines, total=len(lines)):
                user = json.loads(line.strip())
                tips = [
                    t['text']
                    for t in user['fsq']['tips']['tips content'][:MAX_SEQLEN]
                ]
                emb_tips = bc.encode(tips)
                user['fsq']['tips']['tips embedding'] = emb_tips.tolist()
                wf.write(json.dumps(user) + '\n')
    BertServer.shutdown()
Code Example #3
class MyBertServer:
    def __init__(self):
        args = get_args_parser().parse_args([
            '-model_dir',
            '/Data_HDD/zhipengye/projects/bert/multi_cased_L-12_H-768_A-12',
            '-port', '5555', '-port_out', '5556', '-max_seq_len', 'NONE',
            '-mask_cls_sep', '-cpu'
        ])
        self.server = BertServer(args)
        self.server.start()
        print('bert server has started')

    def shutdown(self):
        self.server.shutdown(port=5555)

    def start(self):
        self.server.start()
Code Example #4
def get_word_embedding(rpath, wpath):
    args = get_args_parser().parse_args([
        '-model_dir', BERT_MODEL_PATH, '-max_seq_len',
        str(MAX_TEXTLEN), '-max_batch_size',
        str(MAX_SEQLEN), '-pooling_strategy', 'NONE', '-num_worker', '8',
        '-port', '5555', '-port_out', '5556', '-cased_tokenization', '-cpu'
    ])
    server = BertServer(args)
    server.start()
    bc = BertClient()
    with open(wpath, 'w') as wf:
        with open(rpath, 'r') as rf:
            lines = rf.readlines()
            for line in tqdm(lines, total=len(lines)):
                user = json.loads(line.strip())
                tips = [
                    t['text']
                    for t in user['fsq']['tips']['tips content'][:MAX_SEQLEN]
                ]
                emb_tips = bc.encode(tips)
                user['fsq']['tips']['tips embedding'] = emb_tips.tolist()
                wf.write(json.dumps(user) + '\n')
    BertServer.shutdown(args)
Code Example #5
File: __init__.py Project: mmohaveri/bert-as-service
def terminate():
    from bert_serving.server import BertServer
    from bert_serving.server.helper import get_run_args, get_shutdown_parser
    args = get_run_args(get_shutdown_parser)
    BertServer.shutdown(args)
Code Example #6
def terminate():
    args = get_run_args(get_shutdown_parser)
    BertServer.shutdown(args)
Code Example #7
def stop(port=12544, port_out=12546):
    BertServer.shutdown(port=port, port_out=port_out)
Code Example #8
File: BVSS.py Project: Lukecn1/evaluation_metric
def get_embedding_vectors(candidate_summaries, reference_summaries, n_gram_encoding: None, model, layer, pooling_strategy):
    """
    Generates the embedding vectors for the given sentences/tokens
    Uses the BERT as Service Client to produce the vectors. 

    Args:
        - :param: `candidate_summaries` (list of list of strings): candidate summaries to be encoded - each summary should be represented as a list of sentences
        - :param: `reference_summaries` (list of list of strings): reference summaries to be encoded - each summary should be represented as a list of sentences

        - :param: `n_gram_encoding` (int): n-gram encoding level - designates how many word vectors to combine for each final embedding vector
                                        if `None` -> embedding level defaults to the sentence level of each individual sentence
        
        - :param: `model` (str): the specific bert model to use
        - :param: `layer` (int): the layer of representation to use.
        - :param: `pooling_strategy` (str): the vector combination strategy 
        
    Return:
        - :param: candidate_embeddings, reference_embeddings (list of lists of float): list of embedding vectors for the summaries
                  each summary has a list of vectors (i.e. a matrix)
    """

    """
    notes: 

    start the server from a distinct method --> parse the arguments as the guide shows

    different pooling strategies --> specify which to be used

    Should not truncate the sentences --> set encode parameter for this to "not truncate"

    include list of valid values for vector_level --> n-gram, sentence etc. (Scale up to n = 1 and up so that 1 vector for an entire summary)

    return_tensors: set the value so that it returns the torch tensors

    steps:

    3) Extract and combine at the designated level
    4) return the final vectors
    5) ensure that the method that launches the server is placed in a "main" function call b/c of windows' multi-threading issues 

    """

    launch_bert_as_service_server()
    bert_client = BertClient()

    candidate_embeddings = []
    reference_embeddings = []

    # Generate the embedding vectors for each summary.
    for i in range(len(candidate_summaries)):
        candidate_embeddings.append(bert_client.encode(candidate_summaries[i]))
        reference_embeddings.append(bert_client.encode(reference_summaries[i]))

    print("ENCODINGS COMPLETED, TERMINATING SERVER...")
    shutdown = get_shutdown_parser().parse_args(['-ip','localhost','-port','5555','-timeout','5000'])
    BertServer.shutdown(shutdown)

    if n_gram_encoding is None:
        return candidate_embeddings, reference_embeddings
    elif n_gram_encoding >= 1:
        return get_ngram_embedding_vectors(candidate_embeddings, reference_embeddings, n_gram_encoding, pooling_strategy)
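The launch_bert_as_service_server() helper called in this example is not shown in the snippet. A minimal sketch of what it might look like, assuming the same bert_serving.server API used in the other examples; the model directory, ports, and worker count are placeholder values, not the project's actual configuration:

from bert_serving.server import BertServer
from bert_serving.server.helper import get_args_parser

def launch_bert_as_service_server():
    # Placeholder arguments; adjust -model_dir and the ports to match your setup.
    args = get_args_parser().parse_args([
        '-model_dir', 'uncased_L-12_H-768_A-12/',
        '-port', '5555', '-port_out', '5556',
        '-max_seq_len', 'NONE',
        '-num_worker', '1',
    ])
    server = BertServer(args)
    server.start()
    return server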
Code Example #9
def stop_server():
    shut_args = get_shutdown_parser().parse_args(['-port', '5555'])
    BertServer.shutdown(shut_args)
Code Example #10
File: data_loader.py Project: lucas0/Lux
def load_data(emb_type='w2v', collapse_classes=False, fold=None, num_folds=1, random_state=None, force_reload=False, drop_feat_idx=[]):
    print('Loading data from',dataset_dir)
    data = pd.read_csv(dataset_dir+"/dataset.csv", sep=',')

    if force_reload: reset_hash()

    print("size of initial \"dataset\":",len(data))
    data = data.drop_duplicates(subset='o_url', keep='first')
    print("after dropping duplicates:",len(data))
    data.o_body = data.o_body.astype('str')
    data.verdict = data.verdict.astype('str')
    data['verdict'] = data['verdict'].str.lower()
    #data = data[data['o_body'].map(len) > MIN_BODY_LEN]
    #print("after dropping origins with less than "+str(MIN_BODY_LEN)+" chars:",len(data))
    data = data.reset_index()

    if(collapse_classes):
        print("labels before collapse classes:", data.verdict.unique())
        data.loc[data['verdict'] == "mfalse", 'verdict'] = 'false'
        data.loc[data['verdict'] == "mtrue", 'verdict'] = 'true'

    labels = ['true', 'false']
    print(data['verdict'].value_counts())
    data = data.loc[data.verdict.isin(labels)]
    print("considered labels:", data.verdict.unique())
    print("after dropping invalid labels:",len(data))

    #creating hash
    json_data = data.to_json().encode()
    data = data.sample(frac=1, random_state=random_state)
    df_hash = hashlib.sha256(json_data).hexdigest()

    labels_idx = [labels.index(label) for label in labels]
    labels_one_hot = np.eye(len(labels))[labels_idx]
    label_to_oh = {label:labels_one_hot[labels.index(label)] for label in labels}

    print("MEMORY: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    assert (num_folds > 2), "Needs at least three folds for Dev/Train/Test to be different from each other"
    #generate and save the folds:
    for fold in range(num_folds):
        bucket_size = int(len(data.index)/num_folds)
        fold_dev = fold+1
        if fold == num_folds-1:
            fold_dev = 0

    if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx):
        #TODO modify these two lines back!!!
        df = data[['o_body','verdict']].copy()
        #df = data[['claim','verdict']].copy()
        df = df.rename(columns={"o_body": "body"})
        #df = df.rename(columns={"claim": "body"})
        df.body.apply(clean_text)

        lens = np.asarray([len(e.split(" ")) for e in df['body'].values])
        #df = df[lens < MAX_SENT_LEN]
        df.reset_index(drop = True, inplace = True)
        df.to_csv(data_dir+'/data.csv', sep="\t", index=False)
        num_entries = len(df)

        #plots the data distribution by number of words
        print("Number of entries: ", num_entries)
        print("True/False: ",df.groupby('verdict').count())
        print("Mean and Std of number of words per document: ",np.mean(lens),np.std(lens), "\n")
        #sns.distplot(lens)
        #plt.show()

        ###################################
        ############# FEATURES ############
        ###################################
        #check if new linguistic features should be generated
        flag_concat = False
        if not check_hash(df_hash, num_folds, stage="complexity"):
            flag_concat = True
            #Generate the features ndarray and save it to a pickle
            try:
                feat.generate_complexity()
            except Exception as e:
                print(traceback.format_exc())
                input("Error occured while GENERATING COMPLEXITY. Press any key to exit.")
                sys.exit(1)
            savehash("complexity", hashcode=df_hash)
        if not check_hash(df_hash, num_folds, stage="specificity"):
            flag_concat = True
            try:
                feat.generate_specificity()
            except Exception as e:
                print(traceback.format_exc())
                input("Error occured while GENERATING SPECIFICITY. Press any key to exit.")
                sys.exit(1)
            savehash("specificity", hashcode=df_hash)

        if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx, stage="features"):
            flag_concat = True
            try:
                features = feat.generateFeats()
            except Exception as e:
                print(traceback.format_exc())
                input("Error occured while GENERATING FEATURES. Press any key to exit.")
                sys.exit(1)
            save_p(data_dir+"/features", features)
            print("Generated Features. Saved to pickle.")
            print("Features Shape:", features.shape)
            savehash("features", hashcode=df_hash, drop_feat_idx=drop_feat_idx)

        #check if drop_features is NOT the same
        if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx, stage="drop_feat"):
            flag_concat = True
            savehash("drop_feat", hashcode=df_hash, drop_feat_idx=drop_feat_idx)

        print("MEMORY AFTER FEATURES: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

        ####################################
        ############### BERT ###############
        ####################################
        #check if new bert should be generated
        if not check_hash(df_hash, num_folds, stage="bert"):
            try:
                #creates the shuffle order (not random)
                index_shuf = list(range(len(df)))

                #creates a list of N=folds lists, each inner list contains the index of the elements of each fold
                bert_folds = np.array_split(index_shuf, num_folds)
                bert_folds = [a.tolist() for a in bert_folds]

                #creates an ordered list of N=entries of integers(:folds) indicating the fold idx of each entry
                fold_idx = [bert_folds.index(list(sl)) for e in index_shuf for sl in bert_folds if e in list(sl)]

                #I think this should start as True
                flag = {idx:True for idx in range(len(bert_folds))}

                #get the starting time:
                start_time = time.time()

                #start the bert-as-a-service server
                bert_dir = os.environ.get("BERT_BASE_DIR")
                print(bert_dir)
                args = get_args_parser().parse_args(['-model_dir', bert_dir, '-port', '5555', '-port_out', '5556', '-max_seq_len', '512', '-mask_cls_sep'])
                server = BertServer(args)
                server.start()

                print(num_folds)
                #delete the bert.csv files inside the folds
                for i in range(num_folds):
                    filename = data_dir+"/folds/"+str(i)+"/bert.csv"
                    if os.path.exists(filename):
                        subprocess.call("rm -rf "+filename, shell=True, cwd=data_dir)

                #TODO make this process read only one fold at a time
                for fold, idx in zip(fold_idx, index_shuf):

                    #generates the encodings for the texts
                    bc = BertClient(check_version=False)
                    b = bc.encode([df.body[idx]])[0]

                    bert_df = pd.DataFrame([b], columns=['f'+str(e) for e in range(len(b))])
                    bert_df.to_csv(data_dir+"/folds/"+str(fold)+"/bert.csv", mode='a+', index=False, header=flag[fold])
                    flag[fold] = False

                #stops the bert-as-a-service server
                shut_args = get_shutdown_parser().parse_args(['-ip','localhost','-port','5555','-timeout','5000'])
                server.shutdown(shut_args)

                #print total time
                delta_time = time.time() - start_time
                print('Time Taken: for BERT generation:', time.strftime("%H:%M:%S",time.gmtime(delta_time)))


            except Exception as e:
                print(traceback.format_exc())
                input("Error occured while fine training BERT. Press any key to exit.")
                sys.exit(1)

            print("BERT Embeddings Saved")
            savehash("bert", df_hash)

        #########################################
        ## CONCATENATION, SHUFFLING AND SAVING ##
        #########################################

        #if not check_hash(df_hash, num_folds, stage="concat"):
        if flag_concat:
            features = read_p(data_dir+"/features")
            features = np.delete(features,drop_feat_idx,axis=1)

            #normalize features
            features = np.nan_to_num(features)
            features_t = features.T
            for c in range(features_t.shape[0]):
                row = features_t[c]
                features_t[c] = np.interp(row, (np.min(row), np.max(row)), (-2, +2))
            features = features_t.T
            #delete labels and folds folders
            for i in range(num_folds):
                subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/labels", shell=True, cwd=data_dir)
                subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/features+bert.csv", shell=True, cwd=data_dir)
                subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/bert", shell=True, cwd=data_dir)
                subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/only_bert", shell=True, cwd=data_dir)

            #creates the shuffle order (not random)
            index_shuf = list(range(len(df)))

            #LABELS
            labels = [label_to_oh[label].tolist() for label in df['verdict'].values.tolist()]
            labels = [labels[i] for i in index_shuf]
            label_folds = np.array_split(labels, num_folds)

            for i in range(num_folds):
                fold_dir = data_dir+"/folds/"+str(i)
                if not os.path.exists(fold_dir):
                    os.mkdir(fold_dir)
                save_p(fold_dir+"/labels", label_folds[i])

            #creates a list of N=folds lists, each inner list contains the index of the elements of each fold
            bert_folds = np.array_split(index_shuf, num_folds)
            bert_folds = [a.tolist() for a in bert_folds]

            #creates an ordered list of N=entries of integers(:folds) indicating the fold idx of each entry
            fold_idx = [bert_folds.index(list(sl)) for e in index_shuf for sl in bert_folds if e in list(sl)]

            #TODO make this process read only one fold at a time
            for fold in range(num_folds):
                b_fold_csv = pd.read_csv(data_dir+"/folds/"+str(fold)+"/bert.csv")
                #gets only the indexes
                count = sum([1 for fidx,_ in zip(fold_idx, index_shuf) if fold == fidx])
                for idx in range(count):
                    #print("csv:",b_fold_csv)
                    #print("len",len(b_fold_csv))
                    #print("count: ", count)
                    #print("range(count): ",range(count))
                    b = b_fold_csv.iloc[idx]
                    entry = np.concatenate((features[idx,:],b))

                    feat_df = pd.DataFrame([entry], columns=['f'+str(e) for e in range(len(entry))])
                    feat_df.to_csv(data_dir+"/folds/"+str(fold)+"/features+bert.csv", mode='a+', index=False, header=False)

            for i in range(num_folds):
                fold_dir = data_dir+"/folds/"+str(i)
                bert = np.genfromtxt(fold_dir+"/features+bert.csv", delimiter=',')
                only_bert = np.genfromtxt(fold_dir+"/bert.csv", delimiter=',')
                print("saving bert fold ",str(i), bert.shape)
                save_p(fold_dir+"/bert", bert)
                save_p(fold_dir+"/only_bert", only_bert)

            print("MEMORY AFTER FOLDS SAVING: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

            savehash("concat", hashcode=df_hash)

        checks = ["bert", "features", "concat", "complexity", "specificity"]

        for e in checks:
            print(e)
            print(check_hash(df_hash,num_folds,stage=e))
            if not (check_hash(df_hash,num_folds,stage=e, drop_feat_idx=drop_feat_idx)):
                print('Problem at Generation of data!')
                print("Stage: "+e)
                return

        print('Generation of data successfully done!')
        savehash("data", hashcode=df_hash)
        savehash("folds", hashcode=str(num_folds))

        return load_data(emb_type=emb_type, collapse_classes=collapse_classes, fold=fold, num_folds=num_folds, random_state=random_state, drop_feat_idx=drop_feat_idx)


    else:
        print("Reading already processed data")
        #returns the selected emb type (bert/w2v)
        test_data = read_p(data_dir+"/folds/"+str(fold)+"/"+emb_type)
        test_target = read_p(data_dir+"/folds/"+str(fold)+"/labels")

        dev_data = read_p(data_dir+"/folds/"+str(fold_dev)+"/"+emb_type)
        #dev_data = np.ndarray(dev_data)
        dev_target = read_p(data_dir+"/folds/"+str(fold_dev)+"/labels")

        train_data_filenames = [data_dir+"/folds/"+str(i)+"/"+emb_type for i in range(num_folds) if i not in [fold,fold_dev]]
        train_data = np.concatenate([read_p(fn) for fn in train_data_filenames], axis=0)
        train_target_filenames = [data_dir+"/folds/"+str(i)+"/labels" for i in range(num_folds) if i not in [fold,fold_dev]]
        train_target = np.concatenate([read_p(fn) for fn in train_target_filenames], axis=0)

        return train_data, train_target, dev_data, dev_target, test_data, test_target, label_to_oh
Code Example #11
        avg_sentiment_embeddings, avg_sentiment_embeddings_with_emojis = processing.average_word_embeddings_with_without_emojis(data, emojisInData)

        with open('data/whole_tweet_embeddings.json', 'w', encoding="utf8") as fp:
            json.dump(whole_sentiment_embeddings, fp, default=default)
        with open('data/whole_tweet_embeddings_with_emojis.json', 'w', encoding="utf8") as fp:
            json.dump(whole_sentiment_embeddings_with_emojis, fp, default=default)
        with open('data/avg_tweet_embeddings.json', 'w', encoding="utf8") as fp:
            json.dump(avg_sentiment_embeddings, fp, default=default)
        with open('data/avg_tweet_embeddings_with_emojis.json', 'w', encoding="utf8") as fp:
            json.dump(avg_sentiment_embeddings_with_emojis, fp, default=default)

        # if csv: # NOTE the ml version currently does not support csv
        #     write_csv_embeddings(embeddings)
    
        shut_args = get_shutdown_parser().parse_args(['-ip','localhost','-port','5555','-timeout','5000'])
        BertServer.shutdown(shut_args)
    else:
        with open('data/whole_tweet_embeddings.json') as fp:
            embeddings = json.load(fp)

    if args.ml:

        embeddingFileNames = ['data/whole_tweet_embeddings.json', 
            'data/whole_tweet_embeddings_with_emojis.json', 
            'data/avg_tweet_embeddings.json', 
            'data/avg_tweet_embeddings_with_emojis.json']

        for fileName in embeddingFileNames:
            fileNameUpdate = 'Using ' + fileName + ' as the embeddings'
            print(fileNameUpdate)