Esempio n. 1
0
def ft_process(data_url=None):
    """Convert raw data into fastText input format and cache it under
    './processed_data'.

    Args:
        data_url: url to the original .csv data; None to use TRAIN_URL

    Returns:
        str: url to the saved processed data file

    """
    # BUG FIX: basename(None) raises TypeError, so when data_url is None
    # (supported below) derive the cache filename from TRAIN_URL instead
    source_url = data_url if data_url is not None else TRAIN_URL
    save_filename = basename(source_url).replace('.csv', '_ft.csv')
    save_url = from_project_root("embedding_model/processed_data/" + save_filename)

    # file specified by data_url is already processed
    if exists(save_url):
        return save_url

    if data_url is not None:
        labels, sentences = load_raw_data(data_url)
    else:
        train_df = load_to_df(TRAIN_URL)
        labels = train_df['class'].values
        # NOTE(review): 'word_seg' values appear to be joined directly below;
        # confirm they are token sequences, not raw strings (joining a raw
        # string would insert a space between every character)
        sentences = train_df['word_seg']

    with open(save_url, "w", encoding='utf-8', newline='\n') as ft_file:
        # fastText expects one "<prefix><label> token token ..." line per sample
        for label_value, sentence_tokens in zip(labels, sentences):
            label = FT_LABEL_PREFIX + str(label_value)
            sentence = ' '.join(sentence_tokens)
            ft_file.write('{} {}\n'.format(label, sentence))
    return save_url
Esempio n. 2
0
def train_ft_model(args, data_url=None):
    """ load the ft model or train a new one

    Args:
        data_url: train data url
        args: args for model

    Returns:
        ft model

    """
    # make sure the training data is already in fastText format
    needs_processing = data_url is None or not data_url.endswith('_ft.csv')
    if needs_processing:
        data_url = ft_process(data_url)

    # model specified by model_url is already trained and saved
    model_url = args_to_url(data_url, args)
    if exists(model_url):
        return ft.load_model(model_url, label_prefix=FT_LABEL_PREFIX)

    print("fasttext model is training, model will be saved at\n ", model_url)
    output_url = model_url[:-4]  # ft will add .bin automatically

    # Available params and their defaults (see fastText python docs):
    #   input_file            training file path (required)
    #   output                output file path (required)
    #   label_prefix          label prefix ['__label__']
    #   lr                    learning rate [0.1]
    #   lr_update_rate        change the rate of updates for the learning rate [100]
    #   dim                   size of word vectors [100]
    #   ws                    size of the context window [5]
    #   epoch                 number of epochs [5]
    #   min_count             minimal number of word occurences [1]
    #   neg                   number of negatives sampled [5]
    #   word_ngrams           max length of word ngram [1]
    #   loss                  loss function {ns, hs, softmax} [softmax]
    #   bucket                number of buckets [0]
    #   minn                  min length of char ngram [0]
    #   maxn                  max length of char ngram [0]
    #   thread                number of threads [12]
    #   t                     sampling threshold [0.0001]
    #   silent                disable the log output from the C++ extension [1]
    #   encoding              specify input_file encoding [utf-8]
    #   pretrained_vectors    pretrained word vectors (.vec file) for supervised learning []

    started = time()
    classifier = ft.supervised(data_url, output_url, thread=N_JOBS,
                               label_prefix=FT_LABEL_PREFIX, **args)
    print("training finished in %.3f second\n" % (time() - started))
    return classifier
Esempio n. 3
0
def main():
    """Prepare the vocabulary (when the embedding file is missing) and run training."""
    start_time = datetime.now()
    # build the vocab only if the embedding file has not been generated yet
    if EMBED_URL and not exists(EMBED_URL):
        pretrained_url = from_project_root(
            "data/embedding/PubMed-shuffle-win-30.bin")
        prepare_vocab([TRAIN_URL, DEV_URL, TEST_URL],
                      pretrained_url,
                      update=True)
    train(test_url=TEST_URL)
    print("finished in:")
    print(datetime.now() - start_time)
    # removed redundant trailing `pass` (dead statement)
Esempio n. 4
0
def main():
    """Prepare the vocabulary (when the embedding file is missing) and run end-to-end training."""
    start_time = datetime.now()
    # build the vocab only if the embedding file has not been generated yet
    if EMBED_URL and not exists(EMBED_URL):
        # pre-trained embedding url, word2vec format
        pretrained_url = PRETRAINED_URL
        prepare_vocab([TRAIN_URL, DEV_URL, TEST_URL],
                      pretrained_url,
                      update=True,
                      min_count=1)
    train_end2end(test_url=TEST_URL)
    print("finished in:", datetime.now() - start_time)
    # removed redundant trailing `pass` (dead statement)
Esempio n. 5
0
def train_w2v_model(data_url=None, kwargs=None):
    """Load a cached Word2Vec model or train and save a new one.

    Args:
        data_url: url to data file, None to train on all train + test text
        kwargs: args for the Word2Vec model, None for defaults

    Returns:
        trained Word2Vec model

    """
    # BUG FIX: the documented default kwargs=None crashed both at
    # args_to_url(kwargs) and when splatted as **kwargs below
    kwargs = {} if kwargs is None else kwargs

    model_url = args_to_url(kwargs)
    # reuse a previously trained model when one exists for these args
    if exists(model_url):
        return Word2Vec.load(model_url)

    if data_url is not None:
        _, sequences = load_raw_data(data_url)

    # use data from all train text and test text
    else:
        train_df = load_to_df(TRAIN_URL)
        test_df = load_to_df(TEST_URL)
        # NOTE(review): Series.append was removed in pandas 2.0; pd.concat is
        # the modern replacement — confirm the pinned pandas version
        sequences = train_df['word_seg'].append(test_df['word_seg'],
                                                ignore_index=True)
        sequences = sequences.apply(str.split)

    print("Word2Vec model is training...\n trained model will be saved at \n ",
          model_url)
    s_time = time()
    # more info here [https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec]
    model = Word2Vec(sequences, workers=N_JOBS, **kwargs)
    e_time = time()
    print("training finished in %.3f seconds" % (e_time - s_time))
    model.save(model_url)
    # also export the word vectors alone in text (word2vec) format
    wv_save_url = model_url.replace('.bin', '.txt').replace('w2v', 'wv')
    model.wv.save_word2vec_format(wv_save_url, binary=False)
    return model
Esempio n. 6
0
def train_d2v_model(data_url=None, kwargs=None):
    """Load a cached Doc2Vec model or train and save a new one.

    Args:
        data_url: url to data file, None to train on all train + test text
        kwargs: args for the Doc2Vec model, None for defaults

    Returns:
        trained Doc2Vec model

    """
    # BUG FIX: the documented default kwargs=None crashed both at
    # args_to_url(kwargs) and when splatted as **kwargs below
    kwargs = {} if kwargs is None else kwargs

    model_url = args_to_url(kwargs)
    # reuse a previously trained model when one exists for these args
    if exists(model_url):
        return Doc2Vec.load(model_url)

    if data_url is not None:
        _, sequences = load_raw_data(data_url)

    # use data from all train text and test text
    else:
        train_df = load_to_df(TRAIN_URL)
        test_df = load_to_df(TEST_URL)
        # NOTE(review): Series.append was removed in pandas 2.0; pd.concat is
        # the modern replacement — confirm the pinned pandas version
        sequences = train_df['word_seg'].append(test_df['word_seg'],
                                                ignore_index=True)
        sequences = sequences.apply(str.split)

    # Doc2Vec needs each document wrapped with a unique tag
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sequences)]
    print("Doc2Vec model is training...\n trained model will be saved at \n ",
          model_url)
    # more info here [https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec]
    s_time = time()
    model = Doc2Vec(documents, workers=N_JOBS, **kwargs)
    e_time = time()
    print("training finished in %.3f seconds" % (e_time - s_time))
    # save after the timed span, consistent with train_w2v_model
    model.save(model_url)
    return model