import os
import urllib.request
import zipfile

import pytreebank

from definitions import verify_cwd


def main():
    verify_cwd()
    if not os.path.exists("./data/"):
        print("creating folder...")
        os.mkdir("./data/")
    if not os.path.exists("./data/trees/"):
        print("preparing sentiment treebank...")
        try:
            pytreebank.load_sst("./data/")
        except Exception:
            # The pytreebank downloader is not robust on Windows; we only
            # need the data and the parser, so failures here are ignored.
            pass
    if not os.path.exists("./data/text8.zip"):
        print("retrieving text8...")
        urllib.request.urlretrieve("http://mattmahoney.net/dc/text8.zip",
                                   "./data/text8.zip")
    if not os.path.exists("./data/text8"):
        print("extracting text8...")
        with zipfile.ZipFile("./data/text8.zip", "r") as zip_ref:
            zip_ref.extractall("./data/")
    if not os.path.exists("./data/word2vec.model"):
        print("training word2vec...")
        train_word2vec()
    # GloVe training is slow, so it is not run automatically; call
    # train_glove() manually from create_pretrain_model.py when needed.
    # if not os.path.exists("./data/glove.model"):
    #     print("training glove...")
    #     train_glove()
    print("=== ALL CLEAR! ===")
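# A small sanity check one might run after main(); this is a sketch, and
# check_pretrain_artifacts is a hypothetical helper not present in the
# original code. The paths are the ones main() creates above; glove.model
# is omitted because train_glove() must be invoked manually.
def check_pretrain_artifacts():
    for path in ["./data/trees/", "./data/text8.zip", "./data/text8",
                 "./data/word2vec.model"]:
        assert os.path.exists(path), "missing artifact: " + path
    print("all pretrain artifacts present.")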
def main():
    verify_cwd()
    try:
        pytreebank.load_sst("./data/")
    except Exception:
        # The pytreebank downloader is not robust on Windows; we only
        # need the data and the parser, so failures here are ignored.
        pass
    train_data = pytreebank.import_tree_corpus("./data/trees/train.txt")
    assert str(train_data[0]) == TARGET_STRING, "test failed for pytreebank."
    print("Correctness verified.")
import logging

from gensim.models import word2vec

from definitions import verify_cwd


def train_word2vec(sentences=None, nr_feature=None, save_name=None):
    verify_cwd()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    if sentences is None:
        sentences = word2vec.Text8Corpus("./data/text8")
    if save_name is None:
        save_name = "./data/word2vec.model"
    if nr_feature is None:
        nr_feature = 200
    # `size` is the embedding dimensionality (gensim 3.x; renamed to
    # `vector_size` in gensim 4.x). We may need to retrain this later.
    model = word2vec.Word2Vec(sentences, size=nr_feature)
    model.save(save_name)
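# A minimal usage sketch, assuming gensim 3.x and that the model above has
# already been saved to ./data/word2vec.model: reload it and query nearest
# neighbours in the embedding space.
from gensim.models import word2vec

model = word2vec.Word2Vec.load("./data/word2vec.model")
print(model.wv["king"].shape)                 # (200,) with the defaults above
print(model.wv.most_similar("king", topn=5))  # 5 nearest words by cosine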
import itertools
import multiprocessing

import glove
from gensim.models import word2vec

from definitions import verify_cwd


def train_glove(sentences=None, nr_feature=None, save_name=None):
    verify_cwd()
    if sentences is None:
        print("preprocessing sentences...")
        sentences = list(
            itertools.islice(word2vec.Text8Corpus('./data/text8'), None))
        print("{} sentences found.".format(len(sentences)))
    if save_name is None:
        save_name = "./data/glove.model"
    if nr_feature is None:
        nr_feature = 200
    corpus = glove.Corpus()
    print("start fitting sentences...")
    corpus.fit(sentences, window=10)
    gl = glove.Glove(no_components=nr_feature, learning_rate=0.05)
    print("start training glove...")
    gl.fit(corpus.matrix, epochs=10,
           no_threads=multiprocessing.cpu_count(), verbose=True)
    # Attach the word->id mapping so the saved model can be queried by word.
    gl.add_dictionary(corpus.dictionary)
    corpus.save("./data/corpus.model")
    gl.save(save_name)
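# A minimal usage sketch, assuming the glove-python package imported above:
# reload the trained model and query it by word. most_similar() relies on
# the dictionary attached via add_dictionary() during training.
import glove

gl = glove.Glove.load("./data/glove.model")
print(gl.most_similar("king", number=5))  # 5 nearest words in GloVe space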
import pickle

import numpy as np
from tqdm import tqdm

from analyzer.common.dataset import Dataset
from definitions import verify_cwd

np.random.seed(42)

if __name__ == "__main__":
    verify_cwd()
    # Collect the lower-cased vocabulary of the train/val/test splits.
    words = set()
    for get_split in [
            Dataset.get_raw_train_dataset, Dataset.get_raw_val_dataset,
            Dataset.get_raw_test_dataset
    ]:
        for tree in tqdm(get_split()):
            text = tree.to_lines()[0].split()
            for word in text:
                words.add(word.lower())
    pickle.dump(words, open("./data/words.pkl", "wb"))

    # Augment the pretrained GloVe dictionary: each vocabulary word missing
    # from it gets a small random vector drawn uniformly from [-0.05, 0.05).
    glove_dict = pickle.load(open("./data/glove_300.pkl", "rb"))
    print(len(words))
    missing = set()
    for word in words:
        if word not in glove_dict:
            missing.add(word)
            glove_dict[word] = np.random.rand(300) * 0.1 - 0.05
            # glove_dict[word] = np.zeros((300,))
    pickle.dump(glove_dict, open("./data/glove_300_aug.pkl", "wb"))
    print(len(missing))
    pickle.dump(missing, open("./data/missing.pkl", "wb"))
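# A quick sanity check, assuming the pickle paths above: after augmentation,
# every vocabulary word should resolve to a 300-d vector.
import pickle

words = pickle.load(open("./data/words.pkl", "rb"))
glove_dict = pickle.load(open("./data/glove_300_aug.pkl", "rb"))
assert all(word in glove_dict for word in words)
sample = next(iter(words))
print(sample, glove_dict[sample].shape)  # (300,)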