Example #1
    def __init__(self, args):
        if args.aminer:
            dataset = AminerDataset(args.path)
        else:
            dataset = CustomDataset(args.path)
        self.data = DataReader(dataset, args.min_count, args.care_type)
        dataset = Metapath2vecDataset(self.data, args.window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     collate_fn=dataset.collate)

        self.output_file_name = args.output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.dim
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.initial_lr = args.initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()
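
This constructor appears to be the __init__ of the Metapath2VecTrainer class shown in Example #28 and is driven entirely by an argparse namespace. Below is a minimal, hypothetical driver sketch: the flag names mirror the attributes the constructor reads, but the defaults are assumptions, not values from the original project.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="metapath2vec skip-gram training (sketch)")
    parser.add_argument("--path", required=True, help="dataset directory")
    parser.add_argument("--output_file", default="embeddings.txt")
    parser.add_argument("--aminer", action="store_true", help="use AminerDataset instead of CustomDataset")
    parser.add_argument("--min_count", type=int, default=5)
    parser.add_argument("--care_type", type=int, default=0)
    parser.add_argument("--window_size", type=int, default=5)
    parser.add_argument("--batch_size", type=int, default=50)
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument("--dim", type=int, default=128)
    parser.add_argument("--iterations", type=int, default=5)
    parser.add_argument("--initial_lr", type=float, default=0.025)
    args = parser.parse_args()

    trainer = Metapath2VecTrainer(args)  # class defined in Example #28
    trainer.train()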
Example #2
    def __init__(self,
                 inFile,
                 outFile,
                 prFile=None,
                 emb_dimensions=100,
                 batch_size=512,
                 window_size=5,
                 iterations=50,
                 initial_lr=0.003):

        self.data = DataReader(inFile, txtFile=prFile)
        dataset = Word2VecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = outFile
        self.emb_size = len(self.data.word2id)
        self.batch_size = batch_size
        self.emb_dimensions = emb_dimensions
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size,
                                             self.emb_dimensions)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda:0' if self.use_cuda else 'cpu')

        if self.use_cuda:
            self.skip_gram_model.cuda()
Example #3
def main(args):
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    graph = load(args.dataset)

    model = SkipGramModel(
        graph.num_nodes,
        args.embed_size,
        args.neg_num,
        sparse=not args.use_cuda)
    model = paddle.DataParallel(model)

    optim = Adam(
        learning_rate=args.learning_rate,
        parameters=model.parameters(),
        weight_decay=args.weight_decay)

    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(
        train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.sample_workers,
        collate_fn=collate_fn)

    for epoch in tqdm.tqdm(range(args.epoch)):
        train_loss = train(model, data_loader, optim)
        log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
Example #4
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=300,
                 batch_size=64,
                 window_size=5,
                 iterations=5,
                 initial_lr=1.0,
                 min_count=5):

        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            print("USING CUDA")
            self.skip_gram_model.cuda()
        else:
            print("CUDA not available, running on CPU")
Example #5
def main(args):
    if not args.use_cuda:
        paddle.set_device("cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    graph = load(args.dataset)

    model = SkipGramModel(graph.num_nodes,
                          args.embed_size,
                          args.neg_num,
                          sparse=not args.use_cuda)
    model = paddle.DataParallel(model)

    train_steps = int(graph.num_nodes / args.batch_size) * args.epoch
    scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=train_steps,
        end_lr=0.0001)

    optim = Adam(learning_rate=scheduler, parameters=model.parameters())

    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(train_ds,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.sample_workers,
                             collate_fn=collate_fn)

    for epoch in tqdm.tqdm(range(args.epoch)):
        train_loss = train(model, data_loader, optim)
        log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
    paddle.save(model.state_dict(), "model.pdparams")
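
Example #5 differs from Example #3 mainly in the learning-rate schedule: train_steps is estimated from the node count and a linear PolynomialDecay is attached to Adam. The standalone sketch below (with made-up numbers) only previews how that schedule moves the learning rate; during training, scheduler.step() is advanced once per optimizer step.

import paddle

# Stand-in values; Example #5 derives decay_steps from graph.num_nodes, batch_size and epoch.
scheduler = paddle.optimizer.lr.PolynomialDecay(
    learning_rate=0.005, decay_steps=1000, end_lr=0.0001)

lrs = []
for _ in range(1001):
    lrs.append(scheduler.get_lr())   # LRScheduler interface: get_lr() / step()
    scheduler.step()

print(lrs[0], lrs[500], lrs[1000])   # 0.005 -> ~0.00255 -> 0.0001 (linear, power defaults to 1.0)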
Example #6
    def __init__(self, path="output", time_type="word_sin"):
        # for time_type in os.listdir(path):
        #     if ".DS_Store" in time_type:
        # continue
        self.path = path
        subpath = os.path.join(path, time_type)
        if args.add_phase_shift:
            subpath += "_shift"
        if not os.path.exists(os.path.join(subpath, "vectors.txt")):
            print("cannot find vectors.txt in {}, try to find {}-th iteration".
                  format(subpath, args.iterations))
            subpath = os.path.join(subpath, str(args.iterations - 1))
            if not os.path.exists(subpath):
                print("cannot load model from {}".format(subpath))
                return
        self.embedding_dict = read_embeddings_from_file(
            os.path.join(subpath, "vectors.txt"))
        if args.use_time and "word2vec" not in time_type:
            self.skip_gram_model = TimestampedSkipGramModel(
                len(self.embedding_dict),
                args.emb_dimension,
                time_type=time_type,
                add_phase_shift=args.add_phase_shift)
        else:
            self.skip_gram_model = SkipGramModel(len(self.embedding_dict),
                                                 args.emb_dimension)

        self.id2word = pickle.load(
            open(os.path.join(subpath, "dict.pkl"), "rb"))
        self.skip_gram_model.load_embeddings(self.id2word, subpath)

        if torch.cuda.is_available():
            self.skip_gram_model.cuda()
Example #7
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=100,
                 batch_size=32,
                 window_size=5,
                 iterations=3,
                 initial_lr=0.001,
                 min_count=12):

        print("Reading input file...")
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        print("Creating data batches")
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()
Example #8
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=50,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.025,
                 min_count=5):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
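
None of the examples on this page include the SkipGramModel itself. For orientation, here is a minimal sketch of the usual PyTorch variant these trainers assume: two embedding tables (center and context) trained with negative sampling. The initialization scheme and the sparse=True flag are common choices, not details taken from any of the projects above.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SkipGramModel(nn.Module):
    """Skip-gram with negative sampling: separate center (u) and context (v) embeddings."""

    def __init__(self, emb_size, emb_dimension):
        super().__init__()
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)
        bound = 0.5 / emb_dimension
        nn.init.uniform_(self.u_embeddings.weight, -bound, bound)
        nn.init.zeros_(self.v_embeddings.weight)

    def forward(self, pos_u, pos_v, neg_v):
        # pos_u, pos_v: (B,) center/context ids; neg_v: (B, K) negative context ids
        emb_u = self.u_embeddings(pos_u)                # (B, D)
        emb_v = self.v_embeddings(pos_v)                # (B, D)
        neg_emb_v = self.v_embeddings(neg_v)            # (B, K, D)
        pos_score = torch.sum(emb_u * emb_v, dim=1)     # (B,)
        pos_loss = F.logsigmoid(pos_score)
        neg_score = torch.bmm(neg_emb_v, emb_u.unsqueeze(2)).squeeze(2)  # (B, K)
        neg_loss = F.logsigmoid(-neg_score).sum(dim=1)
        return -(pos_loss + neg_loss).mean()

The (pos_u, pos_v, neg_v) signature matches the calls made by the training loops further down the page (Examples #23, #26, #28, #30).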
Example #9
 def __init__(self, log_filename: str,
              output_filename: str,
              embedding_dimension: int=100,
              batch_size: int=128,
              iteration: int=1,
              initial_lr: float=0.025,
              min_count: int=5,
              sub_sampling_t: float = 1e-5,
              neg_sampling_t: float = 0.75,
              neg_sample_count: int = 5,
              half_window_size: int = 2,
              read_data_method: str='memory'):
     """
     init func
     """
     self.data = DataHanlder(log_filename=log_filename,
                             batch_size=batch_size,
                             min_count=min_count,
                             sub_sampling_t=sub_sampling_t,
                             neg_sampling_t=neg_sampling_t,
                             neg_sample_count=neg_sample_count,
                             half_window_size=half_window_size,
                             read_data_method=read_data_method)
     self.output_filename = output_filename
     self.embedding_dimension = embedding_dimension
     self.batch_size = batch_size
     self.half_window_size = half_window_size
     self.iter = iteration
     self.initial_lr = initial_lr
     self.sg_model = SkipGramModel(len(self.data.vocab), self.embedding_dimension)
     self.use_cuda = torch.cuda.is_available()
     if self.use_cuda:
         self.sg_model.cuda()
     self.optimizer = optim.SGD(self.sg_model.parameters(), lr=self.initial_lr)
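
Example #9 exposes the two classic word2vec sampling knobs directly (sub_sampling_t=1e-5 and neg_sampling_t=0.75). As a reminder of what the data handler presumably does with them, here is a short standalone sketch of frequent-word subsampling (the word2vec C-code form of the keep probability) and of a unigram^0.75 negative-sampling table; the function names are illustrative, not taken from the project.

import numpy as np

def keep_probability(word_count, total_count, t=1e-5):
    """word2vec C-code subsampling: P(keep) = sqrt(t/f) + t/f, where f is the word's frequency."""
    f = word_count / total_count
    return min(1.0, np.sqrt(t / f) + t / f)

def build_negative_table(word_counts, power=0.75, table_size=1_000_000):
    """Draw negatives from the unigram distribution raised to `power` via a precomputed table."""
    probs = np.asarray(word_counts, dtype=np.float64) ** power
    probs /= probs.sum()
    return np.random.choice(len(word_counts), size=table_size, p=probs)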
Example #10
 def __init__(self, ifolder, ofolder, 
              emb_dimension=400,
              batch_size=32,
              iteration=int(sys.argv[3]),
              initial_lr=0.025):
     
     self.ifolder = ifolder
     
     self.outfolder = ofolder+ifolder.rsplit('/',2)[1]+'/'
      try:
          os.makedirs(self.outfolder)
      except FileExistsError:
          print(self.outfolder + " already exists; contents will be overwritten")
     
     self.emb_dimension = emb_dimension
     self.initial_lr = initial_lr
     self.iteration = iteration
     self.batch_size = batch_size
     self.fpos = 0
     self.fneg = 0
     
     self.id2word = dict()
     self.id2pair = dict()
     self.pair2id = dict()
     
     self.read_word_dict(ifolder+"Word2Id")
     self.read_pair_dict(ifolder+"Pair2Id")
     
     self.pair_count = self.evaluate_pair_count()
     self.positive_pairs = np.zeros((self.pair_count, 2), dtype=int)
    
     # Dummy values to ensure size does not change
     self.negative_pairs = np.zeros((self.pair_count, 5), dtype=int)
     
     print(" Size of :", sys.getsizeof(self.positive_pairs))
     print(" Size of :", sys.getsizeof(self.negative_pairs))
     #ipdb.set_trace()
     
     self.emb_size     = len(self.id2word)
     self.pair_emb_size = len(self.id2pair)
     
     
     self.skip_gram_model = SkipGramModel(self.pair_emb_size,self.emb_size, self.emb_dimension)
     self.use_cuda = torch.cuda.is_available()
     
     if self.use_cuda:
         self.skip_gram_model.cuda()
     self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)
     
     print("Start reading pairs")
Example #11
    def __init__(self, wikidump_filename, output_text_filename, emb_dimension,
                 batch_size, window_size, iteration, initial_lr, min_count):

        self.data = InputData(wikidump_filename, min_count,
                              output_text_filename)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
Example #12
    def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=50,
                 window_size=5, iteration=5, initial_lr=0.025, neg_num=5, min_count=5):

        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.neg_num = neg_num
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)
Example #13
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=500,
                 batch_size=32,
                 window_size=7,
                 iteration=3,
                 initial_lr=0.025,
                 min_count=200,
                 pair_min_count=100,
                 k_value=6):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count, pair_min_count,
                              window_size)
        self.output_file_name = output_file_name
        self.pair_emb_size = len(
            self.data.pair2id
        )  # number of pairs kept after pair_min_count filtering
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.k_value = k_value
        self.skip_gram_model = SkipGramModel(self.pair_emb_size, self.emb_size,
                                             self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()

        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
Example #14
    def init_device_emb(self):
        """ set the device before training 
        will be called once in fast_train_mp / fast_train
        """
        choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
        assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"
        choices = sum([self.args.sgd, self.args.adam, self.args.avg_sgd])
        assert choices == 1, "Must choose only *one* gradient descent strategy in [sgd, avg_sgd, adam]"

        # initializing embedding on CPU
        self.emb_model = SkipGramModel(
            emb_size=self.emb_size,
            emb_dimension=self.args.dim,
            walk_length=self.args.walk_length,
            window_size=self.args.window_size,
            batch_size=self.args.batch_size,
            only_cpu=self.args.only_cpu,
            only_gpu=self.args.only_gpu,
            mix=self.args.mix,
            neg_weight=self.args.neg_weight,
            negative=self.args.negative,
            lr=self.args.lr,
            lap_norm=self.args.lap_norm,
            adam=self.args.adam,
            sgd=self.args.sgd,
            avg_sgd=self.args.avg_sgd,
            fast_neg=self.args.fast_neg,
            record_loss=self.args.print_loss,
            norm=self.args.norm,
            use_context_weight=self.args.use_context_weight,
        )

        torch.set_num_threads(self.args.num_threads)
        if self.args.only_gpu:
            print("Run in 1 GPU")
            assert self.args.gpus[0] >= 0
            self.emb_model.all_to_device(self.args.gpus[0])
        elif self.args.mix:
            print("Mix CPU with %d GPU" % len(self.args.gpus))
            if len(self.args.gpus) == 1:
                assert self.args.gpus[0] >= 0, \
                    'mix CPU with GPU requires an available GPU'
                self.emb_model.set_device(self.args.gpus[0])
        else:
            print("Run in CPU process")
            self.args.gpus = [torch.device('cpu')]
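
The three mode flags checked by the assertions above (only_cpu / only_gpu / mix) have to be mutually exclusive. One way to enforce that at the command line, instead of with a runtime assert, is argparse's mutually exclusive group; this is a hypothetical sketch, not the actual CLI of the original project.

import argparse

parser = argparse.ArgumentParser()
mode = parser.add_mutually_exclusive_group(required=True)
mode.add_argument("--only_cpu", action="store_true", help="train on CPU only")
mode.add_argument("--only_gpu", action="store_true", help="train on a single GPU")
mode.add_argument("--mix", action="store_true", help="embeddings on CPU, gradients on GPU")
args = parser.parse_args(["--mix"])  # example invocation
assert sum([args.only_gpu, args.only_cpu, args.mix]) == 1  # always holds by construction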
Example #15
 def __init__(self,
              input_file_name,
              output_file_name):
     self.min_count = 5
     self.emb_dimension = 100
     self.batch_size = 64
     self.window_size = 5
     self.iteration = 1
     self.initial_lr = 0.001
     self.data = InputData(input_file_name, self.min_count)
     self.output_file_name = output_file_name
     self.emb_size = len(self.data.word2id)
     self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension, self.batch_size, self.window_size,
                                          self.iteration, self.initial_lr, self.min_count)
     self.use_cuda = torch.cuda.is_available()
     if self.use_cuda:
         self.skip_gram_model.cuda()
     self.optimizer = optim.SGD(
         self.skip_gram_model.parameters(), lr=self.initial_lr)
Example #16
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=100,
                 window_size=5,
                 iteration=5,
                 initial_lr=0.025,
                 min_count=5,
                 using_hs=False,
                 using_neg=False,
                 context_size=2,
                 hidden_size=128,
                 cbow=None,
                 skip_gram=None):

        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow:
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)
Example #17
def main(args):
    if not args.use_cuda:
        paddle.set_device("cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    if args.edge_file:
        graph = load_from_file(args.edge_file)
    else:
        graph = load(args.dataset)

    edges = np.load("./edges.npy")
    edges = np.concatenate([edges, edges[:, [1, 0]]])
    graph = pgl.Graph(edges)

    model = SkipGramModel(graph.num_nodes,
                          args.embed_size,
                          args.neg_num,
                          sparse=not args.use_cuda)
    model = paddle.DataParallel(model)

    train_ds = ShardedDataset(graph.nodes, repeat=args.epoch)

    train_steps = int(len(train_ds) // args.batch_size)
    log.info("train_steps: %s" % train_steps)
    scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=train_steps,
        end_lr=0.0001)

    optim = Adam(learning_rate=scheduler, parameters=model.parameters())

    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(train_ds,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.sample_workers,
                             collate_fn=collate_fn)

    train_loss = train(model, data_loader, optim)
    paddle.save(model.state_dict(), "model.pdparams")
Example #18
    def __init__(self, output_file_name,
                 walks=[],
                 emb_dimension=100,
                 batch_size=64,
                 window_size=5,
                 epochs=5,
                 negative_num=5):
        print("Load data...")
        self.data = InputData(window_size, batch_size, walks)
        self.output_file_name = output_file_name
        self.emb_dimension = emb_dimension
        self.epochs = epochs
        self.negative_num = negative_num
        self.batch_size = batch_size
        self.vocab_size = self.data.vocab_size
        self.model = SkipGramModel(self.vocab_size, self.emb_dimension)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=1.0)

        if cuda_gpu:
            self.model = self.model.cuda()
Example #19
File: line.py Project: yuk12/dgl
    def init_device_emb(self):
        """ set the device before training 
        will be called once in fast_train_mp / fast_train
        """
        choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
        assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"

        # initializing embedding on CPU
        self.emb_model = SkipGramModel(
            emb_size=self.emb_size,
            emb_dimension=self.args.dim,
            batch_size=self.args.batch_size,
            only_cpu=self.args.only_cpu,
            only_gpu=self.args.only_gpu,
            only_fst=self.args.only_fst,
            only_snd=self.args.only_snd,
            mix=self.args.mix,
            neg_weight=self.args.neg_weight,
            negative=self.args.negative,
            lr=self.args.lr,
            lap_norm=self.args.lap_norm,
            fast_neg=self.args.fast_neg,
            record_loss=self.args.print_loss,
            async_update=self.args.async_update,
            num_threads=self.args.num_threads,
        )

        torch.set_num_threads(self.args.num_threads)
        if self.args.only_gpu:
            print("Run in 1 GPU")
            assert self.args.gpus[0] >= 0
            self.emb_model.all_to_device(self.args.gpus[0])
        elif self.args.mix:
            print("Mix CPU with %d GPU" % len(self.args.gpus))
            if len(self.args.gpus) == 1:
                assert self.args.gpus[0] >= 0, \
                    'mix CPU with GPU requires an available GPU'
                self.emb_model.set_device(self.args.gpus[0])
        else:
            print("Run in CPU process")
Example #20
 def init_device_emb(self):
     """ set the device before training 
     will be called once in fast_train_mp / fast_train
     """
     choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
     assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"
      assert self.args.num_procs >= 1, "The number of processes must be at least 1"
     choices = sum([self.args.sgd, self.args.adam, self.args.avg_sgd])
     assert choices == 1, "Must choose only *one* gradient descent strategy in [sgd, avg_sgd, adam]"
     
     # initializing embedding on CPU
     self.emb_model = SkipGramModel(
         emb_size=self.emb_size, 
         emb_dimension=self.args.dim,
         walk_length=self.args.walk_length,
         window_size=self.args.window_size,
         batch_size=self.args.batch_size,
         only_cpu=self.args.only_cpu,
         only_gpu=self.args.only_gpu,
         mix=self.args.mix,
         neg_weight=self.args.neg_weight,
         negative=self.args.negative,
         lr=self.args.lr,
         lap_norm=self.args.lap_norm,
         adam=self.args.adam,
         sgd=self.args.sgd,
         avg_sgd=self.args.avg_sgd,
         fast_neg=self.args.fast_neg,
         )
     
     torch.set_num_threads(self.args.num_threads)
     if self.args.only_gpu:
         print("Run in 1 GPU")
         self.emb_model.all_to_device(0)
     elif self.args.mix:
         print("Mix CPU with %d GPU" % self.args.num_procs)
         if self.args.num_procs == 1:
             self.emb_model.set_device(0)
     else:
         print("Run in %d CPU process" % self.args.num_procs)
Example #21
    def __init__(self, args, graph):
        print("\nPerforming Node2vec...\n")
        # 1. generate walker
        walker = DeepWalker(args, graph)
        print("\nDoing deepwalks...\n")
        walker.create_features()

        self.inputFileName = "{}{}-deepwalk_{}-num_walks_{}-len_metapath.txt".format(
            args.input_path, args.idx_metapath, args.number_of_walks,
            args.walk_length)

        # 2. read data
        self.data = DataReader(args.min_count, args.care_type,
                               self.inputFileName)

        # 3. make dataset for training
        dataset = DatasetLoader(self.data, args.window_size)

        # 4. initialize dataloader
        self.dataloader = DataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     collate_fn=dataset.collate)

        self.output_file_name = "{}{}-embedding_{}-deepwalk_{}-dim_{}-initial_lr_{}-window_size_{}-iterations_{}-min_count.pickle".format(
            args.output_path, args.idx_embed, args.idx_metapath, args.dim,
            args.initial_lr, args.window_size, args.iterations, args.min_count)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.dim
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.initial_lr = args.initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()
Example #22
    def __init__(self, file, min_count, window_size, batch_size, output_file,
                 dim, iterations, initial_lr):
        self.data = DataReader(file, min_count)
        dataset = Metapath2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     num_workers=4,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = dim
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr  #learning rate
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()
Example #23
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=50,
                 window_size=5, iteration=5, initial_lr=0.025, neg_num=5, min_count=5):

        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.neg_num = neg_num
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):

        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        count = int(batch_count) // 3
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)

            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, self.neg_num)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u)).cuda()
            pos_v = Variable(torch.LongTensor(pos_v)).cuda()
            neg_v = Variable(torch.LongTensor(neg_v)).cuda()
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.item(),
                                         self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            if i != 0 and i % count == 0:
                self.skip_gram_model.save_embedding(self.data.id2word,self.output_file_name + str(i))
        self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name + 'final')
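
Several of these training loops re-implement word2vec's linear learning-rate decay by hand (lr = initial_lr * (1 - i / batch_count), refreshed roughly every 100k pairs). The same schedule can also be expressed with torch.optim.lr_scheduler.LambdaLR; the sketch below is an alternative formulation under that assumption, not what the snippet above actually uses, and it adds a small floor so the rate never reaches exactly zero.

import torch
from torch import optim

initial_lr, batch_count = 0.025, 10_000
model = torch.nn.Embedding(1000, 100)        # stand-in for SkipGramModel
optimizer = optim.SGD(model.parameters(), lr=initial_lr)
# LambdaLR multiplies initial_lr by the factor returned for the current step.
scheduler = optim.lr_scheduler.LambdaLR(
    optimizer, lr_lambda=lambda step: max(1e-4, 1.0 - step / batch_count))

for step in range(batch_count):
    # forward pass, loss.backward() and optimizer.step() would go here
    scheduler.step()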
Example #24
    def __init__(
        self,
        input_path,
        output_dir,
        wordsim_path,
        dimension=100,
        batch_size=batch_size,
        window_size=5,
        epoch_count=1,
        initial_lr=1e-6,
        min_count=5,
    ):
        self.data = InputData(input_path, min_count)
        self.output_dir = output_dir
        self.vocabulary_size = len(self.data.id_from_word)
        self.dimension = dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.epoch_count = epoch_count
        self.initial_lr = initial_lr
        self.model = SkipGramModel(self.vocabulary_size, self.dimension)
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.model = nn.DataParallel(self.model.to(self.device))
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.initial_lr)

        if wordsim_path:
            self.wordsim_verification_tuples = []
            with open(wordsim_path, 'r') as f:
                f.readline()  # Abandon header
                for line in f:
                    word1, word2, actual_similarity = line.split(',')
                    self.wordsim_verification_tuples.append(
                        (word1, word2, float(actual_similarity))
                    )
        else:
            self.wordsim_verification_tuples = None
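
Example #24 loads (word1, word2, similarity) tuples from a word-similarity CSV, but the evaluation itself is not shown. A plausible sketch follows, assuming embeddings are scored by cosine similarity and compared against the human ratings with Spearman correlation; the function and argument names are illustrative, not taken from the project.

import numpy as np
from scipy.stats import spearmanr

def evaluate_wordsim(embeddings, id_from_word, tuples):
    """Spearman correlation between cosine similarities and human ratings (sketch)."""
    predicted, actual = [], []
    for word1, word2, rating in tuples:
        if word1 not in id_from_word or word2 not in id_from_word:
            continue  # skip out-of-vocabulary pairs
        v1 = embeddings[id_from_word[word1]]
        v2 = embeddings[id_from_word[word2]]
        cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2) + 1e-8)
        predicted.append(cosine)
        actual.append(rating)
    return spearmanr(predicted, actual).correlation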
Example #25
File: train.py Project: WenjinW/PGL
def main(config, ip_list_file):
    ds = TrainPairDataset(config, ip_list_file)
    loader = Dataloader(
        ds,
        batch_size=config.batch_pair_size,
        num_workers=config.num_workers,
        stream_shuffle_size=config.pair_stream_shuffle_size,
        collate_fn=CollateFn())

    model = SkipGramModel(config)

    if config.warm_start_from:
        log.info("warm start from %s" % config.warm_start_from)
        model.set_state_dict(paddle.load(config.warm_start_from))

    optim = Adam(
        learning_rate=config.lr,
        parameters=model.parameters(),
        lazy_mode=config.lazy_mode)

    log.info("starting training...")
    train(config, model, loader, optim)
Example #26
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name):
        self.min_count = 5
        self.emb_dimension = 100
        self.batch_size = 64
        self.window_size = 5
        self.iteration = 1
        self.initial_lr = 0.001
        self.data = InputData(input_file_name, self.min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension, self.batch_size, self.window_size,
                                             self.iteration, self.initial_lr, self.min_count)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(
            self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):
        """Multiple training.
        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.data,
                                         self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(
            self.data.id2word, self.output_file_name, self.use_cuda)
Example #27
def StaticSkipGramModel(num_nodes,
                        neg_num,
                        embed_size,
                        num_emb_part=8,
                        shared_embedding=False):
    src = F.data("src", shape=[-1, 1], dtype="int64")
    dsts = F.data("dsts", shape=[-1, neg_num + 1], dtype="int64")
    model = SkipGramModel(num_nodes,
                          embed_size,
                          neg_num,
                          num_emb_part,
                          shared_embedding=shared_embedding)
    loss = model(src, dsts)
    return loss
Example #28
class Metapath2VecTrainer:
    def __init__(self, args):
        if args.aminer:
            dataset = AminerDataset(args.path)
        else:
            dataset = CustomDataset(args.path)
        self.data = DataReader(dataset, args.min_count, args.care_type)
        dataset = Metapath2vecDataset(self.data, args.window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     collate_fn=dataset.collate)

        self.output_file_name = args.output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = args.dim
        self.batch_size = args.batch_size
        self.iterations = args.iterations
        self.initial_lr = args.initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):

        for iteration in range(self.iterations):
            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):

                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 500 == 0:
                        print(" Loss: " + str(running_loss))

            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name)
Example #29
def StaticSkipGramModel(num_nodes,
                        neg_num,
                        embed_size,
                        sparse=False,
                        sparse_embedding=False):
    src = F.data("src", shape=[-1, 1], dtype="int64")
    dsts = F.data("dsts", shape=[-1, neg_num + 1], dtype="int64")
    py_reader = paddle.fluid.io.DataLoader.from_generator(
        capacity=64,
        feed_list=[src, dsts],
        iterable=False,
        use_double_buffer=False)
    model = SkipGramModel(num_nodes, embed_size, neg_num, sparse=sparse,
                          sparse_embedding=sparse_embedding)
    loss = model(src, dsts)
    return py_reader, loss
Example #30
def train(args):

    data = InputData(args.input, args.min_count, args.sample)
    output_file_name = args.output
    emb_size = len(data.word2id)
    emb_dimension = args.dim
    batch_size = args.mb
    window_size = args.window
    n_negs = args.n_negs
    iteration = args.iters
    initial_lr = args.lr
    use_cuda = args.cuda

    skip_gram_model = SkipGramModel(emb_size, emb_dimension)
    if use_cuda: skip_gram_model = skip_gram_model.cuda()

    optimizer = optim.SGD(skip_gram_model.parameters(), lr=initial_lr)

    pair_count = data.evaluate_pair_count(window_size)
    batch_count = iteration * pair_count / batch_size
    process_bar = tqdm(range(int(batch_count)))

    # skip_gram_model.save_embedding(
    #     data.id2word, 'begin_embedding.txt', use_cuda)

    for i in process_bar:
        pos_pairs = data.get_batch_pairs(batch_size, window_size)
        neg_v = data.get_neg_v_neg_sampling(pos_pairs, n_negs)
        pos_u = [pair[0] for pair in pos_pairs]
        pos_v = [pair[1] for pair in pos_pairs]

        pos_u = torch.LongTensor(pos_u)
        pos_v = torch.LongTensor(pos_v)
        neg_v = torch.LongTensor(neg_v)
        if use_cuda:
            pos_u = pos_u.cuda()
            pos_v = pos_v.cuda()
            neg_v = neg_v.cuda()

        optimizer.zero_grad()
        loss = skip_gram_model(pos_u, pos_v, neg_v)
        loss.backward()
        optimizer.step()

        process_bar.set_description(
            "\rLoss: %0.8f, lr: %0.6f" %
            (loss.item(), optimizer.param_groups[0]['lr']))

        if i * batch_size % 100000 == 0:
            lr = initial_lr * (1.0 - 1.0 * i / batch_count)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

    skip_gram_model.save_embedding(data.id2word, output_file_name, use_cuda)
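
Example #30's train() reads everything from an args namespace. A hypothetical command-line wrapper is sketched below; the flag names follow the attributes accessed above (input, output, dim, mb, window, n_negs, iters, lr, cuda, min_count, sample), while the defaults and help strings are assumptions.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="word2vec skip-gram negative-sampling training (sketch)")
    parser.add_argument("--input", required=True, help="training corpus, one sentence per line")
    parser.add_argument("--output", default="embeddings.txt")
    parser.add_argument("--dim", type=int, default=100)
    parser.add_argument("--mb", type=int, default=50, help="minibatch size")
    parser.add_argument("--window", type=int, default=5)
    parser.add_argument("--n_negs", type=int, default=5)
    parser.add_argument("--iters", type=int, default=5)
    parser.add_argument("--lr", type=float, default=0.025)
    parser.add_argument("--min_count", type=int, default=5)
    parser.add_argument("--sample", type=float, default=1e-5, help="subsampling threshold")
    parser.add_argument("--cuda", action="store_true")
    train(parser.parse_args())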