def __init__(self, sentences=None, size=300, alpha=0.025, window=8, min_count=5,
             sample=0, seed=1, workers=1, min_alpha=0.0001, dm=1, hs=1, negative=0,
             dm_mean=0, train_words=True, train_lbls=True, **kwargs):
    """
    Initialize the model from an iterable of `sentences`. Each sentence is a
    LabeledSentence object that will be used for training.

    The `sentences` iterable can be simply a list of LabeledSentence elements, but for
    larger corpora, consider an iterable that streams the sentences directly from disk/network.

    If you don't supply `sentences`, the model is left uninitialized -- use this if
    you plan to initialize it in some other way.

    `dm` defines the training algorithm. By default (`dm=1`), distributed memory (PV-DM) is used;
    otherwise, distributed bag of words (PV-DBOW) is employed.

    `size` is the dimensionality of the feature vectors.

    `window` is the maximum distance between the current and predicted word within a sentence.

    `alpha` is the initial learning rate (it will drop linearly towards `min_alpha` as training progresses).

    `seed` = seed for the random number generator.

    `min_count` = ignore all words with total frequency lower than this.

    `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
            default is 0 (off), a useful value is 1e-5.

    `workers` = use this many worker threads to train the model (faster training on multicore machines).

    `hs` = if 1 (default), hierarchical softmax will be used for model training (else set to 0).

    `negative` = if > 0, negative sampling will be used; the value specifies how many
            "noise words" should be drawn (usually between 5 and 20).

    `dm_mean` = if 0 (default), use the sum of the context word vectors; if 1, use the mean.
            Only applies when `dm` is used.

    """
    Word2Vec.__init__(self, size=size, alpha=alpha, window=window, min_count=min_count,
                      sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
                      # dm=1 -> sg=0 (PV-DM, CBOW-style); dm=0 -> sg=1 (PV-DBOW, skip-gram-style)
                      sg=(1 + dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean, **kwargs)
    self.train_words = train_words
    self.train_lbls = train_lbls
    if sentences is not None:
        self.build_vocab(sentences)
        self.train(sentences)
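
For reference, a minimal usage sketch of this constructor. It assumes the snippet is the older gensim Doc2Vec initializer (the LabeledSentence reference and the Word2Vec superclass suggest this) and that LabeledSentence is importable from gensim.models.doc2vec; the tiny corpus and the SENT_* labels below are made up for illustration.

from gensim.models.doc2vec import Doc2Vec, LabeledSentence

# Two toy labeled sentences; min_count=1 keeps the tiny vocabulary from being pruned.
corpus = [
    LabeledSentence(words=['the', 'quick', 'brown', 'fox'], labels=['SENT_0']),
    LabeledSentence(words=['jumps', 'over', 'the', 'lazy', 'dog'], labels=['SENT_1']),
]

# dm=1 (default) selects distributed memory; hs=1 (default) selects hierarchical softmax.
model = Doc2Vec(corpus, size=100, window=8, min_count=1, workers=1)

# Labels share the vocabulary with words, so their vectors can be looked up directly.
sentence_vector = model['SENT_0']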
Example No. 3
    def __init__(self, size=50, alpha=0.1, min_count=1, seed=1, workers=1, iter=1, use_gold=0, train_path=None,
                 test_raw_path=None, test_path=None, dev_path=None, quick_test=None, dict_path=None,
                 score_script_path=None, pre_train=False, uni_path=None, bi_path=None, hybrid_pred=False,
                 no_action_feature=False, no_bigram_feature=False, no_unigram_feature=False,
                 no_binary_action_feature=False, no_sb_state_feature=False, **kwargs):

        print '\n\n### Initialization of the segmentation model ###'

        self.no_action_feature = no_action_feature
        self.no_bigram_feature = no_bigram_feature
        self.no_unigram_feature = no_unigram_feature
        self.no_binary_action_feature = no_binary_action_feature
        self.no_sb_feature = no_sb_state_feature

        self.pre_train = pre_train
        self.l2_rate = 0.001  # rate for L2 regularization

        if self.l2_rate:
            print 'reg with L2, with param=', self.l2_rate
        self.drop_out = False

        self.finger_int = str(r_randint(0, 1000000))
        self.binary_pred = False
        self.hybrid_pred = hybrid_pred

        self.use_gold = use_gold
        # self.model = None
        self.START = "#S#"
        self.END = "#E#"

        self.label0_as_vocab, self.label1_as_vocab, self.unknown_as_vocab = "$LABEL0", "$LABEL1", "$OOV"

        self.su_prefix, self.sb_prefix = '$SU', '$SB'  # prefix for unigram/bigram state; no prefix for *char* unigram/bigrams
        self.state_varient = ('0', '1')

        self.train_path = train_path
        self.test_raw_path = test_raw_path
        self.test_path = test_path
        self.dev_path = dev_path
        self.quick_test = quick_test
        self.dict_path = dict_path
        self.score_script = score_script_path

        # self.score_script = '../working_data/score'
        # self.dict_path ='../working_data/pku.dict'

        print '\nloading train, test, dev corpus...'

        self.train_corpus = [l.split() for l in codecs.open(self.train_path, 'rU', 'utf-8')]
        self.test_corpus = [l.split() for l in codecs.open(self.test_raw_path, 'rU', 'utf-8')]
        self.dev_corpus = [l.split() for l in codecs.open(self.dev_path, 'rU', 'utf-8')]
        self.quick_test_corpus = [l.split() for l in codecs.open(self.quick_test, 'rU', 'utf-8')]
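        # Note: all four corpora (train, test_raw, dev, quick_test) are read here
        # unconditionally, so none of the corresponding path arguments may be left
        # at their default of None.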

        Word2Vec.__init__(self, sentences=None, size=size, alpha=alpha, min_count=min_count, seed=seed, workers=workers,
                          iter=iter, **kwargs)

        self.mask = [1] * 12  # one flag per feature slot; trimmed below as feature groups are disabled

        if self.no_action_feature:
            self.mask = self.mask[:-3]
            print 'len mask', len(self.mask)
        elif self.no_sb_feature:
            self.mask = self.mask[:-1]
            print 'len mask', len(self.mask)

        if self.no_unigram_feature:
            self.mask = self.mask[:-5]
            print 'len mask', len(self.mask)

        if self.no_bigram_feature:
            self.mask = self.mask[:-4]
            print 'len mask', len(self.mask)
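
        # The trims above are cumulative (each slice shortens the already-trimmed mask),
        # and because of the elif, no_sb_feature only takes effect when no_action_feature
        # is not set.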

        self.f_factor = sum(self.mask)
        self.f_factor2 = 2

        if self.no_binary_action_feature:
            self.f_factor2 = 0
            print 'f-factor2=', self.f_factor2

        self.non_fixed_param = self.f_factor * self.layer1_size
        self.pred_size = self.non_fixed_param + self.f_factor2
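
        # For example, with the default size=50 (hence layer1_size=50), the full 12-slot
        # mask and the binary action feature enabled: non_fixed_param = 12 * 50 = 600 and
        # pred_size = 600 + 2 = 602.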

        if self.drop_out:
            self.dropout_rate = 0.5
            self.dropout_size = int(self.dropout_rate * self.non_fixed_param)
            print 'using drop_out, rate/size=', self.dropout_rate, self.dropout_size

        self.train_mode = False

        self.dev_test_result = []

        print '\nLearning rate=', self.alpha, '; Feature (layer1) size=', self.layer1_size, \
            '; Predicate vec size=', self.pred_size, 'f-factor=', self.f_factor, 'f-factor2=', self.f_factor2

        if self.pre_train:
            print '\nloading pre-trained char and char-bigram embeddings'
            self.uni_emb = Word2Vec.load(uni_path)
            emb_normalization(self.uni_emb)
            print 'unigram embedding loaded'
            self.bi_emb = Word2Vec.load(bi_path)
            emb_normalization(self.bi_emb)
            print 'bigram embedding loaded'
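
A construction sketch for this segmentation model, for orientation only: the excerpt shows just __init__, so the class name SegModel and every path below are hypothetical placeholders. The constructor opens the train, test_raw, dev, and quick_test files immediately, so those arguments must point at real whitespace-tokenised UTF-8 text files.

# Hypothetical class name and paths -- adapt to the real class and data layout.
model = SegModel(size=50,
                 alpha=0.1,
                 min_count=1,
                 train_path='data/train.utf8',
                 test_raw_path='data/test_raw.utf8',
                 test_path='data/test.utf8',
                 dev_path='data/dev.utf8',
                 quick_test='data/quick.utf8',
                 dict_path='data/pku.dict',
                 score_script_path='working_data/score',
                 pre_train=False)  # set pre_train=True with uni_path/bi_path to load pre-trained embeddings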