Ejemplo n.º 1
0
    def __init__(
            self,
            keywords,
            sentences,
            corpus_file,
            size,
            alpha,
            word_ngrams,
            min_n,
            max_n,
            bucket,
            corpus_worker,
            corpus_chunksize,
            case_sensitive,
            window=5,
            min_count=5,
            max_vocab_size=None,
            sample=0.001,
            seed=1,
            workers=3,
            min_alpha=0.0001,
            sg=0,
            hs=0,
            negative=5,
            ns_exponent=0.75,
            cbow_mean=1,
            iter=5,
            null_word=0,
            trim_rule=None,
            sorted_vocab=1,
            batch_words=10000):
        """Initialise the three base classes and the per-keyword state.

        The bases are initialised explicitly, in this order:
        ``Sec2Vec``, ``KeywordCorpusFactory``, then ``FastText``.
        Between the second and third step the keyword corpus is built and
        the per-keyword containers are created.

        Args:
            keywords: Passed to ``KeywordCorpusFactory.__init__``.
            sentences: Passed to ``Sec2Vec.__init__``. Note that it is NOT
                forwarded to ``FastText.__init__``.
            corpus_file: Passed to both ``Sec2Vec.__init__`` and
                ``FastText.__init__``.
            corpus_worker: Passed to ``KeywordCorpusFactory.__init__``.
            corpus_chunksize: Passed to ``self.create`` and stored as
                ``self.corpus_chunksize``.
            case_sensitive: Passed to ``KeywordCorpusFactory.__init__``.
            size, alpha, word_ngrams, min_n, max_n, bucket, window,
            min_count, max_vocab_size, sample, seed, workers, min_alpha,
            sg, hs, negative, ns_exponent, cbow_mean, iter, null_word,
            trim_rule, sorted_vocab, batch_words: Forwarded unchanged, by
                keyword, to ``FastText.__init__``.

        Attributes set here:
            kc: Result of ``self.create(...)`` over the stored sentences.
            kv: Dict mapping every key of ``kc`` to its own empty list.
            keyword_count: Dict mapping every key of ``kc`` to ``0``.
            corpus_chunksize: The ``corpus_chunksize`` argument.
        """
        # Base initialisers are called explicitly, one by one.
        # NOTE(review): ``self.sentences`` used below is presumably set by
        # Sec2Vec.__init__ -- confirm against that class.
        Sec2Vec.__init__(self, sentences, corpus_file)
        KeywordCorpusFactory.__init__(
            self, keywords, case_sensitive, corpus_worker)

        # Build the keyword corpus from the sentences stored on the
        # instance, wrapped in a SentenceIterator.
        sentence_stream = SentenceIterator(self.sentences)
        self.kc = self.create(sentence_stream, corpus_chunksize)

        # One independent list per keyword; a comprehension is required so
        # the lists are not shared between keys.
        self.kv = {keyword: [] for keyword in self.kc.keys()}

        # Every keyword starts with a count of zero.
        self.keyword_count = dict.fromkeys(self.kc.keys(), 0)
        self.corpus_chunksize = corpus_chunksize

        # Options handed verbatim to FastText; ``callbacks`` is always a
        # fresh empty list and ``sentences`` is intentionally absent, as in
        # the original call.
        fasttext_options = dict(
            corpus_file=corpus_file,
            size=size,
            alpha=alpha,
            min_alpha=min_alpha,
            word_ngrams=word_ngrams,
            min_n=min_n,
            max_n=max_n,
            bucket=bucket,
            window=window,
            min_count=min_count,
            max_vocab_size=max_vocab_size,
            sample=sample,
            seed=seed,
            workers=workers,
            sg=sg,
            hs=hs,
            negative=negative,
            ns_exponent=ns_exponent,
            cbow_mean=cbow_mean,
            iter=iter,
            null_word=null_word,
            trim_rule=trim_rule,
            sorted_vocab=sorted_vocab,
            batch_words=batch_words,
            callbacks=[],
        )
        FastText.__init__(self, **fasttext_options)