def main():
    """Load the training tweets, attach the saved SSWE embeddings and
    convert the sentences into feature vectors.

    Returns 0 on success.
    """
    # Input/output locations for the SSWE model artifacts.
    train_set = '/root/Text_Mining_Project/src/data/train.tsv'  # training corpus (TSV)
    model = 'sswe_model'                                        # model save file
    sswe_vocabs = '/root/Text_Mining_Project/src/words.txt'     # vocabulary file
    sswe_vectors = '/root/Text_Mining_Project/src/vectors.txt'  # embedding vectors file

    # Read the tweets: field 2 holds the text, field 1 the label;
    # extract n-grams up to bigrams.
    reader = TweetReader(text_field=2, label_field=1, ngrams=2)
    reader.read(train_set)

    # Load the previously saved vocabulary.
    base_vocab = reader.load_vocabulary(sswe_vocabs)

    # Load the pre-trained embeddings aligned with that vocabulary.
    embeddings = Embeddings(vocab_file=sswe_vocabs,
                            vectors=sswe_vectors,
                            vocab=base_vocab,
                            variant=None)

    # A converter turns token sequences into feature-vector sequences.
    converter = Converter()
    converter.add(embeddings)

    print(reader.sentences)

    # Lazily convert every tweet to its feature representation,
    # caching results so repeated iteration does not re-convert.
    converted_tweets = converter.generator(reader.sentences, cache=True)

    return 0
def sswe_trainer(model_parameters):
    """Train sentiment-specific word embeddings (SSWE).

    :param model_parameters: argparse-style namespace with the training
        options (train/vocab/vectors paths, textField, tagField, ngrams,
        vocab_size, minOccurr, embeddings_size, variant, iterations,
        verbose, config_file, model).
    """
    # Set the seed for replicability.
    np.random.seed(42)
    args = model_parameters

    # Honour the verbose flag when configuring logging.
    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # Read the corpus and build the n-gram vocabulary from it.
    reader = TweetReader(text_field=args.textField,
                         label_field=args.tagField,
                         ngrams=args.ngrams)
    reader.read(args.train)
    vocab, bigrams, trigrams = reader.create_vocabulary(
        reader.sentences, args.vocab_size, min_occurrences=args.minOccurr)

    if args.variant == 'word2vec' and os.path.exists(args.vectors):
        # Pre-trained word2vec vectors: merge the corpus vocab into them.
        embeddings = Embeddings(vectors=args.vectors, variant=args.variant)
        embeddings.merge(vocab)
        logger.info("Saving vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)
    elif os.path.exists(args.vocab):
        # Start from the given vocabulary, stripping the trailing
        # bigram/trigram entries to recover the base (unigram) part.
        b_vocab = reader.load_vocabulary(args.vocab)
        bound = len(b_vocab) - len(bigrams) - len(trigrams)
        base_vocab = b_vocab[:bound]
        if os.path.exists(args.vectors):
            # Load existing embeddings for the base vocabulary.
            embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab,
                                    variant=args.variant)
        else:
            # Create fresh embeddings of the requested size.
            embeddings = Embeddings(args.embeddings_size, vocab=base_vocab,
                                    variant=args.variant)
        # Add the n-grams seen in the corpus.
        embeddings.merge(vocab)
        logger.info("Overriding vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)
    else:
        embeddings = Embeddings(args.embeddings_size, vocab=vocab,
                                variant=args.variant)
        logger.info("Saving vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)

    # Build a trie of n-gram token ids.  Assume bigrams are prefixes of
    # trigrams, or else we should put a terminator on the trie.
    trie = {}
    for ngram in chain(bigrams, trigrams):
        node = trie
        for w in ngram:
            node = node.setdefault(embeddings.dict[w], {})

    converter = Converter()
    converter.add(embeddings)
    trainer = create_trainer(args, converter)

    # Report progress roughly 200 times over the run (at least once per
    # iteration).  Floor division keeps this an int on Python 3 as well.
    report_intervals = max(args.iterations // 200, 1)

    logger.info("Starting training")
    # A cached generator: converted sentences can be iterated several
    # times without repeating the conversion work.
    converted_sentences = converter.generator(reader.sentences, cache=True)
    trainer.train(converted_sentences, reader.polarities, trie,
                  args.iterations, report_intervals)

    logger.info("Overriding vectors to %s" % args.vectors)
    embeddings.save_vectors(args.vectors, args.variant)
    if args.model:
        logger.info("Saving trained model to %s" % args.model)
        trainer.save(args.model)
action='store_true') args = parser.parse_args() log_format = '%(message)s' log_level = logging.DEBUG if args.verbose else logging.INFO logging.basicConfig(format=log_format, level=log_level) logger = logging.getLogger("Logger") config = ConfigParser() if args.config_file: config.read(args.config_file) # merge args with config reader = TweetReader(text_field=args.textField, label_field=args.tagField, ngrams=args.ngrams) reader.read(args.train) vocab, bigrams, trigrams = reader.create_vocabulary(reader.sentences, args.vocab_size, min_occurrences=args.minOccurr) if args.variant == 'word2vec' and os.path.exists(args.vectors): embeddings = Embeddings(vectors=args.vectors, variant=args.variant) embeddings.merge(vocab) logger.info("Saving vocabulary in %s" % args.vocab) embeddings.save_vocabulary(args.vocab) elif os.path.exists(args.vocab): # start with the given vocabulary base_vocab = reader.load_vocabulary(args.vocab) if os.path.exists(args.vectors): # load embeddings embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab,
action='store_true') args = parser.parse_args() log_format = '%(message)s' log_level = logging.DEBUG if args.verbose else logging.INFO logging.basicConfig(format=log_format, level=log_level) logger = logging.getLogger("Logger") config = ConfigParser() if args.config_file: config.read(args.config_file) # merge args with config reader = TweetReader(args.ngrams) reader.read(args.train) loaded_vocab = False if args.vocab and os.path.exists(args.vocab): loaded_vocab = True vocab = reader.load_vocabulary(args.vocab) else: vocab = reader.create_vocabulary(reader.sentences) tokens = [] for l in vocab: tokens.extend(l) # flatten ngrams dictionaries embeddings = Embeddings(args.embeddings_size, vocab=tokens, variant=args.variant) converter = Converter() converter.add_extractor(embeddings)
action='store_true') args = parser.parse_args() log_format = '%(message)s' log_level = logging.DEBUG if args.verbose else logging.INFO logging.basicConfig(format=log_format, level=log_level) logger = logging.getLogger("Logger") config = ConfigParser() if args.config_file: config.read(args.config_file) # merge args with config reader = TweetReader(args.ngrams) reader.read(args.train) vocab, bigrams, trigrams = reader.create_vocabulary(reader.sentences, min_occurrences=2) if os.path.exists(args.vocab): # start with the given vocabulary base_vocab = reader.load_vocabulary(args.vocab) if os.path.exists(args.vectors): embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab, variant=args.variant) else: embeddings = Embeddings(args.embeddings_size, vocab=base_vocab, variant=args.variant) # add the ngrams from the corpus