def gen_data(self):
    seq_batch = []
    rand = random.randint(0, len(self.wordlists) - 1)
    seq = []
    # Drop empty tokens before vectorizing.
    while '' in self.wordlists[rand]:
        self.wordlists[rand].remove('')
    for word in self.wordlists[rand]:
        vec = W2V().str_to_vector(self.w2v_model, word)
        seq.append(vec)
        print(len(vec))
    seq_batch.append(seq)
    seq_batch = np.array(seq_batch)
    print("train shape:", seq_batch.shape)
    return seq_batch
def dump_clusters():
    args = get_args()
    if args['-train'] == '':
        args['-train'] = 'src/resources/output' + args['-k']
    w2vobj = W2V(args['-input'], args['-train'], args['-k'])
    news = News()
    articles = news.get_articles()
    w2vobj.train()

    # Sentence vectorization by averaging
    article_vecs = [w2vobj.get_sentence_vector_avg(article['cleaned_title'])
                    for article in articles]

    # Sentence vectorization by the "newtonian" method
    '''article_vecs = []
    for article in articles:
        newtonian_vec = w2vobj.get_sentence_vector_newtonian(article['cleaned_title'])
        if newtonian_vec is not None:
            article_vecs.append(newtonian_vec)'''

    cluster_obj = Clustering(article_vecs, w2vobj)
    r_conn = redis.from_url(os.getenv('REDIS_URL', "redis://localhost:6379/"))

    if args['-cluster'] == 'agg':
        prune = args['-prune'] in ('true', 'True')
        utilities.redis_kmeans_clusters(cluster_obj, articles, prune,
                                        int(args['-limit']), r_conn)
        print("redis dump complete")
    else:
        # TODO: dump to redis
        utilities.print_ann_clusters(cluster_obj, articles)
def get_random_seq2(self):
    # Pick two consecutive word lists at random.
    rand = random.randint(0, len(self.wordlists) - 2)
    seq_vec1 = []
    seq_vec2 = []
    seq1 = self.wordlists[rand]
    seq2 = self.wordlists[rand + 1]
    while '' in seq1:
        seq1.remove('')
    while '' in seq2:
        seq2.remove('')
    for word in seq1:
        seq_vec1.append(W2V().str_to_vector(self.w2v_model, word))
    for word in seq2:
        seq_vec2.append(W2V().str_to_vector(self.w2v_model, word))
    return seq1, seq_vec1, seq2, seq_vec2
def get_random_seq(self):
    rand = random.randint(0, len(self.wordlists) - 1)
    seq_vec = []
    seq = self.wordlists[rand]
    while '' in seq:
        seq.remove('')
    for word in seq:
        seq_vec.append(W2V().str_to_vector(self.w2v_model, word))
    return seq, seq_vec
def make_data(X_pure_train, X_sentences_train, aspects_list_train,
              X_pure_test, X_sentences_test):
    sent_word = sentiRuLex()
    w2v = W2V()
    #w2v = None
    list_of_tfidf_train = compute_tfidf(X_sentences_train)
    list_of_tfidf_test = compute_tfidf(X_sentences_test)
    y_train = make_y_train(X_pure_train, aspects_list_train)
    # Domain keywords: 'ресторан' (restaurant) for training, 'автомобиль' (car) for testing.
    x_train = make_x(X_pure_train, list_of_tfidf_train, sent_word, w2v, 'ресторан')
    x_test = make_x(X_pure_test, list_of_tfidf_test, sent_word, w2v, 'автомобиль')
    return x_train, y_train, x_test
class W2V:
    def __init__(self):
        self.initial = None

    def train_word2vec_stream(self, sentence_stream, num_epochs,
                              num_features=10, min_word_count=5, context=10):
        model_dir = '/var/lib/arhuaco/data/models'
        model_name = "{:d}features_{:d}minwords_{:d}context"\
            .format(num_features, min_word_count, context)
        model_name = join(model_dir, model_name)
        if exists(model_name):
            embedding_model = word2vec.Word2Vec.load(model_name)
            logging.info("Loading existing Word2Vec model '%s'"
                         % split(model_name)[-1])
        else:
            # Set values for various parameters
            num_workers = 2      # Number of threads to run in parallel
            downsampling = 1e-3  # Downsample setting for frequent words

            # Initialize the model on the first batch from the stream.
            print('Initializing Word2Vec model...')
            sentences = next(sentence_stream)
            embedding_model = word2vec.Word2Vec(sentences,
                                                workers=num_workers,
                                                size=num_features,
                                                min_count=min_word_count,
                                                window=context,
                                                sample=downsampling)
            # Continue training incrementally, one streamed batch per epoch.
            for batch in range(num_epochs):
                sentences = next(sentence_stream)
                embedding_model.build_vocab(sentences,
                                            keep_raw_vocab=True,
                                            update=True)
                embedding_model.train(sentences,
                                      total_examples=embedding_model.corpus_count,
                                      epochs=embedding_model.epochs)
                logging.info("Finished epoch: %d" % batch)
                logging.info("Vocabulary length: %d"
                             % len(embedding_model.wv.index2word))
            if not exists(model_dir):
                os.mkdir(model_dir)
            logging.info("Saving Word2Vec model '%s'" % split(model_name)[-1])
            embedding_model.save(model_name)

        # Append a random vector for unknown words.
        embedding_list = [embedding_model[w]
                          for w in embedding_model.wv.index2word]
        embedding_list.append(np.random.uniform(-0.25, 0.25,
                                                embedding_model.vector_size))
        embedding_weights = [np.array(embedding_list)]
        return [embedding_weights,
                embedding_model.wv.vocab,
                embedding_model.wv.index2word]

    def load_word2vec_model(self, model_name):
        model_dir = '/var/lib/arhuaco/data/models'
        model_name = join(model_dir, model_name)
        logging.info("Loading w2v model: %s" % model_name)
        if exists(model_name):
            embedding_model = word2vec.Word2Vec.load(model_name)
            logging.info("Loading existing Word2Vec model '%s'"
                         % split(model_name)[-1])
            # Append a random vector for unknown words.
            embedding_list = [embedding_model[w]
                              for w in embedding_model.wv.index2word]
            embedding_list.append(np.random.uniform(-0.25, 0.25,
                                                    embedding_model.vector_size))
            embedding_weights = [np.array(embedding_list)]
            return [embedding_weights,
                    embedding_model.wv.vocab,
                    embedding_model.wv.index2word]
        else:
            return None


if __name__ == '__main__':
    from data_helpers import DataHelpers
    from w2v import W2V

    argv = sys.argv[1:]
    try:
        opts, args = getopt.getopt(argv, "ht:", ["type="])
    except getopt.GetoptError:
        print("test_conv_w2v.py -t <type>")
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print("test_conv_w2v.py -t <type>")
            sys.exit()
        elif opt in ("-t", "--type"):
            type = arg

    if type == "syscall":
        # Parameters
        max_length = 10
        n_gram = 20
        sequence_length = max_length * n_gram
        batch_size = 100000
        paths = ["/var/lib/arhuaco/data/normal_clean.csv",
                 "/var/lib/arhuaco/data/malicious_clean.csv"]
        labels = [0, 1]
        # Load data
        print("Loading data...")
        # Create objects
        data_helpers = DataHelpers(paths, labels, max_length, n_gram, batch_size)
        w2v = W2V()
        sentence_stream = data_helpers.sentence_stream(batch_size)
        w2v.train_word2vec_stream(sentence_stream,
                                  num_features=20,
                                  min_word_count=4,
                                  context=10,
                                  num_epochs=100)
    elif type == "network":
        # Parameters
        max_length = 5
        n_gram = 1
        sequence_length = max_length * n_gram
        batch_size = 100
        paths = ["/var/lib/arhuaco/data/dns_normal.log",
                 "/var/lib/arhuaco/data/dns_malicious.log"]
        labels = [0, 1]
        # Load data
        print("Loading data...")
        # Create objects
        data_helpers = DataHelpers(paths, labels, max_length, n_gram, batch_size)
        w2v = W2V()
        sentence_stream = data_helpers.sentence_stream(batch_size)
        w2v.train_word2vec_stream(sentence_stream,
                                  num_features=5,
                                  min_word_count=1,
                                  context=4,
                                  num_epochs=25)
w2v_path = os.path.join(root, 'Data/model_weights/IMF_W2V/imf_160.w2v')

#%%
# For AWS Elastic Beanstalk the WSGI callable must be named "application".
application = app = Flask(__name__)
api = Api(app)


class Hello(Resource):
    def get(self):
        # The response needs to be JSON serializable.
        return jsonify({"data": "API is working"})


w2v = W2V(w2v_path)


class Get_sim_words(Resource):
    def get(self, word, topn):
        if topn is None:
            topn = 10
        res = w2v.get_most_similar(word, topn=topn)
        # The response needs to be JSON serializable.
        return jsonify({"data": {'keyword': word, 'sim_words': res}})


# Register the API routes.
api.add_resource(Hello, "/api/")
api.add_resource(Get_sim_words, "/api/getsim/<string:word>/<int:topn>")
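A small client-side sketch for exercising the two routes above, assuming the Flask app is served locally on port 5000 (host, port, and the query word are assumptions; they are not specified in the snippet).

import requests

BASE = "http://localhost:5000"  # assumed host/port

# Health check against the Hello resource.
print(requests.get(BASE + "/api/").json())

# Top-5 words most similar to an example query word from the vocabulary.
resp = requests.get(BASE + "/api/getsim/inflation/5")
print(resp.json()["data"]["sim_words"])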
def __init__(self):
    self.wordlists = self.get_word_lists("./aozora_text/files/tmp.txt")
    self.w2v_model = W2V.load_model()
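A brief usage sketch for the sequence helpers (gen_data, get_random_seq, get_random_seq2, and the __init__ above), assuming the enclosing class is named SeqData (the snippets only show its methods) and that W2V.str_to_vector returns a fixed-length vector per word.

import numpy as np

data = SeqData()  # hypothetical class name for the methods shown above

# A single random sequence and its per-word vectors.
words, vectors = data.get_random_seq()
print(len(words), "words ->", len(vectors), "vectors")

# Two consecutive sequences, e.g. for sentence-pair style training.
seq1, vec1, seq2, vec2 = data.get_random_seq2()

# A one-sequence "batch"; np.array() only yields a regular
# (1, seq_len, vec_dim) tensor when all word vectors share the same length.
batch = data.gen_data()
print("batch shape:", batch.shape)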