Example #1
def gen_data(self):
    """Vectorize one randomly chosen word list into a single-sequence batch."""
    seq_batch = []
    rand = random.randint(0, len(self.wordlists) - 1)
    seq = []
    # Drop empty tokens left over from tokenization.
    while '' in self.wordlists[rand]:
        self.wordlists[rand].remove('')
    for word in self.wordlists[rand]:
        # Vectorize each word once and reuse the result.
        vec = W2V().str_to_vector(self.w2v_model, word)
        seq.append(vec)
        print(len(vec))
    seq_batch.append(seq)
    seq_batch = np.array(seq_batch)
    print("train shape:", seq_batch.shape)
    return seq_batch
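str_to_vector itself is not shown in these examples. A minimal sketch of what it presumably does, assuming w2v_model is a gensim Word2Vec model; the zero-vector fallback for out-of-vocabulary words is an assumption:

import numpy as np

class W2V:
    def str_to_vector(self, model, word):
        # Look the word up in the model's keyed vectors; fall back to a
        # zero vector so every sequence entry has the same dimension.
        if word in model.wv:
            return model.wv[word]
        return np.zeros(model.vector_size, dtype=np.float32)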
Example #2
def dump_clusters():

    args = get_args()
    if args['-train'] == '':
        args['-train'] = 'src/resources/output' + args['-k']
    w2vobj = W2V(args['-input'], args['-train'], args['-k'])

    news = News()
    articles = news.get_articles()
    w2vobj.train()
    # Sentence vectorization by averaging
    article_vecs = [w2vobj.get_sentence_vector_avg(article['cleaned_title']) for article in articles]

    # Sentence vectorization by "newtonian" method
    '''article_vecs = []
    for article in articles:
        newtonian_vec = w2vobj.get_sentence_vector_newtonian(article['cleaned_title'])
        if newtonian_vec is not None:
            article_vecs.append(newtonian_vec)'''

    cluster_obj = Clustering(article_vecs, w2vobj)
    r_conn = redis.from_url(os.getenv('REDIS_URL', 'redis://localhost:6379/'))

    if args['-cluster'] == 'agg':
        # Normalize the string flag ('true'/'True') to a boolean once.
        prune = args['-prune'].lower() == 'true'
        utilities.redis_kmeans_clusters(cluster_obj, articles, prune,
                                        int(args['-limit']), r_conn)
        print("redis dump complete")
    else:
        # TODO dump to redis
        utilities.print_ann_clusters(cluster_obj, articles)
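get_sentence_vector_avg is not shown above. A minimal sketch of the averaging approach named in the comment, assuming w2vobj wraps a gensim model; this is a hypothetical stand-in, not the project's actual helper:

import numpy as np

def get_sentence_vector_avg(model, sentence):
    # Average the embeddings of the in-vocabulary words; fall back to a
    # zero vector if nothing in the sentence is in the vocabulary.
    vectors = [model.wv[w] for w in sentence.split() if w in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)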
Example #3
    def get_random_seq2(self):
        """Return two adjacent word lists and their word-vector sequences."""
        rand = random.randint(0, len(self.wordlists) - 2)
        seq_vec1 = []
        seq_vec2 = []
        seq1 = self.wordlists[rand]
        seq2 = self.wordlists[rand + 1]
        # Drop empty tokens before vectorizing.
        while '' in seq1:
            seq1.remove('')
        while '' in seq2:
            seq2.remove('')

        for word in seq1:
            seq_vec1.append(W2V().str_to_vector(self.w2v_model, word))
        for word in seq2:
            seq_vec2.append(W2V().str_to_vector(self.w2v_model, word))
        return seq1, seq_vec1, seq2, seq_vec2
Example #4
def get_random_seq(self):
    """Return one random word list and its word-vector sequence."""
    rand = random.randint(0, len(self.wordlists) - 1)
    seq_vec = []
    seq = self.wordlists[rand]
    # Drop empty tokens before vectorizing.
    while '' in seq:
        seq.remove('')
    for word in seq:
        seq_vec.append(W2V().str_to_vector(self.w2v_model, word))
    return seq, seq_vec
Example #5
def make_data(X_pure_train, X_sentences_train, aspects_list_train, X_pure_test,
              X_sentences_test):

    # Sentiment lexicon and word2vec embeddings used as feature sources.
    sent_word = sentiRuLex()

    w2v = W2V()
    #w2v = None

    list_of_tfidf_train = compute_tfidf(X_sentences_train)
    list_of_tfidf_test = compute_tfidf(X_sentences_test)

    y_train = make_y_train(X_pure_train, aspects_list_train)
    # Domain keywords: 'ресторан' ("restaurant") for train,
    # 'автомобиль' ("car") for test.
    x_train = make_x(X_pure_train, list_of_tfidf_train, sent_word, w2v,
                     'ресторан')
    x_test = make_x(X_pure_test, list_of_tfidf_test, sent_word, w2v,
                    'автомобиль')

    return x_train, y_train, x_test
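compute_tfidf is defined elsewhere. One plausible implementation with scikit-learn, assuming each entry of X_sentences_* is a raw string and the callers expect one {term: weight} mapping per sentence (both assumptions):

from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tfidf(sentences):
    # Fit TF-IDF over the whole corpus, then expose each row as a dict.
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(sentences)
    terms = vectorizer.get_feature_names_out()
    return [
        {terms[j]: row[0, j] for j in row.nonzero()[1]}
        for row in matrix
    ]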
Example #6
import getopt
import logging
import os
import sys
from os.path import exists, join, split

import numpy as np
from gensim.models import word2vec


class W2V:
    def __init__(self):
        self.initial = None

    def train_word2vec_stream(self,
                              sentence_stream,
                              num_epochs,
                              num_features=10,
                              min_word_count=5,
                              context=10):
        model_dir = '/var/lib/arhuaco/data/models'
        model_name = "{:d}features_{:d}minwords_{:d}context"\
                     .format(num_features, min_word_count, context)
        model_name = join(model_dir, model_name)

        if exists(model_name):
            embedding_model = word2vec.Word2Vec.load(model_name)
            logging.info("Loading existing Word2Vec model \'%s\'" %
                         split(model_name)[-1])
        else:
            # Set values for various parameters
            num_workers = 2  # Number of threads to run in parallel
            downsampling = 1e-3  # Downsample setting for frequent words
            # Initialize and train the model
            print('Initializing Word2Vec model...')
            sentences = next(sentence_stream)
            embedding_model = word2vec.Word2Vec(sentences,
                                                workers=num_workers,
                                                size=num_features,
                                                min_count=min_word_count,
                                                window=context,
                                                sample=downsampling)

            for batch in range(num_epochs):
                sentences = next(sentence_stream)
                embedding_model.build_vocab(sentences,
                                            keep_raw_vocab=True,
                                            update=True)
                embedding_model.train(
                    sentences,
                    total_examples=embedding_model.corpus_count,
                    epochs=embedding_model.epochs)
                logging.info("Finished epoch: %d" % batch)
                logging.info("Vocabulary length: %d" %
                             len(embedding_model.wv.index2word))
            if not exists(model_dir):
                os.mkdir(model_dir)
            logging.info("Saving Word2Vec model '%s'" %
                         split(model_name)[-1])
            embedding_model.save(model_name)
        # add unknown words
        embedding_list = [
            embedding_model.wv[w] for w in embedding_model.wv.index2word
        ]
        embedding_list.append(
            np.random.uniform(-0.25, 0.25, embedding_model.vector_size))
        embedding_weights = [np.array(embedding_list)]

        return [
            embedding_weights, embedding_model.wv.vocab,
            embedding_model.wv.index2word
        ]

    def load_word2vec_model(self, model_name):
        model_dir = '/var/lib/arhuaco/data/models'
        model_name = join(model_dir, model_name)
        logging.info("Loading w2v mode: %s" % model_name)

        if exists(model_name):
            embedding_model = word2vec.Word2Vec.load(model_name)
            logging.info("Loading existing Word2Vec model \'%s\'" %
                         split(model_name)[-1])
            # add unknown words
            embedding_list = [
                embedding_model[w] for w in embedding_model.wv.index2word
            ]
            embedding_list.append(
                np.random.uniform(-0.25, 0.25, embedding_model.vector_size))
            embedding_weights = [np.array(embedding_list)]

            return [
                embedding_weights, embedding_model.wv.vocab,
                embedding_model.wv.index2word
            ]
        else:
            return None

if __name__ == '__main__':
    from data_helpers import DataHelpers
    from w2v import W2V

    argv = sys.argv[1:]
    run_type = None  # mode selected via -t/--type
    try:
        opts, args = getopt.getopt(argv, "ht:", ["type="])
    except getopt.GetoptError:
        print("test_conv_w2v.py -t <type>")
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print("test_conv_w2v.py -t <type>")
            sys.exit()
        elif opt in ("-t", "--type"):
            run_type = arg
    if run_type == "syscall":
        # parameters
        max_length = 10
        n_gram = 20
        sequence_length = max_length * n_gram
        batch_size = 100000
        paths = [
            "/var/lib/arhuaco/data/normal_clean.csv",
            "/var/lib/arhuaco/data/malicious_clean.csv"
        ]
        labels = [0, 1]
        # Load data
        print("Loading data...")
        # Create objects
        data_helpers = DataHelpers(paths, labels, max_length, n_gram,
                                   batch_size)
        w2v = W2V()
        sentence_stream = data_helpers.sentence_stream(batch_size)
        w2v.train_word2vec_stream(sentence_stream,
                                  num_features=20,
                                  min_word_count=4,
                                  context=10,
                                  num_epochs=100)
    elif run_type == "network":
        # parameters
        max_length = 5
        n_gram = 1
        sequence_length = max_length * n_gram
        batch_size = 100
        paths = [
            "/var/lib/arhuaco/data/dns_normal.log",
            "/var/lib/arhuaco/data/dns_malicious.log"
        ]
        labels = [0, 1]
        # Load data
        print("Loading data...")
        # Create objects
        data_helpers = DataHelpers(paths, labels, max_length, n_gram,
                                   batch_size)
        w2v = W2V()
        sentence_stream = data_helpers.sentence_stream(batch_size)
        w2v.train_word2vec_stream(sentence_stream,
                                  num_features=5,
                                  min_word_count=1,
                                  context=4,
                                  num_epochs=25)
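The returned embedding_weights list is shaped for seeding an embedding layer. A hedged usage sketch with Keras; the layer configuration is illustrative and not part of the original code:

from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import Constant

weights, vocabulary, index2word = w2v.train_word2vec_stream(
    sentence_stream, num_epochs=10)
matrix = weights[0]  # rows: vocabulary plus one random "unknown" row
embedding_layer = Embedding(input_dim=matrix.shape[0],
                            output_dim=matrix.shape[1],
                            embeddings_initializer=Constant(matrix),
                            trainable=False)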
Example #7
import os

from flask import Flask, jsonify
from flask_restful import Api, Resource

# `root` and the W2V wrapper class are defined elsewhere in the project.
w2v_path = os.path.join(root, 'Data/model_weights/IMF_W2V/imf_160.w2v')
#%%

# AWS Elastic Beanstalk looks for a module-level callable named
# `application`, hence the double assignment.
application = app = Flask(__name__)
api = Api(app)


class Hello(Resource):
    def get(self):
        # Responses must be JSON serializable.
        return jsonify({"data": "API is working"})


w2v = W2V(w2v_path)


class Get_sim_words(Resource):
    def get(self, word, topn):
        if topn is None:
            topn = 10
        res = w2v.get_most_similar(word, topn=topn)
        # Responses must be JSON serializable.
        return jsonify({"data": {
            'keyword': word,
            'sim_words': res
        }})


# Register the API routes.
api.add_resource(Hello, "/api/")
api.add_resource(Get_sim_words, "/api/getsim/<string:word>/<int:topn>")
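A quick client-side check of the similarity route, assuming the app is served locally on Flask's default port; the URL, port, and query word are illustrative:

import requests

resp = requests.get("http://localhost:5000/api/getsim/economy/5")
print(resp.json())  # {"data": {"keyword": "economy", "sim_words": [...]}}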
Example #8
def __init__(self):
    # Load pre-tokenized word lists and the trained word2vec model.
    self.wordlists = self.get_word_lists("./aozora_text/files/tmp.txt")
    self.w2v_model = W2V.load_model()
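get_word_lists is not shown. A minimal sketch consistent with how the other examples consume wordlists (one whitespace-split word list per line; the split rule is an assumption, and it can leave the empty strings the callers filter out):

def get_word_lists(self, path):
    # One word list per line of the tokenized text file.
    with open(path, encoding="utf-8") as f:
        return [line.strip().split(' ') for line in f]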