Example #1
	def initModel(self):
		path = self.getModelFilePath()
		modelFull = self.config.getBooleanConfig("common.model.full")[0]
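		# A full Word2Vec model keeps its training state and can be trained further; the KeyedVectors
		# branch loads read-only vectors, and mmap='r' memory-maps them so they can be shared across processes.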
		if modelFull:
			if self.model is None:
				self.model = Word2Vec.load(path)
			self.wv = self.model.wv
		else:
			if self.wv is None:
				self.wv = KeyedVectors.load(path, mmap='r')
    def __init__(self):
        self.cmdpairs = {
            "!similar": self.execute_cnb,
            "!similaryle": self.execute_yle,

            "!similarn": self.execute_n_cnb,
            "!similarnyle": self.execute_n_yle,

            "!similarnr": self.execute_n_cnb_r,
            "!similarnyler": self.execute_n_yle_r,

            "!xminusyplusz": self.execute_xyz_cnb,
            "!xminusypluszyle": self.execute_xyz_yle,
            # "!xminusyplusz": self.execute_x_minus_y_plus_z
        }
        self.cnb_wv = gensim.models.Word2Vec.load("./Resources/word2vec_2014-2019_04.model").wv
        self.yle_wv = KeyedVectors.load("./Resources/word2vec_yle_dersb")
Example #3
clusters = [0] * 10
i = 0
while i < 10:
    RandGenerated = random.randint(1, len(data) - 2)
    if RandGenerated in clusters:
        i = i - 1
    else:
        clusters[i] = RandGenerated
    i = i + 1
clusters = sorted(clusters)
print(clusters)
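# Note: the randomly drawn seed indices above are immediately replaced with fixed, evenly spaced ones below.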
clusters = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

# In[3]:

model = KeyedVectors.load('newmodel')
# model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
# model.save('newmodel')
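# Re-saving in gensim's native format and reloading it with KeyedVectors.load() is much faster than
# re-parsing the original word2vec binary on every run.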

# In[4]:


def countDistance(sentenceConst, WholeGroup):
    distance = [0] * len(WholeGroup)
    sentenceFirstString = str(sentenceConst)
    sentenceFirstString = sentenceFirstString.lower().split()
    sentenceFirstString = [
        w for w in sentenceFirstString if w not in stop_words
    ]
    i = 0
    for sentence in WholeGroup:
from rake_nltk import Rake
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict
from math import log, floor
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import textrank
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


model = KeyedVectors.load("models/normalized.model")
stop_words = set(stopwords.words('english'))

def equals(prediction, expected):
    if(prediction == expected):
        return 1
    else:
        return 0

def get_prediction(scores):
    total = sum(scores.values())

    if(total != 0):
        for score in scores:
            scores[score] = scores[score]/total
Example #5
my_df = pd.read_csv("clean_tweet.csv", index_col=0)
my_df.dropna(inplace=True)
my_df.reset_index(drop=True, inplace=True)

x = my_df.text
y = my_df.target

SEED = 2000
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.02,
                                                                                  random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test,
                                                              test_size=.5, random_state=SEED)


model_ug_cbow = KeyedVectors.load('w2v_model_ug_cbow.word2vec')
model_ug_sg = KeyedVectors.load('w2v_model_ug_sg.word2vec')

embeddings_index = {}
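# Concatenate each word's CBOW and skip-gram vectors into a single embedding
# (.wv.vocab is the gensim 3.x API; gensim 4+ exposes .wv.key_to_index instead).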
for w in model_ug_cbow.wv.vocab.keys():
    embeddings_index[w] = np.append(model_ug_cbow.wv[w], model_ug_sg.wv[w])

tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(x_train)
tokenizer_json = tokenizer.to_json()
with io.open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))
    print("Saved")

sequences = tokenizer.texts_to_sequences(x_train)
Example #6
### Setup

os.chdir('/Users/chrki23/Documents/Insight_Project')
path = os.getcwd()

### File Loading

lookup_path = path + '/data/cleaned/lookup_table.data'
fileloader = open(lookup_path, 'rb')
lookup_table = pickle.load(fileloader)
fileloader.close()

vector_path = path + '/data/cleaned/final_vectors.kv'
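# get_tmpfile() normally resolves a filename inside the system temp directory; given an absolute
# path it effectively returns that path unchanged, so this simply loads final_vectors.kv.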
fname = get_tmpfile(vector_path)
word_vectors = KeyedVectors.load(fname, mmap="r")

ingredients_path = path + '/data/cleaned/ingredients_used.data'
fileloader = open(ingredients_path, 'rb')
ingredients_used = pickle.load(fileloader)
fileloader.close()

### Helper functions


def display_choice(user_input):
    missing_product = lookup_table[(lookup_table.name.isin(user_input))]
    return missing_product.iloc[:, 0:4]


def get_ingredient(user_input):
def createDatasetSplit(params):
    filename = set_name(params)
    if path.exists(filename):
        ##### REMOVE LATER ######
        #dataset=collect_data(params)
        pass
    else:
        dataset = collect_data(params)

    if (path.exists(filename[:-7])):
        with open(filename[:-7] + '/train_data.pickle', 'rb') as f:
            X_train = pickle.load(f)
        with open(filename[:-7] + '/val_data.pickle', 'rb') as f:
            X_val = pickle.load(f)
        with open(filename[:-7] + '/test_data.pickle', 'rb') as f:
            X_test = pickle.load(f)
        if (params['bert_tokens'] == False):
            with open(filename[:-7] + '/vocab_own.pickle', 'rb') as f:
                vocab_own = pickle.load(f)

    else:
        if (params['bert_tokens'] == False):
            word2vecmodel1 = KeyedVectors.load("Data/word2vec.model")
            vector = word2vecmodel1['easy']
            assert (len(vector) == 300)
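            # Indexing the KeyedVectors with a common word is a quick sanity check that 300-dimensional vectors were loaded.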

        dataset = pd.read_pickle(filename)
        #X_train_dev, X_test= train_test_split(dataset, test_size=0.1, random_state=1,stratify=dataset['Label'])
        #X_train, X_val= train_test_split(X_train_dev, test_size=0.11, random_state=1,stratify=X_train_dev['Label'])
        with open('Data/post_id_divisions.json', 'r') as fp:
            post_id_dict = json.load(fp)

        X_train = dataset[dataset['Post_id'].isin(post_id_dict['train'])]
        X_val = dataset[dataset['Post_id'].isin(post_id_dict['val'])]
        X_test = dataset[dataset['Post_id'].isin(post_id_dict['test'])]

        if (params['bert_tokens']):
            vocab_own = None
            vocab_size = 0
            padding_idx = 0
        else:
            vocab_own = Vocab_own(X_train, word2vecmodel1)
            vocab_own.create_vocab()
            padding_idx = vocab_own.stoi['<pad>']
            vocab_size = len(vocab_own.vocab)

        X_train = encodeData(X_train, vocab_own, params)
        X_val = encodeData(X_val, vocab_own, params)
        X_test = encodeData(X_test, vocab_own, params)

        print("total dataset size:", len(X_train) + len(X_val) + len(X_test))

        os.mkdir(filename[:-7])
        with open(filename[:-7] + '/train_data.pickle', 'wb') as f:
            pickle.dump(X_train, f)

        with open(filename[:-7] + '/val_data.pickle', 'wb') as f:
            pickle.dump(X_val, f)
        with open(filename[:-7] + '/test_data.pickle', 'wb') as f:
            pickle.dump(X_test, f)
        if (params['bert_tokens'] == False):
            with open(filename[:-7] + '/vocab_own.pickle', 'wb') as f:
                pickle.dump(vocab_own, f)

    if (params['bert_tokens'] == False):
        return X_train, X_val, X_test, vocab_own
    else:
        return X_train, X_val, X_test
Example #8
movie_popularities = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/movie_popularities.p", "rb" ))
book_popularities = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/book_popularities.p", "rb" ))
common_tropes = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/common_tropes.p", "rb" ))
col_to_trope_list = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/col_to_trope_list.p", "rb" ))
movie_titles = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/movie_titles.p", "rb" ))
book_titles = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/book_titles.p", "rb" ))

with open('./app/irsystem/controllers/SPARSE OR NECESSARY/book_word_to_trope.json', 'r') as f: 
    book_word_to_trope = json.load(f)
with open('./app/irsystem/controllers/SPARSE OR NECESSARY/movie_word_to_trope.json', 'r') as f: 
    movie_word_to_trope = json.load(f)
book_to_movie_vectorizer = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/book_to_movie_vectorizer.pickle", "rb" ))
movie_to_book_vectorizer = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/movie_to_book_vectorizer.pickle", "rb" ))
movie_tf_idf = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/movie_tf_idf.pickle", "rb" ))
book_tf_idf = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/book_tf_idf.pickle", "rb" ))
model = KeyedVectors.load("./app/irsystem/controllers/SPARSE OR NECESSARY/tbwb_model.bin")


def get_closest_tropes_to_keyword(keyword, word_to_trope, model, top_k = 5): 
    if keyword in model.vocab: 
        all_words = list(word_to_trope.keys())
        all_words = [word for word in all_words if word in model.vocab]
        dists = model.distances(keyword, all_words)
        sorted_indices = np.argsort(dists)
        sorted_keyword_match = [all_words[idx] for idx in sorted_indices[:top_k]]
        # Gather the tropes linked to the best-matching words
        trope_matches = list(itertools.chain.from_iterable(
            word_to_trope[word] for word in sorted_keyword_match if word in word_to_trope))
        return trope_matches[:top_k]
    else: 
        print('`{}` not in model vocabulary, cannot enhance search with keyword'.format(keyword))
        return []
import json
import requests
import bs4
import numpy as np
import time
from gensim.models import KeyedVectors
import multiprocessing


#Program extracts information from CLEF's JSON at https://raw.githubusercontent.com/ag-sc/QALD/master/9/data/qald-9-train-multilingual.json
#Do note that this program takes a while to complete.

modelPath="/alignedEnVecs"#Path to file produced by utils.vec2File()
model = KeyedVectors.load(modelPath,mmap="r")#Input file path to appropriate model
stopWords=[line.split()[0] for line in open("SMART Stopwords.txt","r")]

#Various statistics that may be useful
df={}#Document term frequency
ctf={}#Collection term frequency
mtf={}#Mean term frequency (Like collection term frequency but normalized for document length)
totalDocCount=0#Total number of documents


def vectorize(string, model):  # Computes the tf-idf-weighted average vector of whatever string is passed, using whatever model is passed
    terms = string.lower().split()
    vector = np.zeros((1,300))
    totalTfIdf=0
    for term in terms:
        if term not in stopWords:
            try:
                tfidf=string.count(term)*np.log10(totalDocCount/df[term])
Example #10
    def training(self, config):
        DATASET_PATH = config['file']['input']['trainingtweets']
        df = pd.read_csv(DATASET_PATH)
        df = df[["choose_one", "text", "choose_one:confidence"]]
        #df = df[df['choose_one:confidence'] == 1.0]
        print("There are %d items in df" % len(df))
        df['target'] = df.choose_one.map({'Relevant': 1, 'Not Relevant': 0})
        df = df[df.target.isnull() == False]
        df = df.drop_duplicates(subset=["text"]).reset_index()
        print("Total unique tweets:%d" % len(df))
        tokenizer_it = twitter_parser.Tokenizer()
        df['tokenized'] = df["text"].apply(tokenizer_it.tweet_to_tokens)
        list_tokenized_tweets = []
        for index, row in df.iterrows():
            temp = row['tokenized']
            joined = ' '.join(temp)  # re-join the tokens into a single string
            list_tokenized_tweets.append(joined)
        new_column = pd.Series(list_tokenized_tweets)
        df['tokenized_text'] = new_column.values
        print(df.columns)
        print("Statistic to check skew-data:")
        print("On-topic Tweets: %d" % len(df[df['target'] == 1]))
        print("Off-topic Tweets:%d" % len(df[df['target'] == 0]))
        df.drop('index', axis=1, inplace=True)
        print(df.head())
        df.info()

        x = df.tokenized_text
        y = df.target
        print(y)

        from sklearn.model_selection import train_test_split  # sklearn.cross_validation was replaced by model_selection
        SEED = 2000
        x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=0.2,
                                                                                          random_state=SEED)
        x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test,
                                                                      test_size=0.5, random_state=SEED)
        print("Train set has total %d entries" % len(x_train))
        print("Train set has total %.2f percent Relevant tweets" % (len(x_train[y == 1.]) / len(x_train)))
        print("Train set has total %.2f percent Not Relevant tweets" % (len(x_train[y == 0.]) / len(x_train)))
        print("--------------------------------------------")
        print("Validation set has total %d entries" % len(x_validation))
        print(
            "Validation set has total %.2f percent Relevant tweets" % (len(x_validation[y == 1.]) / len(x_validation)))
        print("Validation set has total %.2f percent Not Relevant tweets" % (
                    len(x_validation[y == 0.]) / len(x_validation)))
        print("--------------------------------------------")
        print("Test set has total %d entries" % len(x_test))
        print("Test set has total %.2f percent Relevant tweets" % (len(x_test[y == 1.0]) / len(x_test)))
        print("Test set has total %.2f percent Not Relevant tweets" % (len(x_test[y == 0.]) / len(x_test)))
        print("--------------------------------------------")

        from tqdm import tqdm
        tqdm.pandas(desc="progress-bar")
        import gensim
        from gensim.models.word2vec import Word2Vec
        from gensim.models.doc2vec import TaggedDocument
        import multiprocessing
        from sklearn import utils

        def labelize_tweets_ug(tweets, label):
            result = []
            prefix = label
            for i, t in zip(tweets.index, tweets):
                result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
            return result

        all_x = pd.concat([x_train, x_validation, x_test])
        all_x_w2v = labelize_tweets_ug(all_x, 'all')

        cores = multiprocessing.cpu_count()
        model_ug_cbow = Word2Vec(sg=0, size=100, negative=5, window=2, min_count=2, workers=cores, alpha=0.065,
                                 min_alpha=0.065)
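        # gensim 3.x parameter names: gensim 4+ renamed size= to vector_size= and iter= to epochs=.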
        model_ug_cbow.build_vocab([x.words for x in tqdm(all_x_w2v)])

        for epoch in range(30):
            model_ug_cbow.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v),
                                epochs=1)
            model_ug_cbow.alpha -= 0.002
            model_ug_cbow.min_alpha = model_ug_cbow.alpha

        model_ug_sg = Word2Vec(sg=1, size=100, negative=5, window=2, min_count=2, workers=cores, alpha=0.065,
                               min_alpha=0.065)
        model_ug_sg.build_vocab([x.words for x in tqdm(all_x_w2v)])

        for epoch in range(30):
            model_ug_sg.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v),
                              epochs=1)
            model_ug_sg.alpha -= 0.002
            model_ug_sg.min_alpha = model_ug_sg.alpha

        model_ug_cbow.save('../models/CrowFlowd/w2v_model_ug_cbow.word2vec')
        model_ug_sg.save('../models/CrowFlowd/w2v_model_ug_sg.word2vec')

        from gensim.models import KeyedVectors
        model_ug_cbow = KeyedVectors.load('../models/CrowFlowd/w2v_model_ug_cbow.word2vec')
        model_ug_sg = KeyedVectors.load('../models/CrowFlowd/w2v_model_ug_sg.word2vec')
        print(len(model_ug_cbow.wv.vocab.keys()))

        embeddings_index = {}

        import numpy as np
        for w in model_ug_cbow.wv.vocab.keys():
            embeddings_index[w] = np.append(model_ug_cbow.wv[w], model_ug_sg.wv[w])
        print('Found %s word vectors.' % len(embeddings_index))

        from keras.preprocessing.text import Tokenizer
        from keras.preprocessing.sequence import pad_sequences
        import pickle

        tokenizer = Tokenizer(num_words=100000)
        tokenizer.fit_on_texts(x_train)
        sequences = tokenizer.texts_to_sequences(x_train)

        print(len(tokenizer.word_index))

        self._tokenizer = tokenizer
        with open('../models/CrowFlowd/tokenizer.pickle', 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

        length = []
        for x in x_train:
            length.append(len(x.split()))
        print(max(length))

        x_train_seq = pad_sequences(sequences, maxlen=45)
        print('Shape of data tensor:', x_train_seq.shape)

        sequences_val = tokenizer.texts_to_sequences(x_validation)
        x_val_seq = pad_sequences(sequences_val, maxlen=45)

        sequences_test = tokenizer.texts_to_sequences(x_test)
        x_test_seq = pad_sequences(sequences_test, maxlen=45)
        with open('../models/CrowFlowd/x_test_seq.obj', 'wb') as handle:
            pickle.dump(x_test_seq, handle, protocol=pickle.HIGHEST_PROTOCOL)

        with open('../models/CrowFlowd/y_test.obj', 'wb') as handle:
            pickle.dump(y_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

        num_words = 100000
        embedding_matrix = np.zeros((num_words, 200))
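        # Map each tokenizer index to its concatenated CBOW+skip-gram vector; words missing from the index keep the zero row.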
        for word, i in tokenizer.word_index.items():
            if i >= num_words:
                continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        seed = 7
        from keras.models import Sequential
        from keras.layers import Dense, Dropout
        from keras.layers import Flatten
        from keras.layers.embeddings import Embedding

        from keras import backend as K
        def f1(y_true, y_pred):
            def recall(y_true, y_pred):
                true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
                possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
                recall = true_positives / (possible_positives + K.epsilon())
                return recall

            def precision(y_true, y_pred):
                true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
                predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
                precision = true_positives / (predicted_positives + K.epsilon())
                return precision

            precision = precision(y_true, y_pred)
            recall = recall(y_true, y_pred)
            return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

        from keras.layers import Input, Dense, concatenate, Activation, Conv1D, GlobalMaxPooling1D
        from keras.models import Model

        tweet_input = Input(shape=(45,), dtype='int32')
        tweet_encoder = Embedding(100000, 200, weights=[embedding_matrix], input_length=45, trainable=True)(tweet_input)
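        # Three parallel Conv1D branches (kernel sizes 2, 3 and 4) act as bigram/trigram/four-gram feature
        # detectors; global max pooling keeps the strongest activation of each filter.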
        bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
        bigram_branch = GlobalMaxPooling1D()(bigram_branch)
        trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(
            tweet_encoder)
        trigram_branch = GlobalMaxPooling1D()(trigram_branch)
        fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(
            tweet_encoder)
        fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
        merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

        merged = Dense(256, activation='relu')(merged)
        merged = Dropout(0.2)(merged)
        merged = Dense(1)(merged)
        output = Activation('sigmoid')(merged)
        model = Model(inputs=[tweet_input], outputs=[output])
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        model.summary()

        from keras.callbacks import ModelCheckpoint

        import numpy as np
        from keras.callbacks import Callback
        from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
        class Metrics(Callback):
            def on_train_begin(self, logs={}):
                self.val_f1s = []
                self.val_recalls = []
                self.val_precisions = []

            def on_epoch_end(self, epoch, logs={}):
                val_predict = (np.asarray(self.model.predict(x_val_seq))).round()
                val_targ = y_validation
                _val_f1 = f1_score(val_targ, val_predict)
                _val_recall = recall_score(val_targ, val_predict)
                _val_precision = precision_score(val_targ, val_predict)
                self.val_f1s.append(_val_f1)
                self.val_recalls.append(_val_recall)
                self.val_precisions.append(_val_precision)
                print(" — val_f1: % f — val_precision: % f — val_recall % f" % (_val_f1, _val_precision, _val_recall))
                return

        metric = Metrics()

        filepath = "../models/Crowflowd/CNN_best_weights.{epoch:02d}-{val_acc:.4f}.hdf5"
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
        model.fit(x_train_seq, y_train, batch_size=64, epochs=5, validation_data=(x_val_seq, y_validation),
                  callbacks=[metric, checkpoint])
        self._model = model
        scores = model.evaluate(x_test_seq, y_test, verbose=1)
        print("Accuracy: %.2f%%" % (scores[1] * 100))
Example #11
def main():
    parser = argparse.ArgumentParser(
        description="Train the FAN or the HAN model"
    )
    parser.add_argument(
        "dataset",
        choices=["yelp", "yahoo", "amazon", "synthetic"],
        help="Choose the dataset",
    )
    parser.add_argument(
        "model",
        choices=["fan", "han"],
        help="Choose the model to be trained (flat or hierarchical)",
    )

    args = parser.parse_args()

    if args.dataset == "yelp":
        dataset_config = Yelp
    elif args.dataset == "yahoo":
        dataset_config = Yahoo
    elif args.dataset == "amazon":
        dataset_config = Amazon
    elif args.dataset == "synthetic":
        dataset_config = Synthetic
    else:
        # should not end there
        exit()

    wv = KeyedVectors.load(dataset_config.EMBEDDING_FILE)

    train_df = pd.read_csv(dataset_config.TRAIN_DATASET).fillna("")
    train_documents = train_df.text
    train_labels = train_df.label
    if args.model == "fan":
        train_dataset = FlatDataset(
            train_documents,
            train_labels,
            wv.vocab,
            dataset_config.WORDS_PER_DOC[PADDING],
        )
    else:
        train_dataset = HierarchicalDataset(
            train_documents,
            train_labels,
            wv.vocab,
            dataset_config.SENT_PER_DOC[PADDING],
            dataset_config.WORDS_PER_SENT[PADDING],
        )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2,
    )

    val_df = pd.read_csv(dataset_config.VAL_DATASET).fillna("")
    val_documents = val_df.text
    val_labels = val_df.label
    if args.model == "fan":
        val_dataset = FlatDataset(
            val_documents,
            val_labels,
            wv.vocab,
            dataset_config.WORDS_PER_DOC[PADDING],
        )
    else:
        val_dataset = HierarchicalDataset(
            val_documents,
            val_labels,
            wv.vocab,
            dataset_config.SENT_PER_DOC[PADDING],
            dataset_config.WORDS_PER_SENT[PADDING],
        )
    val_data_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=BATCH_SIZE, shuffle=True
    )

    logdir = Path(f"{LOG_DIR}/{args.dataset}/{args.model}")
    logdir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(logdir / f"{PADDING}pad"))

    if args.model == "fan":
        model = Fan(
            embedding_matrix=wv.vectors,
            word_hidden_size=WORD_HIDDEN_SIZE,
            num_classes=len(train_labels.unique()),
            batch_size=BATCH_SIZE,
        ).to(DEVICE)
    else:
        model = Han(
            embedding_matrix=wv.vectors,
            word_hidden_size=WORD_HIDDEN_SIZE,
            sent_hidden_size=SENT_HIDDEN_SIZE,
            num_classes=len(train_labels.unique()),
            batch_size=BATCH_SIZE,
        ).to(DEVICE)

    criterion = torch.nn.NLLLoss().to(DEVICE)
    optimizer = torch.optim.SGD(
        (p for p in model.parameters() if p.requires_grad),
        lr=LEARNING_RATE,
        momentum=MOMENTUM,
    )
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.1, patience=PATIENCE - 2, verbose=True,
    )
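    # patience=PATIENCE-2 makes the scheduler cut the learning rate a couple of epochs before the
    # manual early-stopping check below gives up.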

    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []
    best_val_loss = 1_000_000
    best_state_dict = model.state_dict()
    actual_patience = 0
    for epoch in range(1, EPOCHS + 1):
        train_loss, train_acc = train_func(
            model, train_data_loader, criterion, optimizer, writer
        )
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        val_loss, val_acc = test_func(model, val_data_loader, criterion)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        print(f"Epoch {epoch}")
        print(
            f"  Train loss: {train_loss:.4}, Train acc: {train_acc * 100:.1f}%"
        )
        print(f"  Val loss: {val_loss:.4}, Val acc: {val_acc * 100:.1f}%")

        lr_scheduler.step(val_loss)

        writer.add_scalar("Train/Loss", train_loss, epoch)
        writer.add_scalar("Train/Accuracy", train_acc, epoch)
        writer.add_scalar("Validation/Loss", val_loss, epoch)
        writer.add_scalar("Validation/Accuracy", val_acc, epoch)
        writer.add_scalar(
            "Learning rate", optimizer.param_groups[0]["lr"], epoch
        )

        # Early stopping with patience
        if val_loss < best_val_loss:
            actual_patience = 0
            best_val_loss = val_loss
            best_state_dict = model.state_dict()
        else:
            actual_patience += 1
            if actual_patience == PATIENCE:
                model.load_state_dict(best_state_dict)
                break

    writer.add_text(
        "Hyperparameters",
        f"BATCH_SIZE = {BATCH_SIZE}; "
        f"MOMENTUM = {MOMENTUM}; "
        f"PATIENCE = {PATIENCE}; "
        f"PADDING = {PADDING}",
    )
    writer.close()

    modeldir = Path(MODEL_DIR)
    modeldir.mkdir(parents=True, exist_ok=True)
    torch.save(
        model.state_dict(),
        f"{modeldir}/{args.dataset}-{args.model}-{PADDING}pad.pth",
    )
Example #12
from gensim.models import KeyedVectors
import time, timeit
import sys
import io
import pdb

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

pdb.set_trace()
t0 = time.perf_counter()  # time.clock() was removed in Python 3.8
#wv_from_text = KeyedVectors.load_word2vec_format('/Users/jagenzhao/dataprocess/word2vec/Tencent_AILab_ChineseEmbedding.txt')
#wv_from_text.init_sims(replace=True)
#wv_from_text.save('/Users/Jagen/Downloads/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.models')
model = KeyedVectors.load(
    '/Users/jagenzhao/dataprocess/word2vec/Tencent_AILab_ChineseEmbedding.models',
    mmap='r')
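# mmap='r' memory-maps the saved vector array, so the large Tencent embedding matrix is paged in
# on demand instead of being copied into RAM up front.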
print('load models ok.', time.perf_counter() - t0, 's\n')

t0 = time.perf_counter()
print(u'白发')
keys = model.similar_by_word('白发', topn=100)
for key in keys:
    print(key)
print('cost %f' % (time.perf_counter() - t0))

t0 = time.perf_counter()
print('\n')
print('斑丘疹')
for key in model.similar_by_word('斑丘疹', topn=100):
    print(key)
Example #13
        # window = 2,
        # min_count=50,
        # workers=4,
        # iter=100,
        sg=1  # skip-gram
    )

    return model


if __name__ == '__main__':
    saved_model_name = 'model.wv'

    # Load a model
    try:
        model = KeyedVectors.load(saved_model_name, mmap='r')
    except FileNotFoundError:
        # input_data = read_data_files('./data/aclImdb/train/pos')
        # input_data = word2vec.Text8Corpus(datapath('./data/aclImdb/train/pos'))
        sentences = word2vec.PathLineSentences(
            # datapath(
            #     os.path.expanduser(
            #         os.path.join('data', 'aclImdb', 'train', 'pos')
            #     )
            # )
            'C:\\Users\\jinai\\git_projects\\D-RNN\\experimental\\sejin\\word2vec_test\\data\\aclImdb\\test\\pos'
        )
        model = make_word2vec_model(list(sentences))
        model.save(saved_model_name)

    # Remove unnecessary data from the memory
Example #14

# create data source plain txt lib
def create_java_source_code_data():
    with open(code_dot_data_cfg_generated, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        next(csv_reader)  # skip header
        for item in csv_reader:
            plant_source_code_text = read_source_code(item[target_java_file])
            write2file(save_source_code_file, plant_source_code_text.encode("utf-8", 'ignore'))


# create_java_source_code_data()
doc2vec_cfg = Doc2Vec.load(CFG_EMBEDDING_FILE)

word2vec = KeyedVectors.load(SOURCE_CODE_EMBEDDING_FILE)
# with open(code_dot_data_cfg_generated, mode='r') as csv_file:
#     csv_reader = csv.DictReader(csv_file)
#     next(csv_reader)  # skip header
#     for item in csv_reader:
#         dot_list = item[dot_file_list].split('|')
#         dotfiledict = {}
#         for dfname in dot_list:
#             dot_fname = find_dotf_name(dfname).replace('(', '')
#             dotfiledict[dot_fname] = dfname
#         row = (
#             item['file_name'],
#             item['target_java_named_folder'],
#             item['target_java_named_folder'].replace('bcb_reduced_java_named_files', 'bcb_reduced') + '.java',
#             dotfiledict)
#         dotfile_data_dict[item['file_name']] = row
Example #15
import torch

from model2 import PreTrainedEmbeddingEncoderRNN, AttnDecoderRNN
from preprocess import prepareData
from train_bidirectional import Trainer
from preprocess import buildPairs
from gensim.models import KeyedVectors
import params

use_cuda = torch.cuda.is_available()

trainfile = '/home/prosa/Works/Text/mt/dataset/filter-en-id/lenlim80/sorted/train.dummy'
src_lang, tgt_lang, pairs = prepareData(trainfile, reverse=False)

# Word vector
word_vectors = KeyedVectors.load(params.WORD_VECTORS_FILE)

hidden_size = word_vectors.vector_size
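# The hidden size is tied to the dimensionality of the pretrained word vectors, presumably so the
# encoder can feed them straight into its recurrent layers.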
max_len = 8
encoder = PreTrainedEmbeddingEncoderRNN(word_vectors, max_len)
attn_decoder = AttnDecoderRNN(hidden_size,
                              tgt_lang,
                              dropout_p=0.1,
                              max_length=max_len)

if use_cuda:
    encoder = encoder.cuda()
    attn_decoder = attn_decoder.cuda()

epoch = 100
num_iter = len(pairs)
Example #16
    def inference(self, model_input):
        """
        Internal inference methods
        :param model_input: transformed model input data list
        :return: list of inference output in NDArray
        """
        stoplist = list(string.punctuation)
        language = self.model_params['language']
        if language == "de":
            language = "german"
        stoplist += stopwords.words(language)
        stoplist += ['archiviert', 'archiviertes', 'angeblich', 'angebliche', 'facebook', 'seien', 'sei', 'facebookpost', 'behauptung', 'sozialen', 'netzwerken', 'heißt', 'verbreitet', 'mögliche', 'höher', 'wort', 'teils', 'kaum', 'lassen', 'ersten', 'heraus', 'vergleich', 'simpsons', 'behauptet', 'etwa', 'worden', 'immer', 'post', 'sehen', 'kursiert', 'geteilt', 'hätten', 'sollen', 'zeigen', 'derzeit', 'seit', 'wurde', 'schon', 'mehr', 'zwei', 'gibt', 'dabei', 'steht', 'zeigt', 'sic', 'wegen', 'viele', 'netz', 'posting', 'video', 'gesagt', 'internet', 'artikel', 'nutzer', 'jahr', 'beitrag', 'macht', 'sharepic', 'gebe', 'zusammenhang', 'dafür', 'text', 'ab', 'jahren', 'kursieren', 'mann', 'frau', 'überschrift', 'laut', 'seite', 'de', 'zeige', 'wer', 'demnach', 'ende', 'prozent', 'wurden', 'mehrere', 'zudem', 'darin', 'suggeriert', 'zahlen', 'beleg', 'millionen', 'denen', 'beim', 'müssen', 'bereits', 'drei', 'darauf', 'online', 'jahre', 'geht', 'august', 'mehreren', 'beispiel', 'bekommen', 'welt', 'behauptungen', 'neue', 'land', 'stadt', 'oktober', 'erklärt', 'gefährlich', 'sogar', 'belegen', 'gar', 'heute', 'webseite', 'könne', 'schreibt', 'angebliches', 'mal', 'aktuell', 'angeblichen', 'behaupten', 'eindämmung', 'zufolge','jedoch', 'aussage', 'zugeschrieben', 'geld', 'eindruck', 'positiv', 'daten','zahl', 'berichtet', 'märz', 'davon', 'november', 'neben', 'bestätigt', 'leben', 'weniger', 'http', 'neuen', 'schutz', 'aktuellen', 'gab', 'halten', 'oft', 'vermeintliche', 'ganz', 'anfang', 'tag', 'aussagen', 'könnten', 'darunter', 'dezember', 'grund', 'erhalten', 'kommt', 'logo', 'unterstellt', 'erweckt', 'erst', 'wochen', 'gegeben', 'daher', 'zeit', 'gut', 'tage', 'sowie', 'rund', 'gestellt', 'screenshot', 'mitarbeiter', 'user', 'zweiten', 'april', 'geben', 'grafik', 'videos', 'fordert', 'häufig', 'außerdem','lautet', 'beiträgen', 'vermeintlichen', 'finden', 'gemacht', 'stellt', 'posts', 'personen', 'berichten', 'angegeben', 'verbreiten', 'arzt', 'präsident', 'bevölkerung', 'infektion', 'com', 'ländern', 'präsidenten', 'krise', 'bürger', 'rede', 'berichten', 'angegeben', 'verbreiten', 'fall', 'dpaq', 'runde', 'soziale', 'gebracht', 'worte', 'quelle', 'bringen', 'lesen', 'lange', 'tatsächlich', 'erneut', 'statt', 'september', 'weltweit', 'vielen', 'januar', 'nachdem', 'warnt', 'große', 'versucht', 'beweise', 'teilen', 'hingegen', 'juli', 'zusammen', 'luft', 'schreiben', 'wissen', 'per', 'monaten', 'beweis', 'anhand', 'dürfen', 'vermeintlich', 'twitter', 'blog', 'falsch', 'mitte', 'aufschrift', 'februar', 'trägt', 'kurz', 'cookies', 'browser']

        # Do some inference call to engine here and return output
        if self.model_type == "TopicalPageRank":
            pos = {'NOUN', 'PROPN', 'ADJ'} # the valid Part-of-Speeches to occur in the graph, e.g. {'NOUN', 'PROPN', 'ADJ'}
            grammar = self.model_params['grammar'] # the grammar for selecting the keyphrase candidates, e.g. "NP: {<ADJ>*<NOUN|PROPN>}"
            language = self.model_params['language'] # e.g. 'de'
            normalization = self.model_params['normalization']  # word normalization method, e.g. 'stemming'
            window = self.model_params['window'] # edges connecting two words occurring in a window are weighted by co-occurrence counts, e.g. 10
            max_count = self.model_params['max_count'] # maximal count of highest scored keyphrases, which are returned
            logging.info("TopicalPageRank model_input: {}".format(model_input))
            # 1. create a TopicalPageRank extractor.
            extractor = pke.unsupervised.TopicalPageRank()
            phrases_list = []
            for text_input in model_input:
                # 2. load the input text
                extractor.load_document(input=text_input,
                                        language=language,
                                        normalization=normalization)            
                # 3. select the noun phrases as keyphrase candidates.
                extractor.candidate_selection(grammar=grammar)
                # 4. weight the keyphrase candidates using Single Topical PageRank.
                #    Builds a word-graph in which edges connecting two words occurring
                #    in a window are weighted by co-occurrence counts.
                extractor.candidate_weighting(window=window,
                                            pos=pos,
                                            lda_model=self.model,
                                            stoplist=stoplist)
                # 5. get the highest scored candidates as keyphrases
                keyphrases = extractor.get_n_best(n=max_count)
                logging.info("text_input: {}. keyphrases: {}".format(text_input, keyphrases))
                phrases_list.append(keyphrases)
            return phrases_list
        elif self.model_type == "DocSim":
            # load model
#            with open(self.model, 'rb') as inp:
#                model = pickle.load(inp)
            # load word vectors
            model_wv = KeyedVectors.load(self.model)
            inference = []
            logging.info("DocSim model_input: {}".format(model_input))
            for text_input in model_input:
                logging.info("text_input: {}".format(text_input))
                # read string into dataframe
                df = pd.read_csv(StringIO(text_input), header=None)
                similarities = []
                for i, row in df.iterrows():
                    logging.info("{}. row: {}".format(i, row))
                    # prepare first document
                    logging.info("row.iloc[0]: {}".format(row.iloc[0]))
                    tokens = self.text_preprocess(row.iloc[0])
                    # Remove stop words
                    words1 = [w for w in tokens if w not in stoplist and w in model_wv.key_to_index]
                    logging.info("words1: {}".format(words1))
                    # prepare second document
                    logging.info("row.iloc[1]: {}".format(row.iloc[1]))
                    tokens = gensim.utils.simple_preprocess(row.iloc[1])
                    # Remove stop words
                    words2 = [w for w in tokens if w not in stoplist and w in model_wv.key_to_index]
                    logging.info("words2: {}".format(words2))
                    if (len(words1) == 0) or (len(words2) == 0):
                        similarities.append("0.00")
                        logging.warning("Word list is empty!")
                    else:
                        similarities.append(str(model_wv.n_similarity(words1, words2)))
                    logging.info("similarities: {}".format(similarities))
                inference.append(similarities)
                logging.info("inference: {}".format(inference))
            return inference
        else:
            logging.error("Model {} not supported!".format(self.model_type))
            raise RuntimeError("Model {} not supported!".format(self.model_type))
Example #17
    def process(self):
        """
		Loads the models and outputs the n nearest neighbours
		"""

        # Extract text input
        check_words = self.parameters.get("words")
        if not check_words:
            self.dataset.update_status(
                "No words to find nearest neighbours of were provided")
            self.dataset.finish(-1)
            return

        check_words = [word.strip() for word in check_words.split(",")]

        # Extract cosine threshold
        try:
            cosine_threshold = float(self.parameters.get("cosine_threshold"))
            if cosine_threshold > 1:
                cosine_threshold = 1
            if cosine_threshold < -1:
                cosine_threshold = -1
        except ValueError:
            self.dataset.update_status(
                "Invalid number of  provided. Insert a number between -1 and 1, like 0.75"
            )
            self.dataset.finish(-1)
            return

        # Extract top n
        try:
            top_n = int(self.parameters.get("top_n"))
            if top_n == 0:
                top_n = 10
            if top_n > 100:  # Can't be more than a hundred
                top_n = 100
        except ValueError:
            self.dataset.update_status(
                "Invalid number of nearest neighbours provided")
            self.dataset.finish(-1)
            return

        # Extract crawl depth
        crawl_depth = int(self.parameters.get("crawl_depth") or 1)
        if crawl_depth < 1 or crawl_depth > 3:
            crawl_depth = 1

        results = []
        results_path = self.dataset.get_results_path()
        tmp_dir = self.dataset.get_temporary_path()

        # Go through all archived token sets and generate collocations for each
        with zipfile.ZipFile(str(self.source_file), "r") as model_archive:

            # Get the filenames and only keep those containing the model (so e.g. no vectors.npy files)
            model_files = model_archive.namelist()
            model_names = [
                model_name for model_name in model_files
                if model_name.endswith(".model")
            ]

            if not model_names:
                return

            # Extract the models and output nearest neighbour(s)
            for model_name in model_names:

                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while loading token sets")

                # Get the date
                date_string = model_name.split('.')[0]

                # Words to crawl
                crawl_words = check_words
                words_crawled = []

                # Temporarily extract file (we cannot use ZipFile.open() as it doesn't support binary modes)
                tmp_file_path = tmp_dir.joinpath(model_name)
                model_archive.extract(model_name, tmp_dir)

                # Check if there's also a vectors.npy file (for large models) in the folder, and if so, extract it
                if model_name + ".vectors.npy" in model_files:
                    model_archive.extract(model_name + ".vectors.npy", tmp_dir)

                model = KeyedVectors.load(str(tmp_file_path), mmap="r")
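                # The calls below (model.wv.most_similar, model.vocab[...].count) follow the gensim 3.x API;
                # gensim 4+ calls most_similar on the KeyedVectors directly and reads counts via get_vecattr(word, "count").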

                # Keep this loop going as long as we haven't reached the crawl limit.
                for i in range(crawl_depth):

                    new_crawl_words = []

                    # Check certain words in this model
                    for word in crawl_words:

                        # Get the nearest neighbours
                        try:
                            nearest_neighbours = model.wv.most_similar(
                                positive=[word], topn=top_n)

                        # If not in vocabulary
                        except KeyError as e:
                            results.append({
                                "source_word": word,
                                "nearest_neighbour":
                                "ERROR: input word not in this model's vocabulary, be sure to insert lemmatized or stemmed versions",
                                "cosime_similarity": 0,
                                "source_occurrences": 0,
                                "target_occurrences": 0,
                                "model": model_name,
                                "date": date_string
                            })
                            continue

                        # Check each nearest neighbour against the cosine similarity threshold
                        for nearest_neighbour in nearest_neighbours:
                            # Cosine similarity threshold check
                            if nearest_neighbour[1] >= cosine_threshold:
                                results.append({
                                    "source_word": word,
                                    "nearest_neighbour": nearest_neighbour[0],
                                    "cosine_similarity": nearest_neighbour[1],
                                    # How often the source word appears in the model
                                    "source_occurrences": model.vocab[word].count,
                                    # How often the target word appears in the model
                                    "target_occurrences": model.vocab[nearest_neighbour[0]].count,
                                    "model": model_name,
                                    "date": date_string
                                })

                                # To check in possible next crawl
                                if nearest_neighbour[0] not in words_crawled:
                                    new_crawl_words.append(
                                        nearest_neighbour[0])

                    # After first crawl, prepare new words to check
                    crawl_words = new_crawl_words

            # Delete the temporary folder
            shutil.rmtree(tmp_dir)

        if not results:
            return

        # Generate csv and finish
        self.dataset.update_status("Writing to csv and finishing")
        self.write_csv_items_and_finish(results)
Example #18
DEBUG = not "FILTER_PRODUCTION" in environ

data_dir = "data" if DEBUG else "/data"

if DEBUG:
    app.config["CACHE_TYPE"] = "null"
else:
    app.config["CACHE_TYPE"] = "redis"
    app.config["CACHE_REDIS_URL"] = environ["REDIS_URL"]
    app.config["CACHE_DEFAULT_TIMEOUT"] = 60 * 60 * 24 * 14  # 2 weeks

cache = Cache(app)

vecs = {}
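# Load every *.model file once at startup; mmap="r" lets multiple worker processes share one copy
# of the underlying vector arrays.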
for m in Path(data_dir).glob("*.model"):
    vecs[m.stem] = KeyedVectors.load(str(m), mmap="r")


@app.route("/typeahead/<vec_name>")
@cache.cached(query_string=True)
def typeahead(vec_name):
    q = request.args.get("q", type=str)

    if q == '':
        return jsonify({"tokens": []})

    v = vecs[vec_name]

    q = re.sub(r"\d+", "0", q)
    q = q.lower()
    len_d = len(tokens)
    for word in tokens:
        if word not in inverse_dict.keys():
            continue
        X_train_bm25[ind, inverse_dict[word][-1]] = bm25_vectorizer(
            tf_values[word], len_d, corpus_len,
            len(inverse_dict[word]) - 1)

with open('bm25_inverse_dict.json', 'w') as fp:
    json.dump(inverse_dict, fp)
np.save('X_train_bm25.npy', X_train_bm25)

#--------------------------------------------------------------------------------------------w2v block

model_file = 'araneum_none_fasttextcbow_300_5_2018.model'
model = KeyedVectors.load(model_file)


def normalize_vec(vec):
    return vec / np.linalg.norm(vec)


def create_doc_vector(text):
    # create the mask vector (one zero row per lemma)
    lemmas = text.split()
    lemmas_vectors = np.zeros((len(lemmas), model.vector_size))
    # if the word is in the model, take its vector
    for idx, lemma in enumerate(lemmas):
        if lemma in model:
            lemmas_vectors[idx] = normalize_vec(model[lemma])
    # check for the case where an empty array was passed in