def tokenize(self, stat_ques_dict, use_glove):
    token_to_ix = {
        'PAD': 0,
        'UNK': 1,
        'CLS': 2,
    }

    spacy_tool = None
    pretrained_emb = []
    if use_glove:
        spacy_tool = en_vectors_web_lg.load()
        pretrained_emb.append(spacy_tool('PAD').vector)
        pretrained_emb.append(spacy_tool('UNK').vector)
        pretrained_emb.append(spacy_tool('CLS').vector)

    max_token = 0
    for qid in stat_ques_dict:
        ques = stat_ques_dict[qid]['question']
        words = re.sub(
            r"([.,'!?\"()*#:;])", '',
            ques.lower()
        ).replace('-', ' ').replace('/', ' ').split()
        if len(words) > max_token:
            max_token = len(words)

        for word in words:
            if word not in token_to_ix:
                token_to_ix[word] = len(token_to_ix)
                if use_glove:
                    pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)

    return token_to_ix, pretrained_emb, max_token
def tokenize(stat_ques_list, use_glove):
    token_to_ix = {
        'PAD': 0,
        'UNK': 1,
    }

    spacy_tool = None
    pretrained_emb = []
    if use_glove:
        spacy_tool = en_vectors_web_lg.load()
        pretrained_emb.append(spacy_tool('PAD').vector)
        pretrained_emb.append(spacy_tool('UNK').vector)

    for ques in stat_ques_list:
        words = re.sub(
            r"([.,'!?\"()*#:;])", '',
            ques['question'].lower()
        ).replace('-', ' ').replace('/', ' ').split()

        for word in words:
            if word not in token_to_ix:
                token_to_ix[word] = len(token_to_ix)
                if use_glove:
                    pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)

    return token_to_ix, pretrained_emb
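The token_to_ix vocabulary returned by tokenize is normally used afterwards to turn each question into a fixed-length index sequence for an embedding layer. A minimal sketch of that step, assuming the PAD (0) and UNK (1) indices above; the helper name proc_ques and the max_token default are illustrative, not part of the snippets here.

def proc_ques(ques, token_to_ix, max_token=14):
    # Index vector of fixed length, pre-filled with PAD (index 0)
    ques_ix = np.zeros(max_token, np.int64)

    words = re.sub(
        r"([.,'!?\"()*#:;])", '',
        ques['question'].lower()
    ).replace('-', ' ').replace('/', ' ').split()

    for ix, word in enumerate(words):
        if ix == max_token:
            break
        # Map out-of-vocabulary words to UNK (index 1)
        ques_ix[ix] = token_to_ix.get(word, token_to_ix['UNK'])

    return ques_ix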
def tokenize(self, stat_caps_list, use_glove):
    max_token = 0
    token_to_ix = {
        'PAD': 0,
        'UNK': 1,
        'CLS': 2,
    }

    spacy_tool = None
    pretrained_emb = []
    if use_glove:
        spacy_tool = en_vectors_web_lg.load()
        pretrained_emb.append(spacy_tool('PAD').vector)
        pretrained_emb.append(spacy_tool('UNK').vector)
        pretrained_emb.append(spacy_tool('CLS').vector)

    for cap in stat_caps_list:
        words = re.sub(
            r"([.,'!?\"()*#:;])", '',
            cap.lower()
        ).replace('-', ' ').replace('/', ' ').split()
        max_token = max(len(words), max_token)

        for word in words:
            if word not in token_to_ix:
                token_to_ix[word] = len(token_to_ix)
                if use_glove:
                    pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)

    return token_to_ix, pretrained_emb, max_token
def create_dict(key_to_sentence, dataroot, use_glove=True):
    token_file = dataroot + "/token_to_ix.pkl"
    glove_file = dataroot + "/train_glove.npy"
    if os.path.exists(glove_file) and os.path.exists(token_file):
        print("Loading train language files")
        return pickle.load(open(token_file, "rb")), np.load(glove_file)

    print("Creating train language files")
    token_to_ix = {
        'UNK': 1,
    }

    spacy_tool = None
    pretrained_emb = []
    if use_glove:
        spacy_tool = en_vectors_web_lg.load()
        pretrained_emb.append(spacy_tool('UNK').vector)

    for k, v in key_to_sentence.items():
        for word in v:
            if word not in token_to_ix:
                token_to_ix[word] = len(token_to_ix)
                if use_glove:
                    pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)
    np.save(glove_file, pretrained_emb)
    pickle.dump(token_to_ix, open(token_file, "wb"))
    return token_to_ix, pretrained_emb
def get_pretrained_emd_OOV(all_tokens):
    glove = en_vectors_web_lg.load()
    pretrained_emb = []
    if isinstance(all_tokens, list):
        for token in all_tokens:
            pretrained_emb.append(glove(token).vector)
    return pretrained_emb
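A brief usage sketch for the helper above, assuming the en_vectors_web_lg package is installed; the example token list and the np.stack call are illustrative additions, not from the original code.

# Hypothetical usage: build a (num_tokens, 300) GloVe embedding matrix
all_tokens = ['PAD', 'UNK', 'photo', 'man', 'playing']
emb_list = get_pretrained_emd_OOV(all_tokens)
emb_matrix = np.stack(emb_list)
print(emb_matrix.shape)  # (5, 300) -- en_vectors_web_lg vectors are 300-dimensional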
def calculate_similarity(src_files, bug_reports):
    # Loading word vectors
    nlp = en_vectors_web_lg.load()

    src_docs = [
        nlp(' '.join(src.file_name['unstemmed']
                     + src.class_names['unstemmed']
                     + src.attributes['unstemmed']
                     + src.comments['unstemmed']
                     + src.method_names['unstemmed']))
        for src in src_files.values()
    ]
    min_max_scaler = MinMaxScaler()

    all_simis = []
    for report in bug_reports.values():
        report_doc = nlp(' '.join(report.summary['unstemmed']
                                  + report.pos_tagged_description['unstemmed']))
        scores = []
        for src_doc in src_docs:
            simi = report_doc.similarity(src_doc)
            scores.append(simi)

        scores = np.array([float(count) for count in scores]).reshape(-1, 1)
        normalized_scores = np.concatenate(min_max_scaler.fit_transform(scores))
        all_simis.append(normalized_scores.tolist())

    return all_simis
def get_pretrained_emb(all_tokens):
    # all_tokens is expected to map an index/key to its token string
    glove = en_vectors_web_lg.load()
    pretrained_emb = []
    for token_ix in all_tokens:
        pretrained_emb.append(glove(all_tokens[token_ix]).vector)
    return pretrained_emb
def tokenize(self, json_file, use_glove):
    token_to_ix, max_token = json.load(open(json_file, 'r'))[2:]

    spacy_tool = None
    if use_glove:
        spacy_tool = en_vectors_web_lg.load()

    pretrained_emb = []
    for word in token_to_ix:
        if use_glove:
            pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)
    return token_to_ix, pretrained_emb, max_token
def tokenize(self, stat_ques_list, use_glove):
    t1 = time.time()
    token_to_ix = {
        'PAD': 0,
        'UNK': 1,
        'CLS': 2,
    }

    spacy_tool = None
    pretrained_emb = []
    if use_glove:
        spacy_tool = en_vectors_web_lg.load()
        pretrained_emb.append(spacy_tool('PAD').vector)
        pretrained_emb.append(spacy_tool('UNK').vector)
        pretrained_emb.append(spacy_tool('CLS').vector)
    t2 = time.time()
    print("first part: %f" % (t2 - t1))

    for ques in stat_ques_list:
        words = re.sub(
            r"([.,'!?\"()*#:;])", '',
            ques['question'].lower()
        ).replace('-', ' ').replace('/', ' ').split()

        for word in words:
            if word not in token_to_ix:
                token_to_ix[word] = len(token_to_ix)
                if use_glove:
                    pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)
    t3 = time.time()
    print("second part: %f" % (t3 - t2))

    return token_to_ix, pretrained_emb
def __init__(self, opt, folder='./result', batchsize=64, max_length=15, mode='train'):
    self.opt = opt
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./%s/vdict.json' % folder, 'r') as f:
        self.vdict = json.load(f)
    with open('./%s/adict.json' % folder, 'r') as f:
        self.adict = json.load(f)

    self.n_ans_vocabulary = len(self.adict)
    self.nlp = en_vectors_web_lg.load()
    self.glove_dict = {}  # word -> glove vector
def tokenize(self, stat_refs_list, use_glove):
    token_to_ix = {
        'PAD': 0,
        'UNK': 1,
        'CLS': 2,
    }

    spacy_tool = None
    pretrained_emb = []
    if use_glove:
        spacy_tool = en_vectors_web_lg.load()
        pretrained_emb.append(spacy_tool('PAD').vector)
        pretrained_emb.append(spacy_tool('UNK').vector)
        pretrained_emb.append(spacy_tool('CLS').vector)

    for ref_ in stat_refs_list:
        words = ref_['tokens']
        for word in words:
            if word not in token_to_ix:
                token_to_ix[word] = len(token_to_ix)
                if use_glove:
                    pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)

    return token_to_ix, pretrained_emb
def tokenize(qns_list):
    token_to_ix = {
        'PAD': 0,
        'UNK': 1,
    }

    pretrained_emb = []
    spacy_tool = en_vectors_web_lg.load()
    pretrained_emb.append(spacy_tool('PAD').vector)
    pretrained_emb.append(spacy_tool('UNK').vector)

    for qn in qns_list:
        words = re.sub(
            r"([.,'!?\"()*#:;])", '',
            qn['question'].lower()
        ).replace('-', ' ').replace('/', ' ').split()

        for word in words:
            if word not in token_to_ix:
                token_to_ix[word] = len(token_to_ix)
                pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)

    return token_to_ix, pretrained_emb
def __init__(self, __C):
    super(DataSet, self).__init__()
    self.__C = __C

    # --------------------------
    # ---- Raw data loading ----
    # --------------------------

    print("Loading all questions (for statistics)")
    # Loading question word list
    stat_ques_list = \
        json.load(open(__C.RAW_PATH[__C.DATASET]['train'], 'r'))['questions'] + \
        json.load(open(__C.RAW_PATH[__C.DATASET]['val'], 'r'))['questions'] + \
        json.load(open(__C.RAW_PATH[__C.DATASET]['test'], 'r'))['questions'] + \
        json.load(open(__C.RAW_PATH[__C.DATASET]['vg'], 'r'))['questions']
    '''
    stat_ques_list = [
        {'image_id': 458752, 'question': 'What is this photo taken looking through?', 'question_id': 458752000},
        {'image_id': 458752, 'question': 'What position is this man playing?', 'question_id': 458752001}
    ]
    '''

    # Loading answer word list
    stat_ans_list = \
        json.load(open(__C.RAW_PATH[__C.DATASET]['train-anno'], 'r'))['annotations'] + \
        json.load(open(__C.RAW_PATH[__C.DATASET]['val-anno'], 'r'))['annotations']

    if self.__C.USE_NEW_QUESTION == "False":
        print("Loading all image features")
        # Loading all image paths
        frcn_feat_path_list = \
            glob.glob(__C.FEATS_PATH[__C.DATASET]['train'] + '/*.npz') + \
            glob.glob(__C.FEATS_PATH[__C.DATASET]['val'] + '/*.npz') + \
            glob.glob(__C.FEATS_PATH[__C.DATASET]['test'] + '/*.npz')

        # Loading question and answer list
        self.ques_list = []
        self.ans_list = []

        print("Loading split questions and answers")
        split_list = __C.SPLIT[__C.RUN_MODE].split('+')
        for split in split_list:
            self.ques_list += json.load(open(__C.RAW_PATH[__C.DATASET][split], 'r'))['questions']
            # if __C.RUN_MODE in ['train']:
            self.ans_list += json.load(open(__C.RAW_PATH[__C.DATASET][split + '-anno'], 'r'))['annotations']

        # Define run data size
        if __C.RUN_MODE in ['train']:
            self.data_size = self.ans_list.__len__()
        else:
            self.data_size = self.ques_list.__len__()
        # assert len(self.ques_list) == len(self.ans_list), "Size of question list and answer list does not match"

        print(' ========== Dataset size:', self.data_size)

    if self.__C.USE_NEW_QUESTION == "True" and self.__C.RUN_MODE == "test":
        print("Loading image features of image_id: {}".format(self.__C.IMAGE_ID))
        # Loading all image paths
        frcn_feat_path_list = glob.glob(
            __C.FEATS_PATH[__C.DATASET]['test'] + '/COCO_test2015_' +
            str(self.__C.IMAGE_ID).zfill(12) + '.jpg.npz')

        print("Loading the specified question")
        # Loading question word list
        temp_json = {}
        temp_json['image_id'] = self.__C.IMAGE_ID
        temp_json['question'] = self.__C.NEW_QUESTION
        temp_json['question_id'] = 100000001
        self.ques_list = [temp_json]

        self.data_size = self.ques_list.__len__()
        print(' ========== Dataset size:', self.data_size)

    # ------------------------
    # ---- Data statistic ----
    # ------------------------

    # {image id} -> {image feature absolute path}
    self.iid_to_frcn_feat_path = self.img_feat_path_load(frcn_feat_path_list)

    # {question id} -> {question}
    self.qid_to_ques = self.ques_load(self.ques_list)
    '''
    To print 2 items from each dictionary:
        from itertools import islice
        print(list(islice(self.qid_to_ques.items(), 2)))
        print(list(islice(self.iid_to_frcn_feat_path.items(), 2)))

    qid_to_ques = {
        '458752000': {'image_id': 458752, 'question': 'What is this photo taken looking through?', 'question_id': 458752000},
        '458752001': {'image_id': 458752, 'question': 'What position is this man playing?', 'question_id': 458752001}
    }
    iid_to_frcn_feat_path = {
        '187465': './data/vqa/feats/train2014/COCO_train2014_000000187465.jpg.npz',
        '78909': './data/vqa/feats/train2014/COCO_train2014_000000078909.jpg.npz'
    }
    '''

    # Tokenize
    # For tokenizing we need a spaCy tool, declared here
    self.spacy_tool = en_vectors_web_lg.load()
    print('')

    # Number of different words in all questions combined
    print("Tokenising questions")
    self.token_to_ix, self.pretrained_emb = self.tokenize(stat_ques_list, __C.USE_GLOVE)
    self.token_size = self.token_to_ix.__len__()
    print(' ========== Question token vocab size:', self.token_size)

    # Answers statistic
    # Tokenize and make a vocabulary of each word in the answer as separate tokens
    # Edits: added the initialization of these two only when mode is train
    # if __C.RUN_MODE in ['train']:
    print("Tokenising answers")
    # Number of different words in all answers combined
    self.token_to_ix_ans, self.pretrained_emb_ans = self.tokenize_ans(stat_ans_list, __C.USE_GLOVE)
    self.token_size_ans = self.token_to_ix_ans.__len__()
    print(" ========== Answer token vocab size: ", self.token_size_ans)
    '''
    token_to_ix_ans = {
        'PAD': 0, 'UNK': 1, 'CLS': 2, 'net': 3, 'pitcher': 4, 'orange': 5, 'yes': 6,
        'white': 7, 'skiing': 8, 'red': 9, 'frisbee': 10, ...
    }

    To print the first 50 items:
        from itertools import islice
        print(list(islice(self.token_to_ix_ans.items(), 50)))
        sys.exit(0)
    '''
    # End of our edit

    ans_freq = 8
    self.ans_to_ix, self.ix_to_ans = self.ans_stat_from_file('openvqa/datasets/vqa/answer_dict.json')
    # self.ans_to_ix, self.ix_to_ans = self.ans_stat(stat_ans_list, ans_freq=ans_freq)
    self.ans_size = self.ans_to_ix.__len__()
    print(' ========== Answer token vocab size (occur more than {} times):'.format(ans_freq), self.ans_size)
    print('Finished!')
    print('')
""" calculate semantic association value value locates in [0, 1] """ # import spacy # nlp = spacy.load('en_vectors_web_lg') import en_vectors_web_lg nlp = en_vectors_web_lg.load() def SAV(word1, word2): """calculate semantic association value of two words @word1 -- the first word @word2 -- the second word @return -- a float number means semantic association value """ vector1 = nlp.vocab[word1] vector2 = nlp.vocab[word2] return vector1.similarity(vector2) def main(word1, word2): """for testing functions in this module """ print('word1: ', word1) print('word2: ', word2) print('semantic association value: {v: .2%}'.format(v=SAV(word1, word2)))
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 15
MAX_LEN = 40
LEARN_RATE = 0.001
BY_SENTENCE = True
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'

if os.name == 'nt':
    path = 'C:\\Users\\EGimenez\\Programming\\PyCharmProjects\\Kaggle\\data\\Tweets\\'
else:
    path = '/content/'

spell = SpellChecker()

print("Loading spaCy")
nlp_ents = en_core_web_sm.load()
nlp_vects = en_vectors_web_lg.load()
nlp_vects.add_pipe(nlp_vects.create_pipe("sentencizer"))

train_df = pd.read_csv(os.path.join(path, 'train.csv'))
test_df = pd.read_csv(os.path.join(path, 'test.csv'))


def manage_spelling(tweets):
    for i, tweet in enumerate(tweets):
        if i % 25 == 0:
            print(i)
        line = str(tweet)
        for word in tweet:
            if not word.has_vector: