def build_model(params, with_dis):
    """
    Build all components of the model.
    """
    # source embeddings
    src_dico, _src_emb = load_embeddings(params, source=True)
    params.src_dico = src_dico
    src_emb = nn.Embedding(len(src_dico), params.emb_dim, sparse=True)
    src_emb.weight.data.copy_(_src_emb)

    # target embeddings
    if params.tgt_lang:
        tgt_dico, _tgt_emb = load_embeddings(params, source=False)
        params.tgt_dico = tgt_dico
        tgt_emb = nn.Embedding(len(tgt_dico), params.emb_dim, sparse=True)
        tgt_emb.weight.data.copy_(_tgt_emb)
    else:
        tgt_emb = None

    # mapping
    mapping = nn.Linear(params.emb_dim, params.emb_dim, bias=False)
    if getattr(params, 'map_id_init', True):
        mapping.weight.data.copy_(torch.diag(torch.ones(params.emb_dim)))

    # normalize embeddings
    params.src_mean = normalize_embeddings(src_emb.weight.data,
                                           params.normalize_embeddings)
    if params.tgt_lang:
        params.tgt_mean = normalize_embeddings(tgt_emb.weight.data,
                                               params.normalize_embeddings)

    return src_emb, tgt_emb, mapping
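# ---------------------------------------------------------------------------
# Usage sketch (not from the original repo): the core idea of build_model above
# is an nn.Embedding filled with pretrained vectors plus a bias-free linear
# mapping initialized to the identity, so training starts from a no-op map.
# The random `pretrained` matrix below is a stand-in for actual loaded vectors.
import torch
import torch.nn as nn

emb_dim, vocab_size = 300, 1000
pretrained = torch.randn(vocab_size, emb_dim)
src_emb = nn.Embedding(vocab_size, emb_dim, sparse=True)
src_emb.weight.data.copy_(pretrained)

mapping = nn.Linear(emb_dim, emb_dim, bias=False)
mapping.weight.data.copy_(torch.diag(torch.ones(emb_dim)))  # identity init

ids = torch.tensor([1, 2, 3])
assert torch.allclose(mapping(src_emb(ids)), src_emb(ids))  # starts as a no-op
# ---------------------------------------------------------------------------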
def export(self):
    """
    Export embeddings.
    """
    params = self.params

    # load all embeddings
    params.src_dico, src_emb = load_embeddings(params, source=True, full_vocab=True)
    params.tgt_dico, tgt_emb = load_embeddings(params, source=False, full_vocab=True)

    # apply same normalization as during training
    normalize_embeddings(src_emb, params.normalize_embeddings, mean=params.src_mean)
    normalize_embeddings(tgt_emb, params.normalize_embeddings, mean=params.tgt_mean)

    # map source embeddings to the target space
    bs = 4096
    for i, k in enumerate(range(0, len(src_emb), bs)):
        x = Variable(src_emb[k:k + bs], volatile=True)
        src_emb[k:k + bs] = self.mapping(x).data

    # write embeddings to the disk
    export_embeddings(src_emb, tgt_emb, params)
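# ---------------------------------------------------------------------------
# A minimal sketch of the batched mapping loop above in modern PyTorch, where
# torch.no_grad() replaces the deprecated Variable(..., volatile=True) idiom.
# The shapes and the Linear layer here are illustrative stand-ins.
import torch

src_emb = torch.randn(10000, 300)
mapping = torch.nn.Linear(300, 300, bias=False)
bs = 4096
with torch.no_grad():
    for k in range(0, len(src_emb), bs):
        src_emb[k:k + bs] = mapping(src_emb[k:k + bs])
# ---------------------------------------------------------------------------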
def load_embedding_dict(vocab_path="", vector_path="", embeddings_path="",
                        glove=False, postspec=False):
    """
    >>> load_embedding_dict()
    :param vocab_path:
    :param vector_path:
    :return: embd_dict
    """
    if glove and postspec:
        raise ValueError("Glove and postspec cannot both be true")
    elif glove:
        if os.name == "nt":
            embd_dict = utils.load_embeddings(
                "C:/Users/anlausch/workspace/embedding_files/glove.6B/glove.6B.300d.txt",
                word2vec=False)
        else:
            embd_dict = utils.load_embeddings(
                "/work/anlausch/glove.6B.300d.txt", word2vec=False)
        return embd_dict
    elif postspec:
        embd_dict_temp = utils.load_embeddings(
            "/work/anlausch/ft_postspec.txt", word2vec=False)
        embd_dict = {}
        for key, value in embd_dict_temp.items():
            embd_dict[key.split("en_")[1]] = value
        assert "test" in embd_dict
        assert "house" in embd_dict
        return embd_dict
    elif embeddings_path.endswith("p"):
        # load the embedding dictionary from a pickle file
        with open(embeddings_path, 'rb') as handle:
            embd_dict = pickle.load(handle)
        return embd_dict
    elif embeddings_path != "":
        embd_dict = utils.load_embeddings(embeddings_path, word2vec=False)
        return embd_dict
    else:
        embd_dict = {}
        vocab = load_vocab_goran(vocab_path)
        vectors = load_vectors_goran(vector_path)
        for term, index in vocab.items():
            embd_dict[term] = vectors[index]
        assert len(embd_dict) == len(vocab)
        return embd_dict
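# ---------------------------------------------------------------------------
# Toy sketch of the final (vocab + vectors) branch above: pairing a
# {term: index} vocabulary with a vector matrix to build {term: vector}.
# All names and sizes here are illustrative.
import numpy as np

vocab = {"test": 0, "house": 1}
vectors = np.random.rand(len(vocab), 300)
embd_dict = {term: vectors[index] for term, index in vocab.items()}
assert len(embd_dict) == len(vocab)
# ---------------------------------------------------------------------------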
def _init(self):
    logger.info("Initializing ...")
    self.entity2id, self.id2entity, self.entid2tags = utils.generate_entity_property_idx(
        self.entityPath)
    self.property2id, self.id2property, self.proid2tags = utils.generate_entity_property_idx(
        self.propertyPath)
    self.entid2tycid = utils.generate_entity_tyc_idx(
        self.tycWordsPath, self.entity2id)
    self.train2id = utils.generate_data_idx(self.trainPath, self.entity2id,
                                            self.property2id)
    self.train2id_set = set(' '.join(map(str, t)) for t in self.train2id)  # used for sampling
    self.conid2attrid = utils.generate_conceptid_to_attributesid(
        self.conceptAttrPath, self.entity2id, self.property2id,
        self.max_attr_size)
    self.conAttr2id, self.conAttr2id_set = utils.generate_concept_attributes_idx(
        self.conceptAttrPath, self.entity2id, self.property2id)
    self.dev2id = utils.generate_data_idx(self.devPath, self.entity2id,
                                          self.property2id)
    self.test2id = utils.generate_data_idx(self.testPath, self.entity2id,
                                           self.property2id)
    self.test_entity_candidate_ids = utils.read_sample_candidates(
        self.test_entity_candi_path, self.entity2id)
    self.test_attr_candidate_ids = utils.read_sample_candidates(
        self.test_attr_candi_path, self.property2id)
    self.sample_ent_cand_ids = utils.read_sample_candidates(
        self.sample_ent_candi_path, self.entity2id)
    self.sample_attr_cand_ids = utils.read_sample_candidates(
        self.sample_attr_candi_path, self.property2id)

    self.trainTotal = len(self.train2id)
    self.conceptAttrTotal = len(self.conid2attrid)
    self.devTotal = len(self.dev2id)
    self.testTotal = len(self.test2id)
    self.entityTotal = len(self.entity2id)
    self.propertyTotal = len(self.property2id)

    # tencent init
    if self.embeddingPath is not None:
        self.ent_embeddings = utils.load_embeddings(
            self.entity2id, self.embeddingPath, self.entityTotal,
            self.ent_size)
        self.rel_embeddings = utils.load_embeddings(
            self.property2id, self.embeddingPath, self.propertyTotal,
            self.rel_size)

    self.dev2id_batches = utils.get_batches(self.dev2id, self.batch_size)
    self.test2id_batches = utils.get_batches(self.test2id, self.batch_size)
def main(_):
    config = load_config(FLAGS.config)

    # Load saved model
    print "Loading model"
    model_path = os.path.join(config.data.ckpt, 'model.pt')
    model = torch.load(model_path)
    model.eval()

    # Load embeddings and (test) datasets
    l1_embeddings, l1_vocab = load_embeddings(path=config.data.l1_embeddings)
    l2_embeddings, l2_vocab = load_embeddings(path=config.data.l2_embeddings)

    # Translate all test files
    start = time.time()
    beam_size = 12
    test_dirs = ['data/test_en', 'data/test_fr']
    for test_dir in test_dirs:
        src_lang = test_dir.split('_')[-1]
        if src_lang == 'en':
            src_lang = 'l1'
            src_vocab = l1_vocab
            tgt_lang = 'l2'
            tgt_vocab = l2_vocab
        elif src_lang == 'fr':
            src_lang = 'l2'
            src_vocab = l2_vocab
            tgt_lang = 'l1'
            tgt_vocab = l1_vocab
        else:
            raise ValueError('Unsupported source language: %s' % src_lang)

        test_dataset = MonolingualDataset(folder=test_dir, vocab=src_vocab)
        test_loader = MonolingualDataLoader(test_dataset)
        test_file = test_dataset._paths[0].split('/')[2]
        print test_file, src_lang

        f = open('test_translated/' + test_file + '_translated', 'w')
        for i, sample in enumerate(test_loader):
            sample = {k: v.cuda() for k, v in sample.items() if v is not None}
            src, lengths, _, _ = transform_inputs(
                src=sample['src'],
                lengths=sample['src_len'],
                tgt=sample['src'])
            translated = translate(model, src, src_lang, lengths.data,
                                   beam_size, config.data.max_length, tgt_vocab)
            f.write(translated + '\n')
        f.close()
    print "Time to translate file (secs): ", time.time() - start
def _get_data(config, logger, name):
    """ Get all the required data for training/testing the classifier """
    # load bert-related stuff
    bert_models = {'bert': 'allenai/scibert_scivocab_uncased',
                   'roberta': 'allenai/biomed_roberta_base'}
    vocab = bert_models[config['bert_model']]

    # load data
    partition, training_generator, validation_generator = load_data(
        config, vocab, max_len=config['max_len'])

    # get the embeddings: either from scratch, or from cache
    logger.info(f" Getting {config['embedding_type']} embeddings ...")
    if config['embedding_type'] in ('bert', 'roberta'):
        embed_shape, train_embeddings, valid_embeddings = load_embeddings(
            config, name, vocab, training_generator, validation_generator)
    elif config['embedding_type'] == 'specter':
        # load precomputed embeddings from the given filepath
        with open(config["precomputed_embedding_path"], 'rb') as handle:
            embed_shape, train_embeddings, valid_embeddings = pickle.load(handle)
    else:
        logger.error("Only BERT, RoBERTa, and SPECTER embeddings accepted.")
        raise ValueError("Only BERT, RoBERTa, and SPECTER embeddings accepted.")

    # dimension reduction: PCA (either from scratch, or from cache)
    if config["do_pca"]:
        logger.info(' Reducing embedding dimensions...')
        embed_shape, train_embeddings, valid_embeddings = get_pca_embeddings(
            config, name, train_embeddings, valid_embeddings)

    logger.info(' Dataset is: {} and PCA was performed: {}'.format(
        name, config["do_pca"]))
    logger.info(f'\n Num. training samples: {len(training_generator)}'
                f'\n Num. validation samples: {len(validation_generator)}')

    return embed_shape, train_embeddings, valid_embeddings
def load_data(feature_type='identity', embedding_file=None):
    # Load graph.
    graph = utils.load_graph()
    node_ids = list(range(len(graph.nodes)))

    # Choose node features from identity, adjacency matrix, or embeddings.
    if feature_type == 'identity':
        node_features = np.eye(len(graph.nodes))
    elif feature_type == 'adjacency':
        node_features = nx.to_numpy_matrix(graph, node_ids)
    elif feature_type == 'embedding':
        embedding_path = 'node2vec/embeddings/' + embedding_file
        embeddings = utils.load_embeddings(embedding_path)
        node_features = np.array([embeddings[nid] for nid in node_ids])

    # Extract graph info to create torch geometric data object.
    x = torch.tensor(node_features, dtype=torch.float)
    y = torch.tensor(get_labels(graph), dtype=torch.long)
    edge_index, edge_attr = get_edges(graph)
    data = Data(x=x, edge_index=edge_index, y=y)

    # Obtain train/val/test splits.
    get_masks(data)

    return data
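# ---------------------------------------------------------------------------
# Hedged usage sketch for load_data above, with networkx's built-in karate
# club graph standing in for utils.load_graph(); 'identity' features are
# simply a one-hot vector per node.
import networkx as nx
import numpy as np

graph = nx.karate_club_graph()
node_features = np.eye(len(graph.nodes))  # feature_type == 'identity'
print(node_features.shape)                # (34, 34)
# ---------------------------------------------------------------------------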
def test_kmedoids(self, emb_filename, res_filename, budget):
    print(res_filename)
    # stats = ut.graph_stats(self.G, print_stats=False)
    v, em = ut.load_embeddings(emb_filename, self.G.nodes())
    influenced, influenced_grouped = [], []
    seeds = []

    for k in range(1, budget + 1):
        print('--------', k)
        S = ut.get_kmedoids_centers(em, k, v)

        I, I_grouped = map_fair_IC((self.G, S))
        influenced.append(I)
        influenced_grouped.append(I_grouped)

        S_g = {c: [] for c in np.unique(
            [self.G.nodes[v]['color'] for v in self.G.nodes])}
        for n in S:
            c = self.G.nodes[n]['color']
            S_g[c].append(n)
        seeds.append(S_g)  # ids of the seeds so the influence can be recreated

    ut.write_files(res_filename, influenced, influenced_grouped, seeds)
def read_files(folder, parent):
    print("[Local-embedding] Reading file:", parent)
    emb_file = '%s/embeddings.txt' % folder
    hier_file = '%s/hierarchy.txt' % folder
    keyword_file = '%s/keywords.txt' % folder

    ## here only consider those remaining keywords
    embs = utils.load_embeddings(emb_file)
    keywords = set()
    cates = {}

    with open(keyword_file) as f:
        for line in f:
            keywords.add(line.strip('\r\n'))

    tmp_embs = {}
    for k in keywords:
        if k in embs:
            tmp_embs[k] = embs[k]
    embs = tmp_embs

    with open(hier_file) as f:
        for line in f:
            segs = line.strip('\r\n').split(' ')
            if segs[1] == parent:
                cates[segs[0]] = set()

    print('[Local-embedding] Finish reading embedding, hierarchy and keywords files.')
    return embs, keywords, cates
def main():
    # Build the unigram and bigram vocabularies on the training set
    word2id = load_word2id(length=VOCAB_SIZE)

    # Prepare the data loaders for the deep learning models
    train_loader_dl = DataLoader(
        dataset=DianPingDataSet("train"),
        batch_size=64,
        collate_fn=partial(collate_fn_dl, word2id, SENT_MAX_LEN)
    )
    test_loader_dl = DataLoader(
        dataset=DianPingDataSet("test"),
        batch_size=64,
        collate_fn=partial(collate_fn_dl, word2id, SENT_MAX_LEN)
    )
    vocab_size = len(word2id)
    print("Vocab Size:", vocab_size)

    print("Loading word vectors...")
    try:
        embedding = load_embeddings(word2id)
    except FileNotFoundError:
        embedding = None

    # Train and evaluate the deep learning models (CNN, LSTM)
    print("Training the BiLSTM model...")
    lstm_model = DeepModel(vocab_size, embedding, method="lstm")
    lstm_model.train_and_eval(train_loader_dl, test_loader_dl)

    print("Training the CNN model...")
    cnn_model = DeepModel(vocab_size, embedding, method="cnn")
    cnn_model.train_and_eval(train_loader_dl, test_loader_dl)
def get_related_figures(identifiers_dir, text_data_dir, embeddings_dir,
                        test_identifiers_dir, output_dir):
    """Get semantically related figures for a set of test figures.

    Args:
        identifiers_dir: (string) identifiers of all figures in the collection.
        text_data_dir: (string) the file with the text for each figure
            (for keyword retrieval purposes).
        embeddings_dir: (string) the embedding vectors for all figures in the
            collection.
        test_identifiers_dir: (string) the figures for which we want to find
            related figures (a subset of the full collection).
        output_dir: (string) directory for the output data.

    Returns:
        None. Outputs the related figures to a file.
    """
    test_identifiers = utils.read_lines_from_file(test_identifiers_dir)
    all_identifiers = utils.read_lines_from_file(identifiers_dir)
    tf_idf_matrix = KnnSearcher.get_tf_idf_embeddings(text_data_dir)
    searcher = KnnSearcher(tf_idf_matrix, all_identifiers, 100)
    initial_result_list = searcher.perform_keyword_retrieval(test_identifiers)
    embedding_matrix = utils.load_embeddings(embeddings_dir)
    final_result_list = re_rank_with_embeddings(initial_result_list,
                                                embedding_matrix,
                                                all_identifiers)
    with open(output_dir, 'w+') as output_file:
        for figure_id in final_result_list:
            line = figure_id
            for other_figure in final_result_list[figure_id]:
                if other_figure[0] != figure_id:
                    line += ',' + other_figure[0]
            output_file.write(line + '\n')
def predict():
    source_data, target_data, test_data, word2id = utils.load_data()
    embeddings = utils.load_embeddings(word2id)
    print "Test set size: %d" % len(test_data)

    # HybridCNNSS: run the ten trained models and collect their predictions
    results = []
    for i in range(1, 11):
        g = Graph('HybridCNNSS', 'HybridCNNSS%d' % i, embeddings)
        results.append(g.run(test_data))

    # average the ten model outputs
    predicts = []
    for predict in np.stack(results, axis=1):
        predicts.append(1.0 * sum(predict) / len(predict))
    utils.generate_file(predicts)
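# ---------------------------------------------------------------------------
# The ensemble averaging at the end of predict above, in vectorized form:
# stacking the ten per-model prediction vectors and taking the row-wise mean
# produces the same numbers. The toy arrays stand in for model outputs.
import numpy as np

results = [np.random.rand(100) for _ in range(10)]
predicts = np.stack(results, axis=1).mean(axis=1)
assert predicts.shape == (100,)
# ---------------------------------------------------------------------------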
def recursion(root, lvl):
    q = Queue.Queue()
    q.put((root, -1, 1, '*'))
    dbi_scores = {}

    while not q.empty():
        (c_folder, c_id, level, c_name) = q.get()
        if level >= int(lvl):
            continue
        hier_f = '%s/hierarchy.txt' % c_folder
        clus_kws_f = '%s/cluster_keywords.txt' % c_folder
        emb_f = '%s/embeddings.txt' % c_folder
        if not exists(hier_f):
            continue

        hier_map = utils.load_hier_f(hier_f)
        clus_map = get_clus_keywords(clus_kws_f)
        embs = utils.load_embeddings(emb_f)

        for cluster in hier_map:
            cc_id = hier_map[cluster]
            cluster_folder = '%s/%s' % (c_folder, cluster)
            cluster_namespace = '%s/%s' % (c_name, cluster)
            q.put((cluster_folder, cc_id, level + 1, cluster_namespace))

        # handle current
        dbi = compute_dbi(embs, clus_map, hier_map)
        print 'Computing DBI for %s: %f' % (c_name, dbi)
        dbi_scores[c_name] = (dbi, level)

    output_dbi(dbi_scores)
def init():
    print 'Loading training samples..'
    training_samples = utils.load_samples('../data/askubuntu/train_random.txt')
    print len(training_samples)

    print 'Loading dev samples..'
    dev_samples = utils.load_samples('../data/askubuntu/dev.txt')
    print len(dev_samples)

    print 'Loading test samples..'
    test_samples = utils.load_samples('../data/askubuntu/test.txt')
    print len(test_samples)

    print 'Loading corpus..'
    question_map = utils.load_corpus('../data/askubuntu/text_tokenized.txt')
    print len(question_map)

    print 'Loading stop words..'
    stop_words = utils.load_stop_words('../data/english_stop_words.txt')
    print len(stop_words)

    corpus_texts = map(lambda (t, b): t + ' ' + b, question_map.values())

    print 'Loading embeddings..'
    embedding_map = utils.load_embeddings(
        '../data/pruned_askubuntu_android_vector.txt', corpus_texts, stop_words)
    print len(embedding_map)
    print

    utils.store_embedding_map(embedding_map)

    return (training_samples, dev_samples, test_samples, question_map,
            embedding_map)
def predict_image():
    """Gets an image file via POST request, feeds the image to the FaceNet
    model, the resulting embedding is then sent to be compared with the
    embeddings database. The image file is not stored.

    An html page is then rendered showing the prediction result.
    """
    if request.method == 'POST':
        if 'file' not in request.files:
            return "No file part"

        file = request.files['file']
        filename = file.filename

        if filename == "":
            return "No selected file"

        if file and allowed_file(filename=filename, allowed_set=allowed_set):
            # Read image file as numpy array of RGB dimension
            img = imread(name=file, mode='RGB')

            # Detect and crop a 160 x 160 image containing a human face in the image file
            img = get_face(img=img, pnet=pnet, rnet=rnet, onet=onet,
                           image_size=image_size)

            # If a human face is detected
            if img is not None:
                embedding = forward_pass(
                    img=img,
                    session=facenet_persistent_session,
                    images_placeholder=images_placeholder,
                    embeddings=embeddings,
                    phase_train_placeholder=phase_train_placeholder,
                    image_size=image_size)

                embedding_dict = load_embeddings()
                if embedding_dict:
                    # Compare euclidean distance between this embedding and the embeddings in 'embeddings/'
                    identity = identify_face(embedding=embedding,
                                             embedding_dict=embedding_dict)
                    return render_template('predict_result.html',
                                           identity=identity)
                else:
                    return render_template(
                        'predict_result.html',
                        identity="No embedding files detected! Please upload image files for embedding!")
            else:
                return render_template(
                    'predict_result.html',
                    identity="Operation was unsuccessful! No human face was detected.")
    else:
        return "POST HTTP method required!"
def __init__(self, emb_size, vocab_size=11004):
    super(Baseline_Embeddings, self).__init__()
    self.embedding_prem = nn.Embedding(vocab_size, emb_size)
    self.embedding_hypo = nn.Embedding(vocab_size, emb_size)
    self.linear = nn.Linear(emb_size * 2, 3)
    embeddings_mat = load_embeddings()
    self.embedding_prem.weight.data.copy_(embeddings_mat)
    self.embedding_hypo.weight.data.copy_(embeddings_mat)
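# ---------------------------------------------------------------------------
# Shape sketch (an assumption, not the original forward pass) of how a
# premise/hypothesis batch could flow through a model shaped like
# Baseline_Embeddings above: embed each side, pool, concatenate, and project
# to 3 classes (entailment / contradiction / neutral).
import torch
import torch.nn as nn

emb = nn.Embedding(100, 50)
linear = nn.Linear(50 * 2, 3)
prem = torch.randint(0, 100, (4, 7))   # batch of 4 premises, length 7
hypo = torch.randint(0, 100, (4, 9))   # batch of 4 hypotheses, length 9
logits = linear(torch.cat([emb(prem).mean(1), emb(hypo).mean(1)], dim=-1))
print(logits.shape)  # torch.Size([4, 3])
# ---------------------------------------------------------------------------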
def label_emb_centric(folder, c_id):
    print 'Start labeling for %s, %s ========================' % (folder, c_id)
    par_folder = dirname(folder)
    cur_label = basename(folder)
    p_case_f = '%s/caseolap.txt' % par_folder
    c_case_f = '%s/caseolap.txt' % folder
    emb_f = '%s/embeddings.txt' % par_folder

    # generate word2vec phrases
    embs = utils.load_embeddings(emb_f)
    if cur_label not in embs:
        print 'Error: %s not found in the embeddings' % cur_label
        exit(1)

    # keep the N phrases most similar to the current label, maintaining a
    # sorted list of the best scores seen so far
    N = 100
    worst = -100
    bestw = [-100] * (N + 1)
    bestp = [''] * (N + 1)
    for ph in embs:
        sim = utils.cossim(embs[cur_label], embs[ph])
        if sim > worst:
            for i in range(N):
                if sim >= bestw[i]:
                    for j in range(N - 1, i - 1, -1):
                        bestw[j + 1] = bestw[j]
                        bestp[j + 1] = bestp[j]
                    bestw[i] = sim
                    bestp[i] = ph
                    worst = bestw[N - 1]
                    break
    cands = [(ph, w) for ph, w in zip(bestp, bestw)]

    phrase_map_p, cell_map_p, tmp = read_caseolap_result(p_case_f)
    parent_dist_ranking = cell_map_p[c_id]
    parent_dist_map = {ph: float(dist) for (ph, dist) in parent_dist_ranking}
    child_kl_ranking = rank_phrase(c_case_f)
    child_kl_map = {ph: dist for (ph, dist) in child_kl_ranking}
    min_score = 0.12

    label_cands = {}
    for (ph, score) in cands:
        if ph not in parent_dist_map:
            continue
        if ph in child_kl_map:
            continue
        label_cands[ph] = score

    ranked_list = sorted(label_cands.items(), key=operator.itemgetter(1),
                         reverse=True)
    print ranked_list
    return ranked_list[0][0]
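# ---------------------------------------------------------------------------
# The manual top-N selection above can also be written with heapq.nlargest;
# a self-contained sketch with toy vectors and a local cosine similarity in
# place of utils.cossim.
import heapq
import numpy as np

def cossim(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

embs = {w: np.random.rand(8)
        for w in ['machine_learning', 'deep_learning', 'cooking']}
target = embs['machine_learning']
top = heapq.nlargest(2, embs, key=lambda ph: cossim(target, embs[ph]))
print(top)  # the two phrases most similar to the target, itself included
# ---------------------------------------------------------------------------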
def get_frame(self):
    success, frame_orig = self.video.read()

    # We are using Motion JPEG, but OpenCV defaults to capture raw images,
    # so we must encode it into JPEG in order to correctly display the
    # video stream.
    frame = cv2.resize(src=frame_orig, dsize=(0, 0), fx=0.5, fy=0.5)
    embedding_dict = load_embeddings()
    frame = frame[:, :, ::-1]

    if frame.size > 0:
        faces, rects = get_faces_live(img=frame, pnet=pnet, rnet=rnet,
                                      onet=onet, image_size=image_size)

        # If there are human faces detected
        if faces:
            for i in range(len(faces)):
                face_img = faces[i]
                rect = rects[i]

                # Scale coordinates of face locations by the resize ratio
                rect = [coordinate * 2 for coordinate in rect]

                face_embedding = forward_pass(
                    img=face_img,
                    session=facenet_persistent_session,
                    images_placeholder=images_placeholder,
                    embeddings=embeddings,
                    phase_train_placeholder=phase_train_placeholder,
                    image_size=image_size)

                # Compare euclidean distance between this embedding and the embeddings in 'embeddings/'
                identity = identify_face(embedding=face_embedding,
                                         embedding_dict=embedding_dict)

                cv2.rectangle(img=frame_orig,
                              pt1=(rect[0], rect[1]),
                              pt2=(rect[2], rect[3]),
                              color=(255, 215, 0),
                              thickness=2)

                W = int(rect[2] - rect[0]) // 2
                cv2.putText(img=frame_orig,
                            text=identity,
                            org=(rect[0] + W - (W // 2), rect[1] - 7),
                            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                            fontScale=0.5,
                            color=(255, 215, 0),
                            thickness=1,
                            lineType=cv2.LINE_AA)

    ret, jpeg = cv2.imencode('.jpg', frame_orig)
    return jpeg.tobytes()
def main(analysis=False):
    # Load mappings and embeddings for specified network(s)
    mappings = ut.load_mappings()
    embeddings = ut.load_embeddings()

    # Extract IIDs for specified seed ingredients
    ingredient_to_iid = {
        ingredient: iid
        for iid, ingredient in mappings['IID_to_Ingredient_Mapping'].iteritems()
    }
    if args.seed_ingredients is not None:
        args.seed_ingredients = [
            ingredient_to_iid[ingredient]
            for ingredient in args.seed_ingredients
        ]

    if args.accent > 0 and not args.network == 'ocn_fph':
        raise Exception(
            'You set accent > 0 but did not use network \'ocn_fph\'.')

    num_ingredients = np.random.randint(args.min, args.max + 1)
    if args.accent > args.min:
        raise Exception(
            'Number of accent ingredients cannot be greater than the minimum '
            'number of ingredients.')

    if args.cuisine == 'random':
        cuisine = np.random.choice(
            mappings['Cuisine_to_List_of_Ingredients_Mapping'].keys())
        print 'Randomly Chosen Cuisine: {}'.format(cuisine)
    else:
        cuisine = args.cuisine

    if args.network == 'ocn_fph':
        recipe = base_accent_generate(
            get_embeddings(embeddings, 'ocn', mappings, cuisine),
            get_embeddings(embeddings, 'fph', mappings, cuisine),
            args.seed_ingredients, num_ingredients, args.accent)
    elif args.network == 'ucn':
        recipe = generate(get_embeddings(embeddings, 'ucn', mappings, cuisine),
                          args.seed_ingredients, num_ingredients)
    else:
        raise NotImplementedError

    if args.avoids is not None:
        avoid_iids = [ingredient_to_iid[a] for a in args.avoids]
        SN, SW = ut.load_sn()
        recipe = substitute_avoids(
            SW, get_embeddings(embeddings, 'ocn', mappings, cuisine),
            avoid_iids, recipe)

    if analysis:
        return recipe

    base_ingredients = num_ingredients - args.accent
    for i, iid in enumerate(recipe):
        if args.network == 'ocn_fph' and i >= base_ingredients:
            print mappings['IID_to_Ingredient_Mapping'][iid], '(accent)'
        else:
            print mappings['IID_to_Ingredient_Mapping'][iid]
def __init__(self, pickle_path, eval_path, encoding="utf8"):
    self.eval_path = eval_path
    self.encoding = encoding
    self.embeddings, self.word2index = load_embeddings(pickle_path)
    self.top_results = min(self.embeddings.shape[0] - 2, 10)
def load_all_embeddings(self, word_index, num_words):
    # Word cover rate in the embedding is: 0.8724167059563099
    glove_embeddings = load_embeddings(GLOVE_PATH, word_index, num_words)
    # Word cover rate in the embedding is: 0.6717114568599717
    wiki_embeddings = load_embeddings(WIKI_PATH, word_index, num_words)
    # google_new_embeddings = load_embeddings(GOOGLE_NEWS_PATH, word_index, num_words)
    # paragram_embeddings = load_embeddings(PARAGRAM_PATH, word_index, num_words)
    embedding_matrix = np.concatenate(
        (
            glove_embeddings,
            wiki_embeddings,
            # google_new_embeddings,
            # paragram_embeddings,
        ),
        axis=1)
    # return the concatenated matrix, not just the GloVe block
    return torch.tensor(embedding_matrix, dtype=torch.float32)
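# ---------------------------------------------------------------------------
# Shape sketch for load_all_embeddings above: concatenating two 300-d
# matrices along axis=1 yields 600-d rows, which is what the fixed return
# statement now exposes. The zero/one matrices are illustrative.
import numpy as np

glove = np.zeros((5, 300))
wiki = np.ones((5, 300))
combined = np.concatenate((glove, wiki), axis=1)
print(combined.shape)  # (5, 600)
# ---------------------------------------------------------------------------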
def find_every_words_not_in_embeddings(embedding_path, vocab):
    oov = set()
    embeddings = load_embeddings(embedding_path)
    for token in vocab:
        if (token not in embeddings
                and token.capitalize() not in embeddings
                and token.upper() not in embeddings
                and token.lower() not in embeddings):
            oov.add(token)
    return oov
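# ---------------------------------------------------------------------------
# Toy run of the casing fallback in find_every_words_not_in_embeddings above,
# with an in-memory dict standing in for load_embeddings(embedding_path).
embeddings = {'Paris': [0.1], 'USA': [0.2], 'the': [0.3]}
vocab = ['paris', 'usa', 'qwertyuiop']
oov = {t for t in vocab
       if t not in embeddings and t.capitalize() not in embeddings
       and t.upper() not in embeddings and t.lower() not in embeddings}
print(oov)  # {'qwertyuiop'}
# ---------------------------------------------------------------------------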
def init():
    print 'Loading askubuntu training samples..'
    askubuntu_training_samples = utils.load_samples(
        '../data/askubuntu/train_random.txt')
    print len(askubuntu_training_samples)

    print 'Loading askubuntu dev samples..'
    askubuntu_dev_samples = utils.load_samples('../data/askubuntu/dev.txt')
    print len(askubuntu_dev_samples)

    print 'Loading askubuntu test samples..'
    askubuntu_test_samples = utils.load_samples('../data/askubuntu/test.txt')
    print len(askubuntu_test_samples)

    print 'Loading askubuntu corpus..'
    askubuntu_question_map = utils.load_corpus(
        '../data/askubuntu/text_tokenized.txt')
    print len(askubuntu_question_map)

    print 'Loading android dev samples..'
    android_dev_samples = utils.load_samples_stupid_format(
        '../data/android/dev.pos.txt', '../data/android/dev.neg.txt')
    print len(android_dev_samples)

    print 'Loading android test samples..'
    android_test_samples = utils.load_samples_stupid_format(
        '../data/android/test.pos.txt', '../data/android/test.neg.txt')
    print len(android_test_samples)

    print 'Loading android corpus..'
    android_question_map = utils.load_corpus('../data/android/corpus.tsv')
    print len(android_question_map)

    print 'Loading stop words..'
    stop_words = utils.load_stop_words('../data/english_stop_words.txt')
    print len(stop_words)

    corpus_texts = map(lambda (t, b): t + ' ' + b,
                       askubuntu_question_map.values() +
                       android_question_map.values())

    print 'Loading embeddings..'
    embedding_map = utils.load_embeddings(
        '../data/pruned_android_vector.txt', corpus_texts,
        stop_words)  # pruned_askubuntu_android_vector.txt
    print len(embedding_map)
    print

    utils.store_embedding_map(embedding_map)

    return (askubuntu_training_samples, askubuntu_dev_samples,
            askubuntu_test_samples, askubuntu_question_map,
            android_dev_samples, android_test_samples, android_question_map,
            embedding_map)
def train_network(vectorizer, network_type, task_type, train_table, setting_name):
    """
    Main function of vectorization for neural network

    network_type : str
        type of the network, which should be presented in NETWORKS dictionary.
    task_type : str
        TTK_TASK or BANK_TASK
    train_table : str
        Train table filepath

    returns : None
    """
    message_settings, features_settings = utils.load_embeddings()

    features = Features(
        TwitterMessageParser(message_settings, task_type),
        features_settings)
    term_vocabulary = TermVocabulary()
    doc_vocabulary = DocVocabulary()

    problem = utils.create_problem(task_type, 'train', train_table, vectorizer,
                                   features, term_vocabulary, doc_vocabulary,
                                   message_settings)
    assert len(problem) > 0

    X, y = get_problem(problem, get_results=True)
    embedding_size = X.shape[1]
    logging.info("embedding_size: {}".format(embedding_size))
    logging.info("Create RNN network model ...")

    # TODO: the network settings should be presented in a json configuration
    # (apparently rnn.conf)
    hidden_size = 400
    model = get_network(network_type, embedding_size, hidden_size)
    paths = get_model_paths(task_type, network_type, setting_name)

    logging.info("Pack embedding settings: {} ...".format(
        paths['embedding_output']))
    save_embeddings(paths['embedding_output'])

    logging.info("Save term vocabulary: {} ...".format(
        paths['term_vocabulary']))
    term_vocabulary.save(paths['term_vocabulary'])

    optimizer.train_network(model, X, y, paths['model_output'])
def __init__(self, csvpath, mode='train'):
    self.mode = mode
    df = pd.read_csv(csvpath)
    le = LabelEncoder()

    # Load the data
    X = list(df['text'])

    # Tokenize the input data
    tok = keras.preprocessing.text.Tokenizer(num_words=1000)
    tok.fit_on_texts(X)

    if self.mode == 'train':
        self.tok = tok
        # integer encode documents
        X_train = tok.texts_to_sequences(X)
        # pad so all sequences have the same length
        X_train = keras.preprocessing.sequence.pad_sequences(
            X_train, padding='post')
        self.maxpad = X_train.shape[1]
        self.inp = X_train
        self.oup = list(df['target'])

        # Load word embeddings
        word_counts = pd.DataFrame(
            dict(tok.word_counts),
            index=['count']).transpose().sort_values(by='count',
                                                     ascending=False)
        num_words = len(word_counts)
        tok_dict = dict(tok.index_word)
        word_embeddings_dict = utils.load_embeddings(
            './data/non_tracked/glove.6B.100d.txt')

        # Create the embedding_matrix for the words in our vocabulary;
        # row 0 is reserved because Keras word indices start at 1
        embeddings_words = list(word_embeddings_dict.keys())
        wordvec_dim = word_embeddings_dict[embeddings_words[0]].shape[0]
        embedding_matrix = np.zeros((num_words + 1, wordvec_dim))
        for i, word in tok_dict.items():
            # Look up the word embedding
            vector = word_embeddings_dict.get(word, None)
            # Record in matrix
            if vector is not None:
                embedding_matrix[i, :] = vector
        self.embedding_matrix = embedding_matrix
    else:
        # transform test data
        X_test = self.tok.texts_to_sequences(X)
        X_test = keras.preprocessing.sequence.pad_sequences(
            X_test, padding='post', maxlen=self.maxpad)
        self.inp = X_test
def create_oov(dataset, embeddings_path):
    sentences = (dataset.get_train_sentences + dataset.get_valid_sentences +
                 dataset.get_test_sentences)
    embeddings = load_embeddings(embeddings_path)
    oov = sorted(set(word for sentence in sentences for word in sentence
                     if word not in embeddings))
    filepath = './' + dataset.dataset_name + '/oov.txt'
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write('\n'.join(oov) + '\n')
def main():
    print("Loading embeddings")
    # load embeddings
    if os.name == "nt":
        embd_dict = utils.load_embeddings(
            "C:/Users/anlausch/workspace/embedding_files/glove.6B/glove.6B.50d.txt",
            word2vec=False)
    else:
        embd_dict = utils.load_embeddings("./glove.6B.300d.txt", word2vec=False)

    print("Grid Search with SVM for TFIDF")
    print("===========================================")
    for task in ["discourse", "aspect", "summary"]:
        grid_search_rbf_svm_tfidf(task=task)

    print("Grid Search with SVM linear embeddings")
    print("===========================================")
    for task in ["discourse", "aspect", "summary"]:
        grid_search_linear_svm_embeddings(embd_dict=embd_dict, task=task)

    print("Grid Search with SVM linear for TFIDF")
    print("===========================================")
    for task in ["discourse", "aspect", "summary"]:
        grid_search_linear_svm_tfidf(task=task)
def load_embedding_dict(vocab_path="", vector_path="", embeddings_path="",
                        glove=False, postspec=False):
    """
    >>> load_embedding_dict()
    :param vocab_path:
    :param vector_path:
    :return: embd_dict
    """
    if glove and postspec:
        raise ValueError("Glove and postspec cannot both be true")
    elif glove:
        if os.name == "nt":
            embd_dict = utils.load_embeddings(
                "C:/Users/anlausch/workspace/embedding_files/glove.6B/glove.6B.300d.txt",
                word2vec=False)
        else:
            embd_dict = utils.load_embeddings(
                "/work/anlausch/glove.6B.300d.txt", word2vec=False)
        return embd_dict
    elif postspec:
        embd_dict_temp = utils.load_embeddings(
            "/work/anlausch/ft_postspec.txt", word2vec=False)
        embd_dict = {}
        for key, value in embd_dict_temp.items():
            embd_dict[key.split("en_")[1]] = value
        assert "test" in embd_dict
        assert "house" in embd_dict
        return embd_dict
    elif embeddings_path != "":
        embd_dict = utils.load_embeddings(embeddings_path, word2vec=True)
        return embd_dict
    else:
        embd_dict = {}
        vocab = load_vocab_goran(vocab_path)
        vectors = load_vectors_goran(vector_path)
        for term, index in vocab.items():
            embd_dict[term] = vectors[index]
        assert len(embd_dict) == len(vocab)
        return embd_dict
def __init__(self, task):
    self.ckpt_path = './ckpt/{}/'.format(task)
    if not os.path.exists(self.ckpt_path):
        os.makedirs(self.ckpt_path)
    source_dir = os.path.join('.', 'dataset', 'data', task)
    self.word_vocab, _ = load_vocab(os.path.join(source_dir, 'words.vocab'))
    self.char_vocab, _ = load_vocab(os.path.join(source_dir, 'chars.vocab'))
    self.vocab_size = len(self.word_vocab)
    self.char_vocab_size = len(self.char_vocab)
    self.label_size = load_json(os.path.join(source_dir, 'label.json'))["label_size"]
    self.word_emb = load_embeddings(os.path.join(source_dir, 'glove.filtered.npz'))
def predict_image(file):
    # Read image file as numpy array of RGB dimension
    img = imread(name=file, mode='RGB')

    # Detect and crop 160 x 160 images containing the human faces in the image file
    faces, rects = get_faces_live(img=img, pnet=pnet, rnet=rnet, onet=onet,
                                  image_size=image_size)

    # If there are human faces detected
    if faces:
        embedding_dict = load_embeddings()
        if embedding_dict:
            people_found = []
            for i in range(len(faces)):
                face_img = faces[i]
                rect = rects[i]

                face_embedding = forward_pass(
                    img=face_img,
                    session=facenet_persistent_session,
                    images_placeholder=images_placeholder,
                    embeddings=embeddings,
                    phase_train_placeholder=phase_train_placeholder,
                    image_size=image_size)

                # Compare euclidean distance between this embedding and the embeddings in 'embeddings/'
                identity = identify_face(embedding=face_embedding,
                                         embedding_dict=embedding_dict)
                people_found.append(identity)

                cv2.rectangle(img, (rect[0], rect[1]), (rect[2], rect[3]),
                              (0, 255, 0), 3)
                W = int(rect[2] - rect[0]) // 2
                H = int(rect[3] - rect[1]) // 2
                cv2.putText(img, identity, (rect[0] + W - (W // 2), rect[1] - 7),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 255), 1,
                            cv2.LINE_AA)
            return people_found
        else:
            # no embedding files detected
            return None
    else:
        # no human face detected
        return None
def __init__(self):
    # Logger
    self.logger = logger_init()

    # Use Cuda
    Config.cuda = True
    self.device = None
    if Config.cuda and torch.cuda.is_available():
        self.device = torch.device('cuda')
    else:
        self.device = torch.device('cpu')

    ################## Data ###################
    # Load Sparse Adjacency Matrix
    file_name = 'adj_input.pkl'
    (data, rows, columns, vocab_dict) = pd.read_pickle(file_name)
    id_word_map = {v: k for k, v in vocab_dict.items()}

    rel_list = ['ISA']
    num_entities = len(vocab_dict)
    num_relations = len(rel_list)

    # Build the adjacency matrix and drop the edges with frequency < 10:
    # only hyponym-hypernym candidate pairs observed at least 10 times are
    # used to create the noisy graph.
    rows = rows + [i for i in range(num_entities)]
    columns = columns + [i for i in range(num_entities)]
    data = data + [1 for i in range(num_entities)]
    adjs = coo_matrix((data, (rows, columns)),
                      shape=(num_entities, num_entities)).toarray()
    adjs = np.where(adjs >= 10, 1, 0)
    self.adjs = torch.FloatTensor(adjs).to(device=self.device)
    del rows
    del columns
    del data

    # Use X as index for the randomly initialized embeddings
    self.X = torch.LongTensor([i for i in range(num_entities)]).to(device=self.device)

    # Load the word embeddings if we use them.
    self.word_embs = load_embeddings(vocab_dict).to(device=self.device)
    logging.info('Finished the preprocessing')

    ################## Model, Optimizer, LossFunction ###################
    self.model = GRAPH2TAXO(num_entities, num_relations).to(device=self.device)
    self.opt = torch.optim.Adam(self.model.parameters(),
                                lr=Config.learning_rate,
                                weight_decay=Config.L2)
    self.f1_loss = F1_Loss().to(device=self.device)

    ################## Part of Hyperparameters ###################
    # Hyperparameters for the constraints
    self.lambda_A = 1.0
    self.c_A = 0.5
    self.tau_A = 1.0
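# ---------------------------------------------------------------------------
# Sketch of the frequency-thresholding step above on toy counts: the sparse
# co-occurrence counts are densified, and only pairs seen at least 10 times
# survive as unweighted edges.
import numpy as np
from scipy.sparse import coo_matrix

data, rows, cols = [12, 3, 25], [0, 1, 2], [1, 2, 0]
adjs = coo_matrix((data, (rows, cols)), shape=(3, 3)).toarray()
adjs = np.where(adjs >= 10, 1, 0)
print(adjs)  # edges kept: (0, 1) and (2, 0); the count-3 pair is dropped
# ---------------------------------------------------------------------------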
def train(preproc_dir, n_classes, max_length, hidden_units, dropout,
          batch_size, epochs, output_dir):
    """
    Train the ESIM model on some dataset and save the learned weights.

    Args:
        preproc_dir: The directory where the preprocessed data is saved.
        n_classes: The number of classes in the problem.
        max_length: The maximum length of the sentences in the premises and
            hypotheses of the dataset.
        hidden_units: The number of hidden units to use in the various layers
            of the model.
        dropout: The dropout rate to use in the model.
        batch_size: The size of the batches to use for training.
        epochs: The number of epochs to apply during training.
        output_dir: The path to the directory where the weights learned during
            training must be saved.
    """
    print("Loading training and validation data...")
    train_premises, train_hyps, train_labels = prepare_data(
        preproc_dir, 'train', n_classes, max_length)
    valid_premises, valid_hyps, valid_labels = prepare_data(
        preproc_dir, 'dev', n_classes, max_length)
    # train_premises is a matrix of zero-padded word-id sequences, e.g.:
    # [[5, 6, 7, 8, 9, 3, 10, 11, 12, 13, 14, 2, 15, 16, 3, 0, 0, 0, 0],
    #  [17, 18, 19, 20, 21, 22, 4, 23, 2, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    #  [25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

    print("Loading embedding weights...")
    embedding_weights = load_embeddings(
        os.path.join(preproc_dir, "embedding_weights.pkl"))

    # Build the model.
    esim = ESIM(n_classes, embedding_weights, max_length, hidden_units, dropout)
    model = esim.build_model()

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    filepath = os.path.join(output_dir, "weights-{epoch:02d}-{val_acc:.2f}.hdf5")
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                 save_best_only=True, mode='max')

    model.fit(x=[train_premises, train_hyps],
              y=train_labels,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=([valid_premises, valid_hyps], valid_labels),
              callbacks=[checkpoint],
              shuffle=True)
def prepare_problem(vectorizer, task_type, train_table, test_table, etalon_table):
    """
    Main function of vectorization for neural network
    """
    message_settings, features_settings = utils.load_embeddings()

    features = Features(
        TwitterMessageParser(message_settings, task_type),
        features_settings)
    term_vocabulary = TermVocabulary()
    doc_vocabulary = DocVocabulary()

    train_problem = utils.create_problem(task_type, 'train', train_table,
                                         vectorizer, features, term_vocabulary,
                                         doc_vocabulary, message_settings)
    test_problem = utils.create_problem(task_type, 'test', test_table,
                                        vectorizer, features, term_vocabulary,
                                        doc_vocabulary, message_settings)

    return (train_problem, test_problem)
cwd = os.getcwd()
vectorizer = Vectorizer(min_frequency=config.min_freq)
validation_data_path = cwd + config.relative_dev_path
validation_abstracts = headline2abstractdataset(validation_data_path,
                                                vectorizer, args.cuda,
                                                max_len=1000)
data_path = cwd + config.relative_data_path
abstracts = headline2abstractdataset(data_path, vectorizer, args.cuda,
                                     max_len=1000)
print("number of training examples: %d" % len(abstracts))

vocab_size = abstracts.vectorizer.vocabulary_size
embedding = nn.Embedding(vocab_size, config.emsize, padding_idx=0)
if config.pretrained:
    embedding = load_embeddings(embedding, abstracts.vectorizer.word2idx,
                                config.pretrained, config.emsize)

context_encoder = ContextEncoder(config.context_dim,
                                 len(abstracts.context_vectorizer),
                                 config.emsize)
encoder_title = EncoderRNN(vocab_size, embedding, abstracts.head_len,
                           config.emsize, input_dropout_p=config.dropout,
                           n_layers=config.nlayers,
                           bidirectional=config.bidirectional,
                           rnn_cell=config.cell)
encoder = EncoderRNN(vocab_size, embedding, abstracts.abs_len, config.emsize,
                     input_dropout_p=config.dropout, variable_lengths=False,
                     n_layers=config.nlayers,
                     bidirectional=config.bidirectional, rnn_cell=config.cell)
decoder = DecoderRNNFB(vocab_size, embedding, abstracts.abs_len, config.emsize,
                       sos_id=2, eos_id=1, n_layers=config.nlayers,
                       rnn_cell=config.cell,
                       bidirectional=config.bidirectional,
                       input_dropout_p=config.dropout,
                       dropout_p=config.dropout)
model = FbSeq2seq(encoder_title, encoder, context_encoder, decoder)

total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0]
                   for x in model.parameters())
print('Model total parameters:', total_params, flush=True)

if config.dataparallel and torch.cuda.device_count() > 1:
def train_network(vectorizer, network_type, task_type, train_table, test_table,
                  etalon_table, setting_name):
    """
    Main function of vectorization for neural network
    """
    message_settings, features_settings = utils.load_embeddings()

    features = Features(
        TwitterMessageParser(message_settings, task_type),
        features_settings)
    term_vocabulary = TermVocabulary()
    doc_vocabulary = DocVocabulary()

    train_problem = utils.create_problem(task_type, 'train', train_table,
                                         vectorizer, features, term_vocabulary,
                                         doc_vocabulary, message_settings)
    test_problem = utils.create_problem(task_type, 'test', test_table,
                                        vectorizer, features, term_vocabulary,
                                        doc_vocabulary, message_settings)
    assert len(train_problem) > 0 and len(test_problem) > 0

    # Transform into collections applicable to a neural network
    X_test = get_problem(test_problem, get_results=False)
    X_train, Y = get_problem(train_problem, get_results=True)
    assert X_test.shape[1] == X_train.shape[1]

    embedding_size = X_test.shape[1]
    logging.info("embedding_size: {}".format(embedding_size))
    logging.info("Create {} network model ...".format(network_type))

    # TODO: the network settings should be presented in a json configuration
    # (apparently rnn.conf)
    hidden_layer_size = 400
    model = get_network(network_type, embedding_size, hidden_layer_size)
    paths = get_model_paths(task_type, network_type, setting_name)
    diagnostic_output = join(configs.NETWORK_MODELS_ROOT,
                             "{}.diag".format(setting_name))

    logging.info("Pack embedding settings: {} ...".format(
        paths['embedding_output']))
    save_embeddings(paths['embedding_output'])

    def callback(model, X_test, X_train, Y, task_type, result_table,
                 etalon_table, diagnostic_output):
        """ Test the model and append the diagnostics """
        logging.info("Testing model ...")
        loss = model.calculate_loss(X_train, Y)
        predict(model, X_test, task_type, result_table)
        result = check(task_type, result_table, etalon_table)
        logging.info("Appending results: {} ...".format(diagnostic_output))
        with open(diagnostic_output, 'a') as output:
            output.writelines("{} {} {}\n".format(
                loss, result["F_macro"], result["F_micro"]))

    model_output = paths['model_output']
    if exists(model_output):
        logging.info("Loading existing model: {} ...".format(model_output))
        model.load(model_output)

    output_table = test_table + '.result.csv'
    prepare_result_table(test_table, output_table)

    test = lambda: callback(model, X_test, X_train, Y, task_type, output_table,
                            etalon_table, diagnostic_output)
    optimizer.train_network(model, X_train, Y, model_output, callback=test)

    with open(diagnostic_output, 'a') as output:
        output.writelines("-----")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
vocab, embeddings = utils.load_embeddings()
train_data = utils.load_train_data(vocab, FLAGS.sequence_length)
test_data = utils.load_test_data(vocab, FLAGS.sequence_length)
print("Load done...")

# Training
# ==================================================
prev_auc = 0
with tf.Graph().as_default():
    with tf.device("/gpu:1"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():