def load_embeddings(options):
    """Load pre-trained embeddings, reusing a cached pickle when one exists."""
    if options.external_embedding is None:
        raise Exception("external_embedding option is None")

    saved_path = os.path.join(options.saved_parameters_dir, options.saved_prevectors)
    if os.path.isfile(saved_path):
        # A cached copy of the vectors was saved on a previous run; load it directly.
        ext_embeddings, _ = IOUtils.load_embeddings_file(
            saved_path,
            "pickle",
            options.lower,
        )
        return ext_embeddings
    else:
        # No cache yet: read the external embedding file, then cache it as a pickle.
        ext_embeddings, _ = IOUtils.load_embeddings_file(
            options.external_embedding,
            options.external_embedding_type,
            options.lower,
        )
        IOUtils.save_embeddings(saved_path, ext_embeddings)
        return ext_embeddings
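def _load_embeddings_usage_example():
    """Minimal usage sketch for load_embeddings, assuming an argparse-style
    options object. The paths and the embedding type below are illustrative
    assumptions, not the project's real configuration."""
    from argparse import Namespace

    options = Namespace(
        external_embedding="embeddings/glove.6B.100d.txt",  # hypothetical path
        external_embedding_type="raw_text",                 # assumed format tag accepted by IOUtils
        saved_parameters_dir="saved_parameters",
        saved_prevectors="prevectors.pickle",
        lower=True,
    )
    # The first call reads the external file and caches it under saved_parameters_dir;
    # later calls load the cached pickle directly.
    return load_embeddings(options)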
def download_file(self, remote_path, local_path):
    """Download remote_path from the server's file endpoint and write it to local_path."""
    url = f"{self.conn.get('protocol')}://{self.conn.get('ip')}:{self.conn.get('port')}/file"
    headers = {
        "File-Path": remote_path,
        "Content-Type": "application/octet-stream"
    }
    response = requests.get(url,
                            headers=headers,
                            stream=True,
                            verify=self.conn.get('cert'),
                            auth=HTTPBasicAuth(self.conn.get('username'),
                                               self.conn.get('password')))
    # let urllib3 decompress the raw stream before it is written to disk
    response.raw.decode_content = True

    # error: the server sent a non-200 response code
    if response.status_code != 200:
        raise Exception("Error: Http code: {}.".format(response.status_code))

    IOUtils.write_to_file_binary(local_path, raw_response=response.raw)
    return f"Saved at location {local_path}"
def __load_external_info(self):
    ext_embeddings, ext_emb_dim = IOUtils.load_embeddings_file(
        self.options.external_info, "raw_text", lower=True)
    # +3 rows because three documents are missing in AC-Net (363, 874, 881)
    self.doclookup = self.model.add_lookup_parameters(
        (len(ext_embeddings) + 3, ext_emb_dim))
    for doc_id in ext_embeddings.keys():
        self.doclookup.init_row(int(doc_id), ext_embeddings[doc_id])
def __load_external_embeddings(self, embedding_file, embedding_file_type):
    ext_embeddings, ext_emb_dim = IOUtils.load_embeddings_file(
        embedding_file, embedding_file_type, lower=True)
    assert ext_emb_dim == self.wdims

    self.ext_embeddings = {}
    print("Initializing word embeddings by pre-trained vectors")
    count = 0
    for word in self.w2i:
        if word in ext_embeddings:
            count += 1
            self.ext_embeddings[word] = ext_embeddings[word]
            self.wlookup.init_row(self.w2i[word], ext_embeddings[word])
    print("Vocab size: %d; #words having pretrained vectors: %d" %
          (len(self.w2i), count))
def upload_file(self, remote_path, local_path):
    """Upload local_path to remote_path through the server's file endpoint."""
    url = f"{self.conn.get('protocol')}://{self.conn.get('ip')}:{self.conn.get('port')}/file"
    headers = {
        "File-Path": remote_path,
        "Content-Type": "application/octet-stream"
    }
    response = requests.post(url,
                             headers=headers,
                             data=IOUtils.read_file(local_path),
                             verify=self.conn.get('cert'),
                             auth=HTTPBasicAuth(self.conn.get('username'),
                                                self.conn.get('password')))

    # error: the server sent a non-200 response code
    if response.status_code != 200:
        raise Exception("Error: Http code: {}. Http body: {}".format(
            response.status_code, response.text))

    body = response.json()
    return body.get('description')
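def _file_client_usage_example():
    """Minimal usage sketch for upload_file/download_file, assuming they are
    methods on a client whose `conn` dict holds protocol, ip, port, cert,
    username and password. The class name and every value below are
    illustrative assumptions, not taken from the original code."""
    conn = {
        "protocol": "https",
        "ip": "192.168.1.10",
        "port": "8443",
        "cert": "certs/ca.pem",        # path handed to requests' verify=
        "username": "admin",
        "password": "changeit",
    }
    client = FileTransferClient(conn)  # hypothetical wrapper exposing self.conn
    client.upload_file(remote_path="/tmp/config.yml", local_path="config.yml")
    client.download_file(remote_path="/tmp/config.yml", local_path="config_copy.yml")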
def run(args):
    print('Extracting training vocabulary')
    train_w2i, _ = IOUtils.load_vocab(args.train, args.train_file_type,
                                      args.saved_parameters_dir,
                                      args.saved_vocab_train,
                                      args.external_embedding,
                                      args.external_embedding_type,
                                      args.stemmer, True)
    """
    print('Extracting test vocabulary')
    test_w2i, _ = IOUtils.load_vocab(args.test, args.test_file_type,
                                     args.saved_parameters_dir,
                                     args.saved_vocab_test,
                                     args.external_embedding,
                                     args.external_embedding_type,
                                     args.stemmer, True)
    """
    # combine test & train vocabulary
    w2i = train_w2i
    """
    for token in test_w2i:
        if token not in w2i:
            w2i[token] = len(w2i)
    """

    from sklearn.metrics import roc_auc_score, average_precision_score
    from sklearn.model_selection import KFold

    roc_scores = []
    pr_scores = []

    whole_sentences = __load_sentences(args, args.permission_type, "ACNET")
    whole_sentences = np.array(whole_sentences)
    random.shuffle(whole_sentences)

    all_predictions = []
    kfold = KFold(n_splits=10, shuffle=True, random_state=1)
    for train, test in kfold.split(whole_sentences):
        print('Similarity Experiment')
        model = SimilarityExperiment(w2i, args)

        test_sentences = whole_sentences[test]
        train_sentences = whole_sentences[train]

        __train(model, train_sentences)
        __predict(model, test_sentences)

        predictions = [r.prediction_result for r in test_sentences]
        gold = []
        for r in test_sentences:
            if r.mark:
                gold.append(1)
            else:
                gold.append(0)

        y_true = np.array(gold)
        y_scores = np.array(predictions)
        roc_auc = roc_auc_score(y_true, y_scores)
        pr_auc = average_precision_score(y_true, y_scores)
        roc_scores.append(roc_auc)
        pr_scores.append(pr_auc)

        for r in test_sentences:
            mark = 1 if r.mark else 0
            all_predictions.append([
                r.sentence, r.preprocessed_sentence, mark, r.prediction_result
            ])

    roc_pr_out_dir = os.path.join(model.options.outdir, "roc_auc.txt")
    with open(roc_pr_out_dir, "w") as target:
        target.write("ROC-AUC {}\n".format(sum(roc_scores) / len(roc_scores)))
        target.write("PR-AUC {}\n".format(sum(pr_scores) / len(pr_scores)))

    predictions_dir = os.path.join(model.options.outdir, "predicted_file.txt")
    with open(predictions_dir, "w") as target:
        for p in all_predictions:
            # mark and prediction_result are numeric, so convert every field to str before joining
            target.write("{}\n".format(",".join(str(x) for x in p)))
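def _auc_averaging_example():
    """Minimal sketch of the fold-level scoring used in run(): roc_auc_score and
    average_precision_score are computed per fold from binary gold labels and
    real-valued prediction scores, then averaged across folds. The toy arrays
    below are illustrative only."""
    import numpy as np
    from sklearn.metrics import roc_auc_score, average_precision_score

    fold_gold = [np.array([1, 0, 1, 0]), np.array([0, 1, 1, 0])]
    fold_scores = [np.array([0.9, 0.2, 0.7, 0.4]), np.array([0.1, 0.8, 0.6, 0.3])]

    roc_scores = [roc_auc_score(y, s) for y, s in zip(fold_gold, fold_scores)]
    pr_scores = [average_precision_score(y, s) for y, s in zip(fold_gold, fold_scores)]

    print("ROC-AUC {}".format(sum(roc_scores) / len(roc_scores)))
    print("PR-AUC {}".format(sum(pr_scores) / len(pr_scores)))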
def __save_model(self):
    IOUtils.save_embeddings(
        os.path.join(self.options.saved_parameters_dir,
                     self.options.saved_prevectors),
        self.ext_embeddings)