def load_embeddings(options):
    if options.external_embedding is None:
        raise Exception("external_embedding option is None")

    prevectors_path = os.path.join(options.saved_parameters_dir,
                                   options.saved_prevectors)
    if os.path.isfile(prevectors_path):
        # Reuse the pickled vectors cached by a previous run.
        ext_embeddings, _ = IOUtils.load_embeddings_file(
            prevectors_path,
            "pickle",
            options.lower,
        )
        return ext_embeddings

    # First run: read the raw external embeddings, then cache them as a pickle.
    ext_embeddings, _ = IOUtils.load_embeddings_file(
        options.external_embedding,
        options.external_embedding_type,
        options.lower,
    )
    IOUtils.save_embeddings(prevectors_path, ext_embeddings)
    return ext_embeddings
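A minimal sketch of how load_embeddings might be invoked. The attribute names mirror exactly what the function reads; the SimpleNamespace wrapper, the paths, and the type label are illustrative assumptions, not taken from the project.

from types import SimpleNamespace

# Hypothetical options object; the field names follow the attributes
# load_embeddings accesses, while every value below is made up.
options = SimpleNamespace(
    external_embedding="embeddings/wiki.en.vec",   # assumed source file
    external_embedding_type="raw_text",            # assumed type label (see Example #3)
    saved_parameters_dir="saved_parameters",       # assumed cache directory
    saved_prevectors="prevectors.pickle",          # assumed cache file name
    lower=True,
)

vectors = load_embeddings(options)  # presumably a dict mapping token -> vector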
Example #2
    def download_file(self, remote_path, local_path):
        url = f"{self.conn.get('protocol')}://{self.conn.get('ip')}:{self.conn.get('port')}/file"
        headers = {
            "File-Path": remote_path,
            "Content-Type": "application/octet-stream"
        }
        # Stream the response so large files are not buffered in memory.
        response = requests.get(url, headers=headers, stream=True,
                                verify=self.conn.get('cert'),
                                auth=HTTPBasicAuth(self.conn.get('username'), self.conn.get('password')))
        response.raw.decode_content = True

        # Server returned a non-200 response code: surface it as an error.
        if response.status_code != 200:
            raise Exception("Error: Http code: {}.".format(response.status_code))
        IOUtils.write_to_file_binary(local_path, raw_response=response.raw)

        return f"Saved at location {local_path}"
Example #3
    def __load_external_info(self):
        ext_embeddings, ext_emb_dim = IOUtils.load_embeddings_file(
            self.options.external_info, "raw_text", lower=True)
        # +3 extra rows because three documents are missing from AC-Net (ids 363, 874, 881).
        self.doclookup = self.model.add_lookup_parameters(
            (len(ext_embeddings) + 3, ext_emb_dim))
        for doc_id in ext_embeddings.keys():
            self.doclookup.init_row(int(doc_id), ext_embeddings[doc_id])
Example #4
    def __load_external_embeddings(self, embedding_file, embedding_file_type):
        ext_embeddings, ext_emb_dim = IOUtils.load_embeddings_file(
            embedding_file, embedding_file_type, lower=True)
        assert ext_emb_dim == self.wdims
        self.ext_embeddings = {}
        print("Initializing word embeddings by pre-trained vectors")
        count = 0
        # Copy the pre-trained vector into the lookup table for every
        # vocabulary word that has one.
        for word in self.w2i:
            if word in ext_embeddings:
                count += 1
                self.ext_embeddings[word] = ext_embeddings[word]
                self.wlookup.init_row(self.w2i[word], ext_embeddings[word])
        print("Vocab size: %d; #words having pretrained vectors: %d" %
              (len(self.w2i), count))
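The same initialization pattern, sketched with a plain NumPy matrix instead of the model's lookup parameters; the toy vocabulary and vectors below are made up for illustration.

import numpy as np

# Hypothetical vocabulary and pre-trained vectors; sizes are illustrative.
w2i = {"permission": 0, "camera": 1, "contacts": 2}
ext_embeddings = {"camera": np.array([0.1, 0.2, 0.3]),
                  "contacts": np.array([0.4, 0.5, 0.6])}
wdims = 3

# Start from small random vectors, then overwrite rows that have a
# pre-trained counterpart -- the same idea as wlookup.init_row above.
wlookup = np.random.uniform(-0.1, 0.1, (len(w2i), wdims))
count = 0
for word, idx in w2i.items():
    if word in ext_embeddings:
        wlookup[idx] = ext_embeddings[word]
        count += 1
print("Vocab size: %d; #words having pretrained vectors: %d" % (len(w2i), count))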
Example #5
    def upload_file(self, remote_path, local_path):
        url = f"{self.conn.get('protocol')}://{self.conn.get('ip')}:{self.conn.get('port')}/file"
        headers = {
            "File-Path": remote_path,
            "Content-Type": "application/octet-stream"
        }
        response = requests.post(url, headers=headers, data=IOUtils.read_file(local_path),
                                 verify=self.conn.get('cert'),
                                 auth=HTTPBasicAuth(self.conn.get('username'), self.conn.get('password')))

        # Server returned a non-200 response code: surface it as an error.
        if response.status_code != 200:
            raise Exception("Error: Http code: {}. Http body: {}".format(response.status_code, response.text))

        body = response.json()

        return body.get('description')
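Both transfer methods read their connection settings from a self.conn mapping. Below is a minimal sketch of that mapping and of a hypothetical enclosing class; the class name and all values are assumptions, only the keys are taken from the methods above.

class FileClient:
    """Hypothetical wrapper; download_file/upload_file above would live here."""

    def __init__(self, conn):
        # Keys mirror what the methods read via self.conn.get(...).
        self.conn = conn


conn = {
    "protocol": "https",                 # illustrative values throughout
    "ip": "192.168.1.10",
    "port": "8443",
    "cert": "/path/to/ca-bundle.pem",    # passed to requests' verify=; False disables TLS checks
    "username": "admin",
    "password": "secret",
}
client = FileClient(conn)
# client.upload_file("/remote/data.bin", "local/data.bin")
# client.download_file("/remote/data.bin", "local/copy.bin")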
Example #6
def run(args):
    print('Extracting training vocabulary')
    train_w2i, _ = IOUtils.load_vocab(args.train, args.train_file_type,
                                      args.saved_parameters_dir,
                                      args.saved_vocab_train,
                                      args.external_embedding,
                                      args.external_embedding_type,
                                      args.stemmer, True)
    """
    print('Extracting test vocabulary')
    test_w2i, _ = IOUtils.load_vocab(args.test,
                                     args.test_file_type,
                                     args.saved_parameters_dir,
                                     args.saved_vocab_test,
                                     args.external_embedding,
                                     args.external_embedding_type,
                                     args.stemmer,
                                     True)
    """

    # combine the test & train vocabularies (test vocabulary extraction above is currently disabled)
    w2i = train_w2i
    """
    for token in test_w2i:
        if token not in w2i:
            w2i[token] = len(w2i)
    """

    # Only the metrics and the fold splitter below are actually used in this function.
    from sklearn.metrics import roc_auc_score, average_precision_score
    from sklearn.model_selection import KFold

    roc_scores = []
    pr_scores = []

    whole_sentences = __load_sentences(args, args.permission_type, "ACNET")
    whole_sentences = np.array(whole_sentences)
    random.shuffle(whole_sentences)

    all_predictions = []

    kfold = KFold(n_splits=10, shuffle=True, random_state=1)
    for train, test in kfold.split(whole_sentences):
        print('Similarity Experiment')

        model = SimilarityExperiment(w2i, args)
        test_sentences = whole_sentences[test]
        train_sentences = whole_sentences[train]

        __train(model, train_sentences)
        __predict(model, test_sentences)

        predictions = [r.prediction_result for r in test_sentences]
        gold = []
        for r in test_sentences:
            if r.mark:
                gold.append(1)
            else:
                gold.append(0)

        y_true = np.array(gold)
        y_scores = np.array(predictions)

        roc_auc = roc_auc_score(y_true, y_scores)
        pr_auc = average_precision_score(y_true, y_scores)

        roc_scores.append(roc_auc)
        pr_scores.append(pr_auc)

        for r in test_sentences:
            mark = 1 if r.mark else 0
            all_predictions.append([
                r.sentence, r.preprocessed_sentence, mark, r.prediction_result
            ])

    # model is still bound to the last fold here; options are shared across folds.
    roc_pr_out_path = os.path.join(model.options.outdir, "roc_auc.txt")
    with open(roc_pr_out_path, "w") as target:
        target.write("ROC-AUC {}\n".format(sum(roc_scores) / len(roc_scores)))
        target.write("PR-AUC {}\n".format(sum(pr_scores) / len(pr_scores)))

    predictions_path = os.path.join(model.options.outdir, "predicted_file.txt")
    with open(predictions_path, "w") as target:
        for p in all_predictions:
            # p mixes strings and numbers, so stringify each field before joining.
            target.write("{}\n".format(",".join(str(x) for x in p)))
Example #7
    def __save_model(self):
        # Persist the (possibly updated) external embeddings so a later run
        # can reload them from the cached prevectors file (see Example #1's load_embeddings).
        IOUtils.save_embeddings(
            os.path.join(self.options.saved_parameters_dir,
                         self.options.saved_prevectors), self.ext_embeddings)