parser.add_argument("-e", "--inc_sen", help="json file of list of words for incidents", default="inc_sen.json") args, unknow_args = parser.parse_known_args() sentence = args.sentence inc_id = int(args.incident) sif_file = args.sif w2v_file = args.w2v vec_file = args.vec debug = args.debug all_ids_file = args.all_ids inc_sen_file = args.inc_sen s_util = WordSentenceUtils() sif = ResSIF() sif.load_sif(sif_file) w2v = ResNLP() w2v.load_model(w2v_file) vec = ResSen2Vec(w2v.word2vec, sif) vec.load_s2v(vec_file) inc_vec = vec.get_incident_vec(str(inc_id)) sen_vec = vec.get_vec_for_sentence(sentence) u = [] with open("vec_en_new_pcs.json", "r") as infile: u = json.load(infile) u = np.multiply(u, np.transpose(u))
"--w2v", help="trained word2vec model", default=FileManage.DEFAULT_NLP_FILE) parser.add_argument("-v", "--verbose", action="store_true") args, unknown_args = parser.parse_known_args() COEFFICIENT_TEMPLATE = u"""\t\t%-30s %s""" sen1 = args.sentence1 sen2 = args.sentence2 sif_file = args.sif w2v_file = args.w2v verbose = args.verbose sif = ResSIF() sif.load_sif(sif_file) w2v = ResNLP() w2v.load_model(w2v_file) s2v = ResSen2Vec(w2v=w2v.word2vec, sif=sif) sim = s2v.get_similarity(sen1, sen2) s_util = WordSentenceUtils() words_1 = s_util.get_words(sen1) words_2 = s_util.get_words(sen2) print("\nsen_sim:")
parser.add_argument("-s", "--sif",
                    help="sif file serialized using python pickle",
                    default=FileManage.DEFAULT_SIF_FILE)
parser.add_argument("-w", "--w2v",
                    help="trained word2vec model",
                    default=FileManage.DEFAULT_NLP_FILE)
parser.add_argument("-v", "--vec",
                    help="saved vectors for incidents",
                    default=FileManage.DEFAULT_VEC_FILE)
args, unknown_args = parser.parse_known_args()

sentence = args.sentence
sif_file = args.sif
w2v_file = args.w2v
vec_file = args.vec
num = int(args.num)

sif = ResSIF()
sif.load_sif(sif_file)
w2v = ResNLP()
w2v.load_model(w2v_file)
vec = ResSen2Vec(w2v.word2vec, sif)
vec.load_s2v(vec_file)

closest = vec.get_closest(sentence, num)

print("Find top {} closest incidents: ".format(num))
print("------------------------------")
print("\n")
for inc in closest:
    print(inc)  # hypothetical body; the original loop body is not shown
# check_sif.py input_word -s (optional) sif_file
from fn_machine_learning_nlp.lib.file_manage import FileManage
from fn_machine_learning_nlp.lib.nlp.res_sif import ResSIF
from fn_machine_learning_nlp.lib.nlp.res_sen2vec import ResSen2Vec
import argparse

parser = argparse.ArgumentParser(description="Find word count from SIF")
parser.add_argument("word",
                    help="input word")
parser.add_argument("-s", "--sif",
                    help="sif file serialized using python pickle",
                    default=FileManage.DEFAULT_SIF_FILE)
args, unknown_args = parser.parse_known_args()

word = args.word
sif_file = args.sif

print("check-sif:")
print("----------")
print("Check SIF word count for '{}' using sif file {}:\n".format(word, sif_file))

sif = ResSIF()
sif.load_sif(sif_file)
count = sif.get_word_count(word)
# SIF weighting: a / (a + count), so frequent words get small coefficients
coefficient = ResSen2Vec.SIF_A / (ResSen2Vec.SIF_A + count)

print("\tword count:\t\t\t{}".format(count))
print("\tcoefficient:\t\t\t{}".format(coefficient))
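# --- Illustrative sketch, not part of the library ---
# To make the coefficient concrete: SIF down-weights frequent words via
# a / (a + count). Assuming a small constant for ResSen2Vec.SIF_A (the real
# value lives in the library), the weight falls off quickly with word count:
SIF_A = 0.001  # assumed value, for illustration only
for count in (0, 1, 10, 1000):
    print("count=%-6d coefficient=%.6f" % (count, SIF_A / (SIF_A + count)))
# rare words keep a weight near 1; very common words contribute almost nothing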
def get_incident_href(nlp_str, res_client, num_return, model_path, inc_id):
    """
    For the given nlp_str, find the top num_return (old) incidents that are
    similar to it (from an NLP point of view), and generate the href link
    for each returned incident.
    :param nlp_str: input sentence for the nlp search
    :param res_client: resilient client
    :param num_return: number of closest incidents to return
    :param model_path: (required) path to the saved model
    :param inc_id: (new) incident id; don't include it in the return
    :return: list of dicts with incident link, similarity, and keywords
    """
    file_path = model_path
    if not file_path.endswith('/'):
        file_path += '/'

    sif_file = FileManage.DEFAULT_SIF_FILE
    w2v_file = FileManage.DEFAULT_NLP_FILE
    vec_file = FileManage.DEFAULT_VEC_FILE
    pca_file = FileManage.DEFAULT_PCA_FILE

    model_files = os.listdir(model_path)
    # If a custom model name was used for the build, the files that make up
    # the model can still be identified by their standardized suffixes.
    for filename in model_files:
        if "-sif.pkl" in filename:
            sif_file = filename
        elif "-w2v.txt" in filename:
            w2v_file = filename
        elif "-vec.json" in filename:
            vec_file = filename
        elif "-pca.json" in filename:
            pca_file = filename

    # SIF (Smooth Inverse Frequency) word counts
    sif = ResSIF()
    sif.load_sif(os.path.join(file_path, sif_file))
    # word2vec NLP model
    nlp = ResNLP()
    nlp.load_model(os.path.join(file_path, w2v_file))
    # sentence-to-vector converter
    vec = ResSen2Vec(nlp.word2vec, sif)
    # load cached vectors for old incidents
    vec.load_s2v(os.path.join(file_path, vec_file))
    # load pca (principal component used for common-component removal)
    vec.load_pca(os.path.join(file_path, pca_file))

    # Find the highest inc id in the vec file. The vec file contains all the
    # incidents at the point the model was built; we want to find incidents
    # created after that.
    highest_id = vec.get_highest_inc_id()
    res_utils = ResUtils(resclient=res_client)
    other_incidents = res_utils.get_incidents_after(highest_id)

    incident_ids = vec.get_closest(nlp_str, other_incidents, num_return, inc_id)
    hrefs = [{"inc_link": make_incident_href(inc["ref"], res_client.org_id, res_client.base_url),
              "similarity": inc["sim"],
              "keywords": inc["keywords"]}
             for inc in incident_ids]
    return hrefs
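# --- Illustrative sketch, not part of the library ---
# A hypothetical call to get_incident_href; the client object, model path,
# and incident values below are placeholders, not values from the source:
links = get_incident_href(
    nlp_str="user reported phishing email with suspicious attachment",
    res_client=client,                  # an authenticated resilient client (assumed)
    num_return=5,
    model_path="/path/to/saved/model",  # placeholder path
    inc_id=2345)                        # the new incident, excluded from results
for link in links:
    print(link["similarity"], link["inc_link"])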
args, unknown_args = parser.parse_known_args()

inc_id = int(args.id)
sen_file = args.sentence
ids_file = args.ids
vec_file = args.vec
sif_file = args.sif

with open(sen_file, "r") as infile:
    sentences = json.load(infile)
with open(ids_file, "r") as infile:
    ids = json.load(infile)

vecs = None
sif = ResSIF()
loaded = sif.load_sif(sif_file)
if loaded:
    # print each word of the incident with its SIF count,
    # sorted from rarest to most common
    w_c = []
    sens = [sentences[i] for i in range(len(ids)) if ids[i] == inc_id]
    for w in sens[0]:
        w_c.append((w, sif.get_word_count(w)))
    w_c.sort(key=lambda u: u[1])
    for w in w_c:
        print("%-20s %d" % (w[0], w[1]))
else:
    # no SIF data available; just dump the incident's words
    for i in range(len(ids)):
        if ids[i] == inc_id:
            print(sentences[i])
class ResNLP(NLPWord2Vec):
    def __init__(self, inc_file=None, art_file=None, in_log=None):
        self.inc_file = inc_file
        self.art_file = art_file
        self.dataframe = None
        self.artifacts = None
        self.sif = ResSIF()
        self.log = in_log if in_log else logging.getLogger(__name__)
        self.inc_ids = []
        super(ResNLP, self).__init__()

    def load_data(self):
        """
        Template method to load data
        :return:
        """
        self.dataframe = pds.read_csv(
            self.inc_file,
            sep=',',
            usecols=["id", "name", "description", "resolution_summary"],
            skipinitialspace=True,
            quotechar='"')
        try:
            # The artifacts are fetched using /search_ex. Make sure it is there.
            if self.art_file:
                with open(self.art_file, "r") as art_input:
                    self.artifacts = json.load(art_input)
        except Exception as e:
            self.artifacts = None
            self.log.info("Failed to load artifact file: {}".format(self.art_file))

    def preprocess_data(self):
        """
        Template method to preprocess data
        :return:
        """
        self.dataset = []
        self.inc_ids = []
        word_utils = WordSentenceUtils()
        row_count = self.dataframe.shape[0]
        for index in range(row_count):
            row = self.dataframe.iloc[index]
            #
            # Retrieve the name, description, and resolution_summary from an incident
            #
            sentence = str(row["name"]) + " " + str(row["description"]) + " " + str(row["resolution_summary"])
            #
            # Retrieve the artifact value and description for the incident
            #
            inc_id = int(row["id"])
            if self.artifacts is not None:
                artifact_des = ResUtils.get_artifact_des(inc_id, self.artifacts)
                sentence += artifact_des

            ws = word_utils.get_words(sentence)
            self.inc_ids.append(inc_id)
            self.dataset.append(ws)

    def build(self):
        """
        Build word2vec and SIF
        :return:
        """
        #
        # Build gensim word2vec model
        #
        self.build_model()
        #
        # Build SIF
        #
        self.sif.build_sif(self.dataset)

    def save(self, w2v_file=None, sif_file=None, s2v_file=None):
        """
        Save word2vec, SIF, and the sentence-vector cache
        :return:
        """
        #
        # Save gensim.word2vec
        #
        w2vfile = w2v_file if w2v_file else FileManage.DEFAULT_NLP_FILE
        self.save_model(w2vfile)
        #
        # Save SIF data
        #
        siffile = sif_file if sif_file else FileManage.DEFAULT_SIF_FILE
        self.sif.save_sif(siffile)
        #
        # Save vec cache
        #
        s2vfile = s2v_file if s2v_file else FileManage.DEFAULT_VEC_FILE
        sen2vec = ResSen2Vec(w2v=self.word2vec, sif=self.sif, log=self.log)
        sen2vec.cache_sentence_vectors(self.dataset, self.inc_ids, s2vfile)
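# --- Illustrative sketch, not part of the library ---
# ResNLP follows a template-method flow: load, preprocess, build, save. A
# hypothetical end-to-end training run; the input file names are placeholders:
nlp = ResNLP(inc_file="incidents.csv", art_file="artifacts.json")
nlp.load_data()        # read the incident CSV (and artifacts, if present)
nlp.preprocess_data()  # tokenize name/description/resolution (+ artifacts)
nlp.build()            # train gensim word2vec and build the SIF counts
nlp.save()             # write the w2v, sif, and cached sentence vectors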