def mapping_ap(ap_file, label, sent_file, tag_file, n, port):
    c = 0
    sNLP = StanfordNLP(port=port)
    common_aspects = topap(tag_file, n)
    with open(ap_file, "w") as f:
        csvwriter = csv.writer(f)
        with open(sent_file, "r") as g:
            for line in g:
                sent = line.strip()
                psent = remove_symbols(sent)
                wds = []
                pos = []
                for ap in common_aspects:
                    # Keep sentences of 5-100 tokens that mention this aspect
                    if re.search(r"\b%s\b" % ap, sent, flags=re.I) and 5 <= len(psent.split()) <= 100:
                        if not check_overlap(common_aspects, ap, sent):
                            # POS-tag the sentence only once per line
                            if len(wds) == len(pos) == 0:
                                wds, pos = zip(*sNLP.pos(sent))
                                wds = [wd.lower() for wd in wds]
                            try:
                                # Both the first and last word of the aspect must be nouns
                                if pos[wds.index(ap.split()[0])] in ["NN", "NNS", "NNP", "NNPS"] and \
                                        pos[wds.index(ap.split()[-1])] in ["NN", "NNS", "NNP", "NNPS"]:
                                    ap_rep = re.sub(r" ", r"_", ap)
                                    sent_rep = re.sub(r"\b%s\b" % ap, ap_rep, sent, flags=re.I)
                                    if len(pos) == wds.index(ap.split()[-1]) + 1:
                                        # The aspect ends the sentence
                                        csvwriter.writerow([sent_rep, ap_rep, label])
                                        c += 1
                                        if c % 10000 == 0:
                                            print("Processing %d lines" % c)
                                    elif pos[wds.index(ap.split()[-1]) + 1] not in ["NN", "NNS", "NNP", "NNPS"]:
                                        # The aspect is not part of a longer noun compound
                                        csvwriter.writerow([sent_rep, ap_rep, label])
                                        c += 1
                                        if c % 10000 == 0:
                                            print("Processing %d lines" % c)
                            except ValueError:
                                # A word of the aspect was not found verbatim in the tokenized sentence
                                print(sent)
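# Usage sketch (illustrative only: the paths, label value, and aspect count
# below are hypothetical, not taken from this codebase):
#
#     mapping_ap(ap_file="./data/train_aspects.csv",    # output CSV rows: sentence, aspect, label
#                label=1,                                # e.g. 1 for the positive split
#                sent_file="./data/positive_sents.txt",  # one sentence per line
#                tag_file="./data/tag_count_dict.pkl",   # tag counts consumed by topap()
#                n=100,                                  # keep the 100 most frequent aspects
#                port=9000)                              # CoreNLP server port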
def sftokenize(readfile, port=9000):
    from corenlpsf import StanfordNLP
    sNLP = StanfordNLP(port=port)
    data = []
    c = 0
    with open(readfile, "r") as f:
        for line in f:
            sents = sNLP.ssplit(line)
            for sent in sents:
                psent = remove_symbols(" ".join(sent))
                if len(psent.split()) >= 2 and len(" ".join(sent)) >= 6:
                    data.append(" ".join(sent))
                    c += 1
                    if c % 10000 == 0:
                        print("Processing %d sentences" % c)
    return set(data)
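# Usage sketch (the input path is hypothetical): sentence-split a raw review
# file, then write the deduplicated sentences out. Returning a set() drops
# duplicate sentences across reviews at the cost of the original ordering.
#
#     sentences = sftokenize("./data/raw_reviews.txt", port=9000)
#     with open("./data/all_sents.txt", "w") as f:
#         for sent in sorted(sentences):
#             f.write(sent + "\n")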
class ABSA(object):
    def __init__(self,
                 model_args="./results/booking_lstm_v4_ps.args",
                 ap_file="/media/data/hotels/booking_v2/processed/extracted_tag/tag_aspects.csv",
                 use_cuda=False,
                 port=9000,
                 thres=5):
        margs = SaveloadHP.load(model_args)
        margs.use_cuda = use_cuda
        # Invert the label-to-index vocabulary (index -> label)
        self.i2l = {}
        for k, v in margs.vocab.l2i.items():
            self.i2l[v] = k
        print("Load Model from file: %s" % margs.model_name)
        self.classifier = Classifier(margs)
        self.classifier.model.load_state_dict(torch.load(margs.model_name))
        self.classifier.model.to(self.classifier.device)
        self.common_aspects = self.read_aspect(ap_file, thres)
        self.sNLP = StanfordNLP(port=port)

    def read_aspect(self,
                    readfile="/media/data/hotels/booking_v2/processed/extracted_tag/tag_aspects.csv",
                    thres=5):
        # Keep aspects seen at least `thres` times, ordered by frequency
        aspects = Counter()
        with open(readfile, "r") as f:
            csvreader = csv.reader(f)
            for row in csvreader:
                wd, cnt = row
                if int(cnt) >= thres:
                    aspects[wd] += int(cnt)
        common_aspects = []
        for a in aspects.most_common():
            common_aspects.append(a[0])
        return common_aspects

    def predict(self, date, review, rating):
        sa_info = defaultdict(list)
        sentences = self.sNLP.ssplit(review)
        for sentence in sentences:
            sentence = " ".join(sentence)
            print(100 * "=")
            print("[SENTENCE] %s" % sentence)
            print(100 * "=")
            # TODO: improve this NP extraction
            NPs = extract_np(sentence, self.sNLP)
            NPs = list(set(NPs))
            if len(NPs) != 0:
                print("(0)[NP_TRUNCATE] List of noun phrases: %s\n" % ", ".join(NPs))
            else:
                print("(0)[NP_TRUNCATE] List of noun phrases: NULL\n")
            if len(NPs) != 0:
                for aspect in NPs:
                    sent_rep, asp_rep = process_sent_ap(sentence, aspect)
                    print(100 * "-")
                    label_prob, label_pred = self.classifier.predict(sent_rep, asp_rep, len(self.i2l))
                    if aspect in self.common_aspects:
                        print("\t(1)[SA_PREDICTION] Polarity score of aspect '%s' is %f"
                              % (aspect, label_prob.item()))
                        sa_info[aspect].append((date, aspect, label_prob.item(), sentence, rating))
                    else:
                        # print("\t(1)[SA_PREDICTION] Polarity score of aspect 'UND_%s' is %f" % (
                        #     aspect, label_prob.item()))
                        # sa_info["UND"].append((date, aspect, label_prob.item(), sentence, rating))
                        if len(aspect.split()) >= 2:
                            # Unknown multi-word aspect: back off to a sub-phrase
                            pos = self.sNLP.pos(aspect)
                            s, t = zip(*pos)
                            if "IN" in t:
                                # Try the phrase before the preposition, then the last word
                                subap = " ".join(s[:t.index("IN")])
                                if subap in self.common_aspects:
                                    print("\t(1)[SA_PREDICTION] Polarity score of aspect '%s' is %f"
                                          % (subap, label_prob.item()))
                                    sa_info[subap].append((date, aspect, label_prob.item(), sentence, rating))
                                else:
                                    subap = aspect.split()[-1]
                                    if subap in self.common_aspects:
                                        print("\t(1)[SA_PREDICTION] Polarity score of aspect '%s' is %f"
                                              % (subap, label_prob.item()))
                                        sa_info[subap].append((date, aspect, label_prob.item(), sentence, rating))
                                    else:
                                        print("\t(1)[SA_PREDICTION] Polarity score of aspect 'UND_%s' is %f"
                                              % (aspect, label_prob.item()))
                                        sa_info["UND"].append((date, "UND_" + aspect, label_prob.item(), sentence, rating))
                            else:
                                # No preposition: try the last word, then the first word
                                subap = aspect.split()[-1]
                                if subap in self.common_aspects:
                                    print("\t(1)[SA_PREDICTION] Polarity score of aspect '%s' is %f"
                                          % (subap, label_prob.item()))
                                    sa_info[subap].append((date, aspect, label_prob.item(), sentence, rating))
                                else:
                                    subap = aspect.split()[0]
                                    if subap in self.common_aspects:
                                        print("\t(1)[SA_PREDICTION] Polarity score of aspect '%s' is %f"
                                              % (subap, label_prob.item()))
                                        sa_info[subap].append((date, aspect, label_prob.item(), sentence, rating))
                                    else:
                                        print("\t(1)[SA_PREDICTION] Polarity score of aspect 'UND_%s' is %f"
                                              % (aspect, label_prob.item()))
                                        sa_info["UND"].append((date, "UND_" + aspect, label_prob.item(), sentence, rating))
                        else:
                            # Unknown single-word aspect
                            print("\t(1)[SA_PREDICTION] Polarity score of aspect 'UND_%s' is %f"
                                  % (aspect, label_prob.item()))
                            sa_info["UND"].append((date, "UND_" + aspect, label_prob.item(), sentence, rating))
            else:
                # No noun phrase found: score the whole sentence against a NULL aspect
                aspect = "NULL"
                sent_rep, asp_rep = process_sent_ap(sentence, aspect)
                print(100 * "-")
                label_prob, label_pred = self.predict_null(sent_rep, asp_rep)
                print("\t(1)[SA_PREDICTION] Polarity score of aspect '%s' is %f"
                      % (aspect, label_prob.item()))
                sa_info[aspect].append((date, aspect, label_prob.item(), sentence, rating))
        return sa_info

    def predict_null(self, sent, asp):
        wl = self.classifier.args.vocab.wl
        # Set the model in eval mode
        self.classifier.model.eval()
        fake_label = [0]
        words, asp_loc = self.classifier.word2idx(sent, asp)
        word_ids, sequence_lengths = seqPAD.pad_sequences([words], pad_tok=0, wthres=wl)
        data_tensors = Data2tensor.sort_tensors(fake_label, [asp_loc], word_ids,
                                                sequence_lengths, self.classifier.device)
        fake_label_tensor, aspect_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
        # Mean-pool the RNN hidden states over time before classification
        word_h_n = self.classifier.model.rnn.get_all_hiddens(word_tensor, sequence_lengths).mean(1)
        label_score = self.classifier.model.hidden2tag(word_h_n)
        label_score = self.classifier.model.dropfinal(label_score)
        label_prob, label_pred = self.classifier.model.inference(label_score, len(self.i2l))
        return label_prob, label_pred
""" Created on 2018-09-07 @author: duytinvo """ import os import json from process_data import remove_symbols from corenlpsf import StanfordNLP sNLP = StanfordNLP() def tripadvisor_allsents( kdd11_path, wfile="/media/data/hotels/kdd11/extracted_rev/kdd11_reviews.txt"): count = 0 kdd11_basenames = os.listdir(kdd11_path) with open(wfile, "w") as g: for kdd11_basename in kdd11_basenames: filename = os.path.join(kdd11_path, kdd11_basename) with open(filename, "r", encoding='utf-8') as f: try: d = json.load(f) revs = d["Reviews"] for rev in revs: con = rev.get("Content", "") if con.find("showReview(") != -1: continue if len(con) > 0: sents = [" ".join(c) for c in sNLP.ssplit(con)] for sent in sents: psent = remove_symbols(sent)
""" Created on 2018-08-31 @author: duytinvo """ from corenlpsf import count_tags, StanfordNLP if __name__ == "__main__": """ python tag_count.py --rfile /media/data/hotels/booking_v3/processed/extracted_sent/booking.positive.set.sort.txt --wfile /media/data/hotels/booking_v3/processed/extracted_tag/tag_count_dict.positive.pkl """ import argparse argparser = argparse.ArgumentParser() argparser.add_argument('--rfile', help='read file', default="/media/data/hotels/kdd11/processed/extracted_sent/kdd11_all_sents.txt", type=str) argparser.add_argument('--wfile', help='writen file', default="/media/data/hotels/kdd11/processed/extracted_tag/kdd11_tag_count_dict.pkl", type=str) argparser.add_argument('--port', help='port number', default=9000, type=int) args = argparser.parse_args() # rfile = "/media/data/hotels/kdd11/processed/extracted_sent/kdd11_all_sents.txt" # wfile = "/media/data/hotels/kdd11/processed/extracted_tag/tag_count_dict.pkl" sNLP = StanfordNLP(port=args.port) pos_dict, pos_count = count_tags(args.wfile, args.rfile, sNLP)