import csv
import re

from corenlpsf import StanfordNLP
from process_data import remove_symbols
# topap and check_overlap are helpers from the surrounding project modules.

NOUN_TAGS = ["NN", "NNS", "NNP", "NNPS"]


def mapping_ap(ap_file, label, sent_file, tag_file, n, port):
    """Map the n most frequent aspects onto sentences from sent_file and
    write (sentence, aspect, label) rows to ap_file."""
    c = 0
    sNLP = StanfordNLP(port=port)
    common_aspects = topap(tag_file, n)
    with open(ap_file, "w", newline="") as f:
        csvwriter = csv.writer(f)
        with open(sent_file, "r") as g:
            for line in g:
                sent = line.strip()
                psent = remove_symbols(sent)
                wds, pos = [], []
                for ap in common_aspects:
                    # Keep 5-100 token sentences that contain the aspect as
                    # a whole word (re.escape guards against regex
                    # metacharacters in the aspect string).
                    if not (re.search(r"\b%s\b" % re.escape(ap), sent,
                                      flags=re.I)
                            and 5 <= len(psent.split()) <= 100):
                        continue
                    if check_overlap(common_aspects, ap, sent):
                        continue
                    # POS-tag the sentence lazily, at most once per line.
                    if len(wds) == len(pos) == 0:
                        wds, pos = zip(*sNLP.pos(sent))
                        wds = [wd.lower() for wd in wds]
                    try:
                        first, last = ap.split()[0], ap.split()[-1]
                        # Both boundary tokens of the aspect must be nouns.
                        if pos[wds.index(first)] in NOUN_TAGS and \
                                pos[wds.index(last)] in NOUN_TAGS:
                            # Join multi-word aspects with underscores,
                            # e.g. "room service" -> "room_service".
                            ap_rep = ap.replace(" ", "_")
                            sent_rep = re.sub(r"\b%s\b" % re.escape(ap),
                                              ap_rep, sent, flags=re.I)
                            last_idx = wds.index(last)
                            # Accept the match only if the aspect is not the
                            # prefix of a longer noun phrase: it either ends
                            # the sentence or precedes a non-noun token.
                            if len(pos) == last_idx + 1 or \
                                    pos[last_idx + 1] not in NOUN_TAGS:
                                csvwriter.writerow([sent_rep, ap_rep, label])
                                c += 1
                                if c % 10000 == 0:
                                    print("Processing %d lines" % c)
                    except ValueError:
                        # The aspect token was not found in the tagged word
                        # list (tokenization mismatch); log and move on.
                        print(sent)
def sftokenize(readfile, port=9000):
    """Split each line of readfile into sentences with CoreNLP and return
    the unique sentences that pass minimal length filters."""
    sNLP = StanfordNLP(port=port)
    data = []
    c = 0
    with open(readfile, "r") as f:
        for line in f:
            sents = sNLP.ssplit(line)
            for sent in sents:
                joined = " ".join(sent)
                psent = remove_symbols(joined)
                # Keep sentences with at least 2 symbol-free tokens and at
                # least 6 characters overall.
                if len(psent.split()) >= 2 and len(joined) >= 6:
                    data.append(joined)
                    c += 1
                    if c % 10000 == 0:
                        print("Processing %d sentences" % c)
    return set(data)
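
A hypothetical end-to-end driver for the two helpers above; every path and parameter below is an illustrative placeholder, not a value from the source:

if __name__ == "__main__":
    # Split raw review lines into unique sentences, persist them, then
    # label occurrences of the top-500 aspects.
    sents = sftokenize("reviews.txt", port=9000)
    with open("sentences.txt", "w") as f:
        for s in sents:
            f.write(s + "\n")
    mapping_ap(ap_file="aspects.positive.csv", label="positive",
               sent_file="sentences.txt",
               tag_file="tag_count_dict.positive.pkl", n=500, port=9000)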
Example #4
class ABSA(object):
    """Aspect-based sentiment analysis: split a review into sentences,
    extract noun-phrase aspects, and score each aspect's polarity."""
    def __init__(
            self,
            model_args="./results/booking_lstm_v4_ps.args",
            ap_file="/media/data/hotels/booking_v2/processed/extracted_tag/tag_aspects.csv",
            use_cuda=False,
            port=9000,
            thres=5):
        margs = SaveloadHP.load(model_args)
        margs.use_cuda = use_cuda

        # Invert the label-to-index vocabulary for decoding predictions.
        self.i2l = {}
        for k, v in margs.vocab.l2i.items():
            self.i2l[v] = k
        print("Loading model from file: %s" % margs.model_name)
        self.classifier = Classifier(margs)
        self.classifier.model.load_state_dict(torch.load(margs.model_name))
        self.classifier.model.to(self.classifier.device)
        self.common_aspects = self.read_aspect(ap_file, thres)
        self.sNLP = StanfordNLP(port=port)

    def read_aspect(
            self,
            readfile="/media/data/hotels/booking_v2/processed/extracted_tag/tag_aspects.csv",
            thres=5):
        """Read (aspect, count) rows from readfile and return the aspects
        whose count reaches thres, from most to least frequent."""
        aspects = Counter()
        with open(readfile, "r") as f:
            csvreader = csv.reader(f)
            for row in csvreader:
                wd, cnt = row
                if int(cnt) >= thres:
                    aspects[wd] += int(cnt)
        return [a for a, _ in aspects.most_common()]
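
    # The aspect file is assumed to be a headerless two-column CSV of
    # (aspect, count) rows, e.g. (hypothetical values):
    #   room,1250
    #   front desk,312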

    def _record(self, sa_info, key, shown, stored, date, prob, sentence,
                rating):
        """Print one prediction and append it under the given aspect key."""
        print("\t(1)[SA_PREDICTION] Polarity score of aspect '%s' is %f" %
              (shown, prob))
        sa_info[key].append((date, stored, prob, sentence, rating))

    def predict(self, date, review, rating):
        """Split a review into sentences, score every noun-phrase aspect,
        and collect the results keyed by matched aspect."""
        sa_info = defaultdict(list)
        sentences = self.sNLP.ssplit(review)
        for sentence in sentences:
            sentence = " ".join(sentence)
            print(100 * "=")
            print("[SENTENCE] %s" % sentence)
            print(100 * "=")
            # TODO Improve this NN extraction
            NPs = list(set(extract_np(sentence, self.sNLP)))
            if len(NPs) != 0:
                print("(0)[NP_TRUNCATE] List of noun phrases: %s\n" %
                      ", ".join(NPs))
                for aspect in NPs:
                    sent_rep, asp_rep = process_sent_ap(sentence, aspect)
                    print(100 * "-")
                    label_prob, label_pred = self.classifier.predict(
                        sent_rep, asp_rep, len(self.i2l))
                    prob = label_prob.item()
                    if aspect in self.common_aspects:
                        self._record(sa_info, aspect, aspect, aspect, date,
                                     prob, sentence, rating)
                        continue
                    # Back off for unseen aspects: try informative sub-spans
                    # of a multi-word aspect before giving up.
                    key = None
                    if len(aspect.split()) >= 2:
                        s, t = zip(*self.sNLP.pos(aspect))
                        if "IN" in t:
                            # Head phrase before the preposition, then the
                            # last token.
                            candidates = [" ".join(s[:t.index("IN")]),
                                          aspect.split()[-1]]
                        else:
                            # Last token (likely the head noun), then the
                            # first token.
                            candidates = [aspect.split()[-1],
                                          aspect.split()[0]]
                        key = next((c for c in candidates
                                    if c in self.common_aspects), None)
                    if key is not None:
                        self._record(sa_info, key, key, aspect, date, prob,
                                     sentence, rating)
                    else:
                        # No known sub-aspect: file under the undefined
                        # ("UND") bucket.
                        self._record(sa_info, "UND", "UND_" + aspect,
                                     "UND_" + aspect, date, prob, sentence,
                                     rating)
            else:
                print("(0)[NP_TRUNCATE] List of noun phrases: NULL\n")
                # No noun phrase extracted: score the whole sentence with a
                # "NULL" aspect.
                aspect = "NULL"
                sent_rep, asp_rep = process_sent_ap(sentence, aspect)
                print(100 * "-")
                label_prob, label_pred = self.predict_null(sent_rep, asp_rep)
                self._record(sa_info, aspect, aspect, aspect, date,
                             label_prob.item(), sentence, rating)
        return sa_info

    def predict_null(self, sent, asp):
        """Score a sentence that has no extracted aspect by mean-pooling the
        RNN hidden states over all time steps."""
        wl = self.classifier.args.vocab.wl
        # Put the model in eval mode.
        self.classifier.model.eval()
        fake_label = [0]
        words, asp_loc = self.classifier.word2idx(sent, asp)
        word_ids, sequence_lengths = seqPAD.pad_sequences([words],
                                                          pad_tok=0,
                                                          wthres=wl)
        data_tensors = Data2tensor.sort_tensors(fake_label, [asp_loc],
                                                word_ids, sequence_lengths,
                                                self.classifier.device)
        fake_label_tensor, aspect_tensor, word_tensor, sequence_lengths, \
            word_seq_recover = data_tensors
        # With no real aspect in the sentence, average the hidden states of
        # every position before classification.
        word_h_n = self.classifier.model.rnn.get_all_hiddens(
            word_tensor, sequence_lengths).mean(1)
        label_score = self.classifier.model.hidden2tag(word_h_n)
        label_score = self.classifier.model.dropfinal(label_score)
        label_prob, label_pred = self.classifier.model.inference(
            label_score, len(self.i2l))
        return label_prob, label_pred
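
A minimal usage sketch for the ABSA class above (the aspect-file path, review text, date, and rating are illustrative placeholders; the model path is the constructor's default):

if __name__ == "__main__":
    absa = ABSA(model_args="./results/booking_lstm_v4_ps.args",
                ap_file="./tag_aspects.csv",
                use_cuda=False, port=9000, thres=5)
    # predict returns {aspect_key: [(date, aspect, polarity, sentence,
    # rating), ...]}; unmatched aspects land under "UND" and aspect-less
    # sentences under "NULL".
    sa_info = absa.predict(date="2018-09-07",
                           review="The staff were friendly but the room was tiny.",
                           rating=7)
    for key, mentions in sa_info.items():
        print(key, mentions)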
"""
Created on 2018-09-07
@author: duytinvo
"""
import os
import json
from process_data import remove_symbols
from corenlpsf import StanfordNLP
sNLP = StanfordNLP()


def tripadvisor_allsents(
        kdd11_path,
        wfile="/media/data/hotels/kdd11/extracted_rev/kdd11_reviews.txt"):
    count = 0
    kdd11_basenames = os.listdir(kdd11_path)
    with open(wfile, "w") as g:
        for kdd11_basename in kdd11_basenames:
            filename = os.path.join(kdd11_path, kdd11_basename)
            with open(filename, "r", encoding='utf-8') as f:
                try:
                    d = json.load(f)
                    revs = d["Reviews"]
                    for rev in revs:
                        con = rev.get("Content", "")
                        # Skip entries whose "Content" is leftover
                        # JavaScript rather than review text.
                        if con.find("showReview(") != -1:
                            continue
                        if len(con) > 0:
                            sents = [" ".join(c) for c in sNLP.ssplit(con)]
                            for sent in sents:
                                psent = remove_symbols(sent)
                                # NOTE: the original snippet is truncated
                                # here; this ending is a reconstruction that
                                # mirrors the filters and progress logging
                                # used elsewhere in this module.
                                if len(psent.split()) >= 2:
                                    g.write(sent + "\n")
                                    count += 1
                                    if count % 10000 == 0:
                                        print("Processing %d sentences" %
                                              count)
                except (json.JSONDecodeError, KeyError):
                    # Assumed handler: skip files that do not parse as
                    # KDD'11 JSON review documents.
                    continue
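
tripadvisor_allsents assumes each file under kdd11_path is a JSON document shaped roughly like this (field names are taken from the code; the values are illustrative):

# {
#   "Reviews": [
#     {"Content": "Great location and friendly staff. ..."},
#     {"Content": "..."}
#   ]
# }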
Example #6
"""
Created on 2018-08-31
@author: duytinvo
"""
from corenlpsf import count_tags, StanfordNLP

if __name__ == "__main__":
    """
    python tag_count.py --rfile /media/data/hotels/booking_v3/processed/extracted_sent/booking.positive.set.sort.txt --wfile /media/data/hotels/booking_v3/processed/extracted_tag/tag_count_dict.positive.pkl
    """
    import argparse

    argparser = argparse.ArgumentParser()

    argparser.add_argument('--rfile', help='read file',
                           default="/media/data/hotels/kdd11/processed/extracted_sent/kdd11_all_sents.txt",
                           type=str)

    argparser.add_argument('--wfile', help='write file',
                           default="/media/data/hotels/kdd11/processed/extracted_tag/kdd11_tag_count_dict.pkl",
                           type=str)

    argparser.add_argument('--port', help='port number', default=9000, type=int)

    args = argparser.parse_args()

    # rfile = "/media/data/hotels/kdd11/processed/extracted_sent/kdd11_all_sents.txt"
    # wfile = "/media/data/hotels/kdd11/processed/extracted_tag/tag_count_dict.pkl"
    sNLP = StanfordNLP(port=args.port)
    pos_dict, pos_count = count_tags(args.wfile, args.rfile, sNLP)
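
Assuming count_tags pickles its tag-count dictionary to --wfile (the .pkl extension suggests so), the result can be inspected afterwards with a sketch like this:

    import pickle

    with open(args.wfile, "rb") as f:
        tag_counts = pickle.load(f)
    # Show the ten most frequent entries (assumes a {tag: count} mapping).
    print(sorted(tag_counts.items(), key=lambda kv: kv[1], reverse=True)[:10])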