Example #1
from nltk.tag.stanford import CoreNLPPOSTagger


def tag_tokens_using_stanford_corenlp(token_list, corenlp_server_address='http://localhost:9000'):
    # print("tag_tokens_using_stanford_corenlp started")

    tagger = CoreNLPPOSTagger(url=corenlp_server_address)

    # The code below works around a limitation of Stanford's CoreNLP server,
    # which only accepts 100000 characters per request: it splits the token
    # list into chunks of at most 6000 tokens, tags each chunk separately,
    # and concatenates the results into one list ('tagged_text').
    tagged_text = []
    txt_size = len(token_list)
    i = 0
    while i < txt_size:

        if i + 6000 >= txt_size:
            tokens_to_tag = token_list[i:txt_size]
            i = txt_size
        else:
            tokens_to_tag = token_list[i:i + 6000]
            i += 6000

        tagged_text += tagger.tag(tokens_to_tag)

    # print("tag_tokens_using_stanford_corenlp ended")

    return tagged_text
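
A minimal usage sketch for the helper above, assuming a CoreNLP server is already running at http://localhost:9000; the token list is made up for illustration:

tokens = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
tagged = tag_tokens_using_stanford_corenlp(tokens)
# tagged is a list of (token, tag) tuples, e.g. [("The", "DT"), ("quick", "JJ"), ...]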
Example #2
class StanfordPOSAnnotator(Annotator):
    def __init__(self, config_path=DEFAULT_CONFIG_PATH):
        self.config = load_config(config_path)
        corenlp_config = self.config["data"]["stanford_corenlp"]
        self.tagger = CoreNLPPOSTagger(
            url="http://%s:%d" %
            (corenlp_config["host"], corenlp_config["port"]))

        self.pos_map = self.config["model"]["STANFORD_POS_MAP"]

    def annotate(self, annotable):
        if annotable.__class__.__name__ == "Document":
            return self.annotate_document(annotable)
        elif annotable.__class__.__name__ == "Sentence":
            return self.annotate_sentence(annotable)
        else:
            raise AnnotationError(
                "This annotator only accepts Document or Sentence annotables.")

    def annotate_document(self, document):
        for sentence in document.sentences:
            self.annotate_sentence(sentence)

    def annotate_sentence(self, sentence):
        token_list = [token.surface for token in sentence.tokens]
        tagged_tokens = self.tagger.tag(token_list)

        for i, token in enumerate(sentence.tokens):
            stanford_tag = tagged_tokens[i][1]
            token.annotations["STANFORD_POS"] = stanford_tag

            # map the fine-grained Stanford tag to a coarse POS label
            for pos_rgx in self.pos_map:
                if re.match(pos_rgx, stanford_tag):
                    token.annotations["POS"] = self.pos_map[pos_rgx].split("|")[0]

            if "POS" not in token.annotations:
                token.annotations["POS"] = "x"
Example #3
from nltk.tag.stanford import CoreNLPNERTagger, CoreNLPPOSTagger
from nltk.tokenize.stanford import CoreNLPTokenizer

stpos = CoreNLPPOSTagger('http://localhost:9001')
stner = CoreNLPNERTagger('http://localhost:9001')
sttok = CoreNLPTokenizer('http://localhost:9001')

sttok.tokenize(u'你好')  # tokenize Chinese text with the CoreNLP tokenizer

stpos.tag(u'basf')  # note: a bare string is treated as a sequence of characters

stpos.tag(sttok.tokenize(u'text'))  # POS-tag a tokenized string

stner.tag(u'你好')  # NER on a bare string (again character by character)

stner.tag(sttok.tokenize(u'你好'))  # NER on tokenized Chinese text
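
A sketch of the typical call pattern with a tokenized English sentence, still assuming a CoreNLP server on port 9001; the sentence and the described output are illustrative:

tokens = sttok.tokenize(u'The quick brown fox jumps over the lazy dog .')
stpos.tag(tokens)  # returns a list of (token, POS tag) tuples
stner.tag(tokens)  # returns a list of (token, NER label) tuples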
Example #4
class SelectCandidates:
    """ This program aims to select candidate words from reviews
        We picks up sentiment words and handles the negation problem
        The result will be stored in `src_folder/lexicon/candidates.json`
    """

    def __init__(self, src_folder="../data/", freq_thre=100, corenlp_path="../stanford-corenlp/",\
                 ner_path="../stanford-ner/", verbose=False):

        # initialization
        self.src = os.path.join(src_folder, "reviews/")
        self.corenlp_path = os.path.normpath(corenlp_path) + "/"
        self.stanford_ner_path = os.path.normpath(ner_path) + "/"
        self.frequency_threshold = freq_thre
        self.dst = os.path.join(src_folder, "lexicon/candidates.json")
        self.dst_allReviews = os.path.join(src_folder, "allReviews/")
        self.dst_ner_tsv = os.path.join(src_folder, "ner_tsv/")
        self.dst_ne = os.path.join(src_folder, "ne/")
        self.verbose = verbose

        # POS tags used to pick out sentiment words (adjectives and adverbs)
        self.pos_tags = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS"]
        self.pos_tagged_statistics = {}

        # CoreNLP-based tagger, the newer interface to the Stanford POS tagger
        self.pos_tagger = CoreNLPPOSTagger()
        self.stemmer = SnowballStemmer("english")
        self.stopwords = set(stopwords.words("english"))
        # drop `not` from the stopword list because we need to combine `not` with sentiment words
        self.stopwords.remove("not")

    def stanford_ner(self):
        """ call stanford java ner api """

        self.merge_reviews()
        self.run_ner()
        self.find_named_entity()

    def merge_reviews(self):
        """ merge all reviews for named entity recognition """

        if self.verbose:
            print "Merging all reviews for named entity recognition" + "\n" + "-" * 80

        self.create_dir(self.dst_allReviews)

        for dirpath, dirs, files in os.walk(self.src):
            for f in files:
                filename = re.search(
                    "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).json",
                    f).group(1)
                data = json.load(open(os.path.join(dirpath, f)))
                with open(os.path.join(self.dst_allReviews, filename + ".txt"),
                          "w+") as rf:
                    for r in data["reviews"]:

                        text = r["review"]
                        # remove accents
                        text = unicodedata.normalize("NFKD", text).encode(
                            "ASCII", "ignore")
                        # remove all website urls written in the review
                        text = re.sub(r"https?:\/\/.*[\r\n]*",
                                      " ",
                                      text,
                                      flags=re.MULTILINE)
                        # keep only English letters, whitespace, and selected punctuation (drops digits and non-ASCII)
                        text = re.sub(
                            r"[^a-zA-Z!@#$%^&*():;/\\<>\"+_\-.,?=\s\|\']", "",
                            text)
                        # replace runs of newlines with a space
                        text = re.sub("(\\n)+", r" ", text)

                        # I'm -> I am
                        text = re.sub(r"'m ", " am ", text)
                        text = re.sub(r"'re ", " are ", text)
                        text = re.sub(r"'s ", " is ", text)
                        text = re.sub(r"'ve ", " have ", text)
                        text = re.sub(r"'d ", " would ", text)
                        text = re.sub(r" won't ", " will not ", text)
                        text = re.sub(r"n't ", " not ", text)
                        text = re.sub(r"'ll ", " will ", text)

                        # remove all punctuation except , . ? ! ; : and -
                        # (hyphens are kept for composite adjectives)
                        text = re.sub("[^\w\s,.?!;:\-]|\_", r" ", text)

                        # Space out every sign & symbol & punctuation
                        text = re.sub("([^\w\s])", r" \1 ", text)

                        text = text.replace("\'", "")
                        # remove ` - `, ` -`, `- `
                        text = re.sub(r"(\-)+", "-", text)
                        text = re.sub(
                            r"(\s)+\-(\s)+|(\s)+\-|\-(\s)+|(\A)\-|\-(\Z)", " ",
                            text)
                        # turn multiple spaces into one
                        text = re.sub(r"(\s)+", " ", text)
                        # remove extra space at both ends of the text
                        text = text.strip()

                        rf.write(text)
                        rf.write("\n\n. CHANGE-REVIEW .\n\n")

    def run_ner(self):
        """ run shell to call NER """

        if self.verbose:
            print "Running shell to call Stanford NER" + "\n" + "-" * 80

        self.create_dir(self.dst_ner_tsv)

        comm = "java -mx1g -cp \"%s*:%slib/*\" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier %sclassifiers/english.all.3class.distsim.crf.ser.gz -outputFormat tabbedEntities -textFile %s > %s"
        for dirpath, dirs, files in os.walk(self.dst_allReviews):
            for f in files:
                filename = re.search(
                    "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).txt",
                    f).group(1)
                src_file = os.path.join(dirpath, f)
                dst_file = os.path.join(self.dst_ner_tsv, filename + ".tsv")
                command = comm % (self.stanford_ner_path,
                                  self.stanford_ner_path,
                                  self.stanford_ner_path, src_file, dst_file)
                subprocess.call(command, shell=True)

    def find_named_entity(self):
        """ find named entity from the ner tsv """

        if self.verbose:
            print "Finding named entity from ner tsv files" + "\n" + "-" * 80

        self.create_dir(self.dst_ne)

        for dirpath, dirs, files in os.walk(self.dst_ner_tsv):
            for f in files:
                filename = re.search(
                    "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).tsv",
                    f).group(1)
                src_file = os.path.join(dirpath, f)
                dst_file = os.path.join(self.dst_ne, filename + ".txt")
                rs = [set()]

                with open(src_file, "rb") as tsvin:
                    data = csv.reader(tsvin, delimiter="\t")
                    for r in data:
                        if len(r) != 0 and r[0] != "":
                            if r[1] in ("ORGANIZATION", "PERSON", "LOCATION"):
                                # add every word of the entity to the set for
                                # the current review (the set deduplicates)
                                for word in r[0].split(" "):
                                    rs[-1].add((word, r[1]))
                        elif len(r) > 2 and "CHANGE-REVIEW" in r[2]:
                            rs.append(set())

                with open(dst_file, "w+") as rf:
                    for rs_index in range(len(rs) - 1):
                        rf.write(str(rs_index) + ",FILEINDEX\n")
                        for i in rs[rs_index]:
                            rf.write(i[0] + "," + i[1] + "\n")

    def get_sentiment_words(self):
        """ load all reviews in src folder: data/reviews/ and merge them """

        # start Stanford CoreNLP server in a new process
        comm = "java -mx4g -cp \"%s*\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos -status_port 9000 -port 9000 -timeout 50000"
        command = comm % (self.corenlp_path)
        proc = subprocess.Popen(command, shell=True, preexec_fn=os.setsid)
        time.sleep(10)  # wait for the Stanford CoreNLP server to start

        for dirpath, dir_list, file_list in os.walk(self.src):
            if self.verbose:
                print "Walking into directory: " + str(dirpath)

            if len(file_list) > 0:
                for f in file_list:
                    # skip any stray .DS_Store file (macOS metadata)
                    if str(f) == ".DS_Store":
                        if self.verbose:
                            print "Removing " + dirpath + "/" + str(f)
                        os.remove(os.path.join(dirpath, f))
                        continue
                    else:
                        with open(os.path.join(dirpath, f)) as fp:
                            entity = json.load(fp)

                    if self.verbose:
                        print "Processing " + "\033[1m" + entity[
                            "entity"] + "\033[0m" + " in " + "\033[1m" + entity[
                                "category"] + "\033[0m"
                    self.analyze_part_of_speech(entity["reviews"], f)
            else:
                if self.verbose:
                    print "No file is found in " + str(dirpath)

        os.killpg(os.getpgid(proc.pid), signal.SIGTERM)

        if self.verbose:
            print "Part of Speech Analysis on Reviews are Done"
            print "-" * 80

    def analyze_part_of_speech(self, reviews, filename):
        """ run nltk.pos_tag to analysis the part_of_speech of every word """

        ner_set = self.load_ner_tags(filename)

        for review_index in range(len(reviews)):

            text = reviews[review_index]["review"]
            # remove accents
            text = unicodedata.normalize("NFKD",
                                         text).encode("ASCII", "ignore")
            # remove all website urls written in the review
            text = re.sub(r"https?:\/\/.*[\r\n]*",
                          " ",
                          text,
                          flags=re.MULTILINE)
            # keep only English letters, whitespace, and selected punctuation (drops digits and non-ASCII)
            text = re.sub(r"[^a-zA-Z!@#$%^&*():;/\\<>\"+_\-.,?=\s\|\']", "",
                          text)
            # replace runs of newlines with a space
            text = re.sub("(\\n)+", r" ", text)

            # I'm -> I am
            text = re.sub(r"'m ", " am ", text)
            text = re.sub(r"'re ", " are ", text)
            text = re.sub(r"'s ", " is ", text)
            text = re.sub(r"'ve ", " have ", text)
            text = re.sub(r"'d ", " would ", text)
            text = re.sub(r" won't ", " will not ", text)
            text = re.sub(r"n't ", " not ", text)
            text = re.sub(r"'ll ", " will ", text)

            # remove all punctuation except , . ? ! ; : and -
            # (hyphens are kept for composite adjectives)
            text = re.sub("[^\w\s,.?!;:\-]|\_", r" ", text)

            # space out every sign & symbol & punctuation
            text = re.sub("([^\w\s])", r" \1 ", text)

            text = text.replace("\'", "")
            # remove ` - `, ` -`, `- `
            text = re.sub(r"(\-)+", "-", text)
            text = re.sub(r"(\s)+\-(\s)+|(\s)+\-|\-(\s)+|(\A)\-|\-(\Z)", " ",
                          text)
            # turn multiple spaces into one
            text = re.sub(r"(\s)+", " ", text)
            # remove extra space at both ends of the text
            text = text.strip()

            # tokenize
            tokenized_text = text.split(" ")
            # remove empty string
            tokenized_text = [w for w in tokenized_text if w]

            # pos tag
            # a list of word tuples # [("great", "JJ"), ("tour", "NN") ...]
            if len(tokenized_text) == 0:
                continue
            word_tuple_list = self.pos_tagger.tag(tokenized_text)

            # remove stop_words
            word_tuple_list = [(w[0].lower(), w[1]) for w in word_tuple_list
                               if w[0].lower() not in self.stopwords]
            # remove empty string
            word_tuple_list = [(w[0], w[1]) for w in word_tuple_list if w[0]]

            # combine runs of consecutive sentiment words (including "not",
            # which is tagged RB) into one underscore-joined candidate,
            # e.g. "not good" -> "not_good"
            combine_or_not = False
            combination_front = ""
            for word_tuple in word_tuple_list:
                # a non-sentiment tag ends the current run: count the
                # combined candidate (add 1 if it exists, else create it)
                if word_tuple[1] not in self.pos_tags:
                    if combine_or_not:
                        if combination_front in self.pos_tagged_statistics:
                            self.pos_tagged_statistics[combination_front] += 1
                        else:
                            self.pos_tagged_statistics[combination_front] = 1
                        combine_or_not = False
                        combination_front = ""
                # a sentiment-tagged word that is not a named entity starts
                # or extends the current run
                elif word_tuple[0] not in ner_set[review_index]:
                    if combine_or_not:
                        if combination_front:
                            combination_front += "_" + word_tuple[0]
                        else:
                            combination_front = word_tuple[0]
                    else:
                        combine_or_not = True
                        combination_front = word_tuple[0]
            if combine_or_not:
                if combination_front in self.pos_tagged_statistics:
                    self.pos_tagged_statistics[combination_front] += 1
                else:
                    self.pos_tagged_statistics[combination_front] = 1

    def stem(self, candidate_lexicon):
        """ perform stemming on candidate lexicon | candidate lexicon should be a list """

        stemmed_lexicon = []
        for word in candidate_lexicon:
            stemmed_word = self.stemmer.stem(word)
            stemmed_lexicon.append({
                "word": word,
                "stemmed_word": stemmed_word
            })
        stemmed_lexicon = sorted(stemmed_lexicon, key=lambda k: k['word'])

        if self.verbose:
            print "\nMerging stemmed duplicates"
        processed_lexicon = {}
        length = len(stemmed_lexicon)
        cnt = 0
        for word_dict in stemmed_lexicon:
            cnt += 1
            if word_dict["stemmed_word"] not in processed_lexicon:
                processed_lexicon[word_dict["stemmed_word"]] = [
                    word_dict["word"]
                ]
            else:
                processed_lexicon[word_dict["stemmed_word"]].append(
                    word_dict["word"])
            if self.verbose:
                sys.stdout.write("\rStatus: %s / %s" % (cnt, length))
                sys.stdout.flush()

        processed_lexicon = [{
            "stemmed_word": key,
            "word": value
        } for key, value in processed_lexicon.iteritems()]
        # sorting dictionaries by word
        processed_lexicon = sorted(processed_lexicon,
                                   key=lambda k: k["stemmed_word"])

        return processed_lexicon

    def load_ner_tags(self, filename):
        """ load named entity for files """

        filename = re.search(
            "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).json",
            filename).group(1)
        ner_set = []
        with open(os.path.join(self.dst_ne, filename + ".txt"), "rb") as ne_f:
            tags = csv.reader(ne_f, delimiter=",")
            for tag in tags:
                if tag[1] == "FILEINDEX":
                    ner_set.append(set())
                else:
                    ner_set[-1].add(tag[0].lower())
        return ner_set

    def render_candidate_lexicon(self):
        """ render the candidate words """

        # filtered by self.frequency_threshold
        if self.verbose:
            print "Filtering out frequency lower than frequency_threshold" + "\n" + "-" * 80

        self.create_dir(self.dst)

        pos_tagged_words = []
        pos_tagged_words_under_thre = []
        for key in self.pos_tagged_statistics:
            if self.pos_tagged_statistics[key] > self.frequency_threshold:
                pos_tagged_words.append(key)
            else:
                pos_tagged_words_under_thre.append(key)

        if self.verbose:
            print "Stemming candidate words"
        pos_tagged_words = self.stem(pos_tagged_words)
        pos_tagged_words_under_thre = self.stem(pos_tagged_words_under_thre)

        ordered_dict_list = [[], []]
        if self.verbose:
            print "\nOrganizing candidate words"
        length = len(pos_tagged_words)
        for index in range(len(pos_tagged_words)):
            ordered_dict = OrderedDict()
            ordered_dict["index"] = index + 1
            ordered_dict["count"] = sum([
                self.pos_tagged_statistics[w]
                for w in pos_tagged_words[index]["word"]
            ])
            ordered_dict["stemmed_word"] = pos_tagged_words[index][
                "stemmed_word"]
            ordered_dict["word"] = pos_tagged_words[index]["word"]
            ordered_dict_list[0].append(NoIndent(ordered_dict))

            if self.verbose:
                sys.stdout.write("\rStatus: %s / %s" % (index + 1, length))
                sys.stdout.flush()

        if self.verbose:
            print "\nOrganizing candidate words <= frequency threshold"
        length = len(pos_tagged_words_under_thre)
        for index in range(len(pos_tagged_words_under_thre)):
            ordered_dict = OrderedDict()
            ordered_dict["index"] = index + 1
            ordered_dict["count"] = sum([
                self.pos_tagged_statistics[w]
                for w in pos_tagged_words_under_thre[index]["word"]
            ])
            ordered_dict["stemmed_word"] = pos_tagged_words_under_thre[index][
                "stemmed_word"]
            ordered_dict["word"] = pos_tagged_words_under_thre[index]["word"]
            ordered_dict_list[1].append(NoIndent(ordered_dict))

            if self.verbose:
                sys.stdout.write("\rStatus: %s / %s" % (index + 1, length))
                sys.stdout.flush()

        if self.verbose:
            print "\n" + "-" * 80
            print "Saving data to: \033[1m" + self.dst + "\033[0m"
        with open(self.dst, "w+") as f_out:
            f_out.write(
                json.dumps(ordered_dict_list, indent=4, cls=NoIndentEncoder))

    def create_dir(self, new_path):
        """ create the directory if not exist"""

        dir1 = os.path.dirname(new_path)
        if not os.path.exists(dir1):
            if self.verbose:
                print "Creating directory: " + dir1
                print "-" * 80
            os.makedirs(dir1)

    def run(self):
        print "Selecting candidate words" + "\n" + "-" * 80

        self.stanford_ner()
        self.get_sentiment_words()
        self.render_candidate_lexicon()

    def PrintException(self):
        exc_type, exc_obj, tb = sys.exc_info()
        f = tb.tb_frame
        lineno = tb.tb_lineno
        filename = f.f_code.co_filename
        linecache.checkcache(filename)
        line = linecache.getline(filename, lineno, f.f_globals)
        print '    Exception in ({}, LINE {} "{}"): {}'.format(
            filename, lineno, line.strip(), exc_obj)
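
A usage sketch for SelectCandidates, assuming the Python 2 environment the class targets, a data folder laid out as the constructor expects (reviews under ../data/reviews/), and local copies of Stanford CoreNLP and Stanford NER at the illustrative paths below:

selector = SelectCandidates(src_folder="../data/",
                            freq_thre=100,
                            corenlp_path="../stanford-corenlp/",
                            ner_path="../stanford-ner/",
                            verbose=True)
selector.run()  # candidates are written to ../data/lexicon/candidates.json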