Example #1
def run_classifier_on_file(fname_in, fname_out=None):
    classifier = load_classifier('all')
    lemmatizer = WordNetLemmatizer()
    text = codecs.open(fname_in).read()
    if fname_out is None:
        fh_out = sys.stdout
    else:
        fh_out = codecs.open(fname_out, 'w')
    sentences = nltk.sent_tokenize(text)
    parser = CoreNLPDependencyParser(url='http://localhost:9000')
    for sentence in sentences:
        parses = parser.parse(nltk.word_tokenize(sentence))
        for parse in parses:
            for (gov, gov_pos), rel, (dep, dep_pos) in parse.triples():
                if dep_pos in ('NN', 'NNS'):
                    lemma = lemmatizer.lemmatize(dep)
                    features = {
                        'pos': dep_pos,
                        'rel': rel,
                        'lemma': lemma,
                        'surface': dep,
                        'dom_token': gov,
                        'int_dom_token': gov
                    }
                    label = classifier.classify(features)
                    fh_out.write("%s\t%s\n" % (lemma, label))
                    print(lemma, label)
        print('')
Example #2
class CNLP:
    CNLPServerURL = 'http://localhost:9000'

    def __init__(self):
        self.parser = CoreNLPParser(url=self.CNLPServerURL)
        self.dep_parser = CoreNLPDependencyParser(url=self.CNLPServerURL)
        self.ner_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='ner')
        self.pos_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='pos')

    def getParse(self, sentence):
        if (type(sentence) == list):
            return self.parser.parse(sentence)
        else:
            return self.parser.raw_parse(sentence)

    def getDepParse(self, sentence):
        if (type(sentence) == list):
            return self.dep_parser.parse(sentence)
        else:
            return self.dep_parser.raw_parse(sentence)

    def getNERTags(self, sentence):
        if (type(sentence) != list):
            sentence = sentence.split()
        return self.ner_tagger.tag(sentence)

    def getPOSTags(self, sentence):
        if (type(sentence) != list):
            sentence = sentence.split()
        return self.pos_tagger.tag(sentence)
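
A short usage sketch for the wrapper above (not part of the original example), assuming a CoreNLP server is already running at CNLPServerURL; the sample sentences are the ones used elsewhere on this page:

cnlp = CNLP()
# constituency and dependency parses of a pre-tokenized sentence
print(list(cnlp.getParse('What is the airspeed of an unladen swallow ?'.split())))
print(list(cnlp.getDepParse('What is the airspeed of an unladen swallow ?'.split())))
# POS and NER tags from raw strings (the wrapper splits them internally)
print(cnlp.getPOSTags('What is the airspeed of an unladen swallow ?'))
print(cnlp.getNERTags('Rami Eid is studying at Stony Brook University in NY'))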
Example #3
    def dependency_overlap(self, config, sentence1, sentence2):
        """
        Computes the Jaccard similarity between the sets of extracted dependencies between words in each sentence.
        It uses the sentence in the form (word1, relation, word2) where the words have been lemmatized.
        """
        content = config['content'] if 'content' in config else False

        dep = CoreNLPDependencyParser('http://localhost:9000')
        triples1 = list(list(dep.parse(sentence1[2]))[0].triples())
        triples2 = list(list(dep.parse(sentence2[2]))[0].triples())
        lm = WordNetLemmatizer()
        if content:
            triples1 = [(src, rel, dest) for src, rel, dest in triples1
                        if src[1][0] in ['V', 'N', 'R', 'J']
                        and dest[1][0] in ['V', 'N', 'R', 'J']]
            triples2 = [(src, rel, dest) for src, rel, dest in triples2
                        if src[1][0] in ['V', 'N', 'R', 'J']
                        and dest[1][0] in ['V', 'N', 'R', 'J']]

        triples1 = [(lm.lemmatize(src[0]), rel, lm.lemmatize(dest[0]))
                    for src, rel, dest in triples1]
        triples2 = [(lm.lemmatize(src[0]), rel, lm.lemmatize(dest[0]))
                    for src, rel, dest in triples2]

        set1 = set(triples1)
        set2 = set(triples2)
        intersection = set1.intersection(set2)
        sizei = len(intersection)
        size1, size2 = len(set1), len(set2)
        try:
            return 2 * (1 / (size1 / sizei + size2 / sizei))
        except ZeroDivisionError:
            return 0
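
For reference, the return expression above is the Dice coefficient 2*|A ∩ B| / (|A| + |B|) rather than the Jaccard index. A tiny self-contained check on illustrative toy triples (not part of the original example):

set1 = {('dog', 'nsubj', 'run'), ('run', 'advmod', 'fast')}
set2 = {('dog', 'nsubj', 'run'), ('cat', 'nsubj', 'sleep')}
sizei = len(set1 & set2)                 # 1 shared triple
size1, size2 = len(set1), len(set2)      # 2 and 2
print(2 * (1 / (size1 / sizei + size2 / sizei)))  # 0.5 == 2*1/(2+2)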
Example #4
def preprocessing(input_file):
    tagger = CoreNLPParser(url='http://localhost:9000')
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    results = []
    with open(input_file, 'r') as f:
        i = 0
        for line in f:
            print("line " + str(i))
            print(len(line))
            if not line or line == "\n":
                continue
            tagger.parser_annotator = 'tokenize,ssplit,pos,lemma,ner,depparse,coref'
            output = tagger.api_call(line)
            resolve(output)
            output = print_resolved(output)
            tokens = sent_tokenize(output)
            for token in tokens:
                parses = dep_parser.parse(token.split())  # parse this sentence's tokens, not the whole sentence list
                print(parses)
                results.append(parses)
            i += 1
Example #5
def run_classifier_on_string(classifier, lemmatizer, text, fname_out):

    fh_out = codecs.open(fname_out, 'w')
    sentences = nltk.sent_tokenize(text)
    parser = CoreNLPDependencyParser(url='http://localhost:9000')
    for sentence in sentences:
        parses = parser.parse(nltk.word_tokenize(sentence))
        for parse in parses:
            for (gov, gov_pos), rel, (dep, dep_pos) in parse.triples():
                if dep_pos in ('NN', 'NNS'):
                    lemma = lemmatizer.lemmatize(dep)
                    features = {
                        'pos': dep_pos,
                        'rel': rel,
                        'lemma': lemma,
                        'surface': dep,
                        'dom_token': gov,
                        'int_dom_token': gov
                    }
                    label = classifier.classify(features)
                    fh_out.write("%s\t%s\n" % (lemma, label))
Example #6
            trees = ne_chunk(sent)
            for tree in trees:
                if hasattr(tree, 'label'):
                    if tree.label() in labels:
                        entities.append(' '.join(
                            [child[0].lower() for child in tree]))
    return entities


Example #7
# To run this you have to connect to the API:
# go to the directory stanford-corenlp-full-2018-02-27 and
# type the two lines below in the terminal as one line:
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
# -preload tokenize,ssplit,pos,lemma,ner,parse,depparse -status_port 9000 -port 9000 -timeout 15000 &
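
# Optional sanity check (not part of the original snippet): confirm the server
# is reachable before issuing requests, mirroring the connection test used in
# Example #15 below.
import requests
try:
    requests.get('http://localhost:9000')
except requests.exceptions.ConnectionError:
    raise RuntimeError('CoreNLP server is not running; start it with the java command above.')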

from nltk.parse import CoreNLPParser
parser = CoreNLPParser(url='http://localhost:9000')
list(parser.parse(doc))  # doc is a pre-tokenized sentence (a list of tokens)
list(parser.raw_parse(doc))  # doc is a raw, untokenized string

# on tokenized list of words
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
list(pos_tagger.tag(doc))

ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
list(ner_tagger.tag(doc))

from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
list(dep_parser.parse(doc))
print(
    list(parser.parse('What is the airspeed of an unladen swallow ?'.split())))
print(
    "\nExpected: [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]\n"
)

# Parse raw string.
print(list(parser.raw_parse('What is the airspeed of an unladen swallow ?')))
print(
    "\nExpected: [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]\n"
)

# Neural Dependency Parser
from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse(
    'What is the airspeed of an unladen swallow ?'.split())
print([[(governor, dep, dependent)
        for governor, dep, dependent in parse.triples()] for parse in parses])
print(
    "\nExpected: [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]\n"
)

# Tokenizer
parser = CoreNLPParser(url='http://localhost:9000')
print(list(parser.tokenize('What is the airspeed of an unladen swallow?')))
print(
    "\nExpected: ['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?']\n"
)

# POS Tagger
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
Example #8
def read_data():
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_sentences = []
    sentences = []
    for _, _, file in os.walk("../../data/parsing_corpus"):
        for filename in file:
            with open("../../data/parsing_corpus/" + filename, "r") as f:
                contents = f.read()
                contents = contents.split("\n")
                for i in range(len(contents)):
                    temp_tokenized_sentence = tokenizer.tokenize(contents[i])
                    if (len(temp_tokenized_sentence) <= 50):
                        tokenized_sentences.append(temp_tokenized_sentence)
                        sentences.append(contents[i])
    return tokenized_sentences, sentences


tokenized_sentences, sentences = read_data()
dependency_parsed = []
with open("./dependencies.txt", "w") as f:
    for i in range(len(tokenized_sentences)):
        if (tokenized_sentences[i]):
            f.write(sentences[i] + "\n")
            parses = dep_parser.parse(tokenized_sentences[i])
            for parse in parses:
                for governor, dep, dependent in parse.triples():
                    f.write("(" + governor[0] + "," + dependent[1] + ") " +
                            dep + " " + "(" + dependent[0] + "," +
                            dependent[1] + ") " + "\n")
            f.write("\n")
Example #9
from nltk.parse import CoreNLPParser
from nltk import sent_tokenize
# parser = CoreNLPParser(url='http://localhost:9000')

# print(list(parser.parse('Jack is a boy . He is handsome .'.split())))

# print(list(parser.raw_parse('Jack is a boy . He is handsome .')))

from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

print('I am your dad , he is also your dad .'.split())
parses = dep_parser.parse('I am your dad , he is also your dad .'.split())

print([[(governor, dep, dependent)
        for governor, dep, dependent in parse.triples()] for parse in parses])

# parser = CoreNLPParser(url='http://localhost:9000')

# print(list(parser.tokenize('What is the airspeed of an unladen swallow?')))

# pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')

# print(list(pos_tagger.tag('What is the airspeed of an unladen swallow ?'.split())))

# ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')

# print(list(ner_tagger.tag(('Rami Eid is studying at Stony Brook University in NY'.split()))))

# tagger = CoreNLPParser(url='http://localhost:9000')
Example #10
    for line in file_writer:
        sentences.append(line.split("\n")[0])

all_words = []
all_relations = []
ferr = open(jsonfile_name + '.err', mode='w')
with open(jsonfile_name, mode='w') as fjson:
    for sent in tqdm(sentences):
        each_json = {}
        each_json['sent'] = sent

        # print('----------------------')
        # sttime = timer()

        t = list(
            parser.parse(sent.split(),
                         properties={'tokenize.whitespace': 'true'}))

        # print('time for parser.parse(): {} seconds'.format(timer() - sttime))

        dot = t[0].to_dot()
        original_item = dot.split('\n')[4:-1]
        split_item = []
        for each in original_item:
            s = each.split(" ")
            split_item.append(s)

        # sttime = timer()

        three_item = []
        four_item = []
        for each in split_item:
Example #11
class DependencyParser():
    def __init__(self):
        self.cdp = CoreNLPDependencyParser()

    def find_entity_index(self, tree_list, e):
        for i, d in enumerate(tree_list):
            if d[0] == e: return i

        return -1

    def find_root_path(self, tree_list, e):
        path = []
        i = self.find_entity_index(tree_list, e)
        while True:
            ent, tag, parent, arrow = tree_list[i]
            path += [[ent, tag, arrow]]
            i = int(parent)
            if arrow == 'ROOT': break

        return path

    def _merge_path(self, p1, p2):
        rp1, rp2 = p1[::-1], p2[::-1]
        max_len = min(len(p1), len(p2))

        path = []
        for i in range(max_len):
            if rp1[i] == rp2[i]:
                m1 = len(p1) - i - 1
                m2 = i
            else:
                m1 = len(p1) - i
                m2 = i - 1
                break

        path = p1[:m1] + rp2[m2:]
        return path, m1

    def fix_transition_and_direction(self, path, mp):
        path[mp][2] = 'end'
        # shift the transition tags forward from the merge point
        for i in range(mp, len(path) - 1):
            path[i][2] = path[i + 1][2]
            path[i + 1][2] = 'end'

        # strip relation subtypes (':') and mark relations before the merge
        # point as inverted
        for i in range(len(path)):
            path[i][2] = path[i][2].split(':')[0]
            if i < mp: path[i][2] += '-inv'

        return path

    def merge_path(self, p1, p2):
        path, mp = self._merge_path(p1, p2)

        if len(path) == 0 or mp < 0:
            print("Can't merge two path")

        else:
            path = self.fix_transition_and_direction(path, mp)

        return path

    def find_dependency_tree(self, sentence):
        tree, = self.cdp.parse(sentence.split())  # CoreNLPDependencyParser
        tree_list = [
            ('', 'ROOT', '0', 'ROOT')
        ] + [tuple(r.split('\t')) for r in tree.to_conll(4).split('\n')][:-1]

        return tree_list, tree

    def find_dependency_path(self, tree_list, entities):
        e1, e2 = entities
        p1 = self.find_root_path(tree_list, e1)
        p2 = self.find_root_path(tree_list, e2)

        return self.merge_path(p1, p2)

    def split_path_content(self, path, pos_tags, dep_tags):
        path_sent, pos_seq, dep_seq = '', [], []

        for node in path:
            word, pt, dt = node
            path_sent += word + ' '
            if pt not in pos_tags: pos_tags += [pt]
            if dt not in dep_tags: dep_tags += [dt]
            pos_seq += [pos_tags.index(pt)]
            dep_seq += [dep_tags.index(dt)]

        return path_sent, pos_seq, dep_seq, pos_tags, dep_tags
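
A minimal usage sketch for the class above (not part of the original example); the sentence is illustrative, and it assumes CoreNLPDependencyParser has been imported as in the other examples and that the CoreNLP server is running on the default localhost:9000:

dp = DependencyParser()
tree_list, tree = dp.find_dependency_tree('Obama is the President .')
print(tree_list)  # (word, POS, head index, relation) tuples, with a ROOT placeholder at index 0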
Example #12
class KitchenTools():
    def __init__(self, tool, file):

        self.t_name = tool
        self.t_synset = wn.synset(self.t_name)
        self.hypernyms = []
        self.level = 0

        self.file = file

        self.parser = CoreNLPParser(url='http://localhost:9000')
        self.dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

        self.telic_roles = []
        self.first_words = []

    def parse_def(self):

        definition = self.t_synset.definition()
        self.file.write(self.t_name + ': ')
        self.file.write(definition + '\n')
        parses = self.dep_parser.parse(word_tokenize(definition))
        deps = [[(gov, rel, dep) for gov, rel, dep in parse.triples()]
                for parse in parses]
        deps = deps[0]

        for d in deps:
            self.file.write(str(d) + '\n')

        self.find_first_words(deps)
        if len(self.first_words) != 0:
            for first_word in self.first_words:
                self.telic_roles.extend(
                    self.recursive_depth_search(deps, first_word))
        self.file.write(' '.join(self.telic_roles) + '\n')

        self.file.write('\n')

    def find_first_words(self, deps):
        '''for d in deps:
            if d[2][0] == 'for' and (d[1] == 'mark' or d[1] == 'case'):
                self.first_words.append(d[0][0])
        if len(self.first_words) == 0:'''
        for d in deps:
            if d[0][0] == 'used' and (d[1] == 'xcomp' or d[1] == 'advcl'
                                      or d[1] == 'nmod'):
                self.first_words.append(d[2][0])

    def recursive_depth_search(self, deps, next_word):
        put_in_front = ['case', 'det', 'mark', 'amod', 'compound']
        left_que = queue.Queue()
        right_que = queue.Queue()
        left_str = list()
        right_str = list()
        for dep in deps:
            if dep[0][0] == next_word:
                if dep[1] in put_in_front:
                    left_que.put(dep[2][0])
                else:
                    right_que.put(dep[2][0])
        while left_que.empty() is False:
            d = left_que.get()
            left_part = self.recursive_depth_search(deps, d)
            if left_part is not None:
                left_str.extend(left_part)
        while right_que.empty() is False:
            d = right_que.get()
            right_part = self.recursive_depth_search(deps, d)
            if right_part is not None:
                right_str.extend(right_part)
        left_str.append(next_word)
        if len(right_str) != 0:
            left_str.extend(right_str)
        return left_str
Example #13
from nltk.parse import CoreNLPParser

# Lexical Parser
parser = CoreNLPParser(url='http://localhost:9000')

list(parser.parse('What is the airspeed of an unladen swallow ?'.split()))

# Question Generation/Answering - Neural Dependency
from nltk.parse.corenlp import CoreNLPDependencyParser

str1 = 'Obama is the President .'

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse(str1.split())

#[[(governor, dep, dependent) for governor, dep, dependent in parse.triples()] for parse in parses]

for parse in parses:
    for governor, dep, dependent in parse.triples():
        print(governor, dep, dependent)

        if (dep == 'nsubj') and (dependent[1] == 'NNP'):
            person = dependent[0]

list1 = str1.split()

num = list1.index(person)
list1[num] = 'Who'
str1_out = ' '.join(list1)
Example #14
        for sent_token in sent_tokens:

            tokens = [e1.lower() for e1 in word_tokenize(sent_token)]

            inSentToken = False
            for token in tokens:
                if aspect == token:
                    inSentToken = True

            # If the word isn't in the sentence chunk then don't consider it
            if not inSentToken:
                continue

            # Tokenize and then retrieve the first parse tree
            parses = parser.parse(tokens)
            for parse in parses:
                parse_triples = [
                    (governor, dep, dependent)
                    for governor, dep, dependent in parse.triples()
                ]

                # Check to see if it's the subject at least once
                for trip in parse_triples:
                    if trip[1] == "nsubj":
                        subj = trip[2][0]
                        pos = trip[2][1]
                        if subj == aspect and "NN" in pos:
                            true_aspect = True
                            break
                    elif trip[1] == "compound":
Example #15
class ExternalDataLoader:
    def __init__(self, config):
        self.ontology_tagging = OntologyTagging()
        self.config = config
        self.word_dictionary = self.compute_all_embeddings()
        self.server_url = 'http://localhost:9000'
        self.parser = CoreNLPParser(url=self.server_url)
        self.core_nlp_dependency_parser = CoreNLPDependencyParser(
            url=self.server_url)

    def load_external_data(self, load_external_file_name,
                           write_internal_file_name):

        if not os.path.isfile(load_external_file_name):
            raise ("[!] Data %s not found" % load_external_file_name)

        xml_tree = elementTree.parse(load_external_file_name)
        root = xml_tree.getroot()

        opinion_counter = 0
        total_counter = 0

        all_sentences = []

        for sentence in root.iter('sentence'):

            sentence_id = sentence.get('id')

            original_sentence = sentence.find('text').text

            tokenized_sentence = list(self.parser.tokenize(original_sentence))

            aspects = []
            aspect_indices = []
            polarities = []
            polarity_matrix = []
            categories = []
            category_matrix = []

            for opinions in sentence.iter('Opinions'):

                for opinion in opinions.findall('Opinion'):
                    total_counter += 1
                    aspect = opinion.get('target')
                    if aspect != "NULL":

                        opinion_counter += 1

                        aspects.append(aspect)
                        category = opinion.get('category')
                        polarity = opinion.get('polarity')

                        categories.append(category)
                        polarities.append(polarity)

                        tokenized_aspect = list(self.parser.tokenize(aspect))
                        aspect_indices.append(
                            self.get_aspect_indices(tokenized_aspect,
                                                    tokenized_sentence))
                        polarity_matrix.append(
                            self.get_polarity_number(polarity))
                        category_matrix.append(
                            self.get_category_number(category))

            if len(aspects) != 0:

                print("opinion_counter ", opinion_counter)

                sentiment_distribution = self.annotate(original_sentence,
                                                       properties={
                                                           "annotators":
                                                           "sentiment",
                                                           "outputFormat":
                                                           "json",
                                                       })

                processed_sentence = self.process_characters(
                    tokenized_sentence)

                lemmatized_sentence, part_of_speech_sentence, aspect_dependencies, sentence_negation, sentiments = \
                    self.lemmatize_and_pos_tagging(processed_sentence, aspect_indices)

                ontology_classes_sentence = self.ontology_tagging.ontology_classes_tagging(
                    lemmatized_sentence)

                mentions = self.ontology_tagging.mention_tagging(
                    ontology_classes_sentence)

                ont_sentiments_sentence, aspect_sentiments_sentence, sentiments_sentence, relations_sentence = \
                    self.ontology_tagging.polarity_and_aspect_relation_tagging(ontology_classes_sentence,
                                                                               aspect_indices, categories,
                                                                               aspect_dependencies, sentiments)

                word_embedding_sentence = self.compute_word_embeddings(
                    lemmatized_sentence)

                dict_sentence = {
                    'sentence_id': sentence_id,
                    'original_sentence': original_sentence,
                    'lemmatized_sentence': lemmatized_sentence,
                    'sentiment_distribution': sentiment_distribution,
                    'part_of_speech_tags': part_of_speech_sentence,
                    'negation_in_sentence': sentence_negation,
                    'word_polarities': ont_sentiments_sentence,
                    'aspect_sentiments': aspect_sentiments_sentence,
                    'word_sentiments': sentiments_sentence,
                    'word_mentions': mentions,
                    'aspect_relations': relations_sentence,
                    'aspects': aspects,
                    'aspect_indices': aspect_indices,
                    'polarities': polarities,
                    'polarity_matrix': polarity_matrix,
                    'categories': categories,
                    'category_matrix': category_matrix,
                    'word_embeddings': word_embedding_sentence
                }
                all_sentences.append(dict_sentence)

        with open(write_internal_file_name, 'w') as outfile:
            json.dump(all_sentences, outfile, ensure_ascii=False)

    def get_polarity_number(self, polarity):

        if polarity == "positive":
            return [1, 0, 0]
        elif polarity == "neutral":
            return [0, 1, 0]
        elif polarity == "negative":
            return [0, 0, 1]
        else:
            raise Exception("Polarity ", polarity, " is not in the sentence.")

    def get_category_number(self, category):

        if category == "AMBIENCE#GENERAL":
            return [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        elif category == "DRINKS#PRICES":
            return [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        elif category == "DRINKS#QUALITY":
            return [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        elif category == "DRINKS#STYLE_OPTIONS":
            return [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        elif category == "FOOD#GENERAL":
            return [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
        elif category == "FOOD#PRICES":
            return [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
        elif category == "FOOD#QUALITY":
            return [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
        elif category == "FOOD#STYLE_OPTIONS":
            return [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
        elif category == "LOCATION#GENERAL":
            return [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
        elif category == "RESTAURANT#GENERAL":
            return [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
        elif category == "RESTAURANT#MISCELLANEOUS":
            return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
        elif category == "RESTAURANT#PRICES":
            return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
        elif category == "SERVICE#GENERAL":
            return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
        else:
            raise Exception("Category ", category, " is not in the sentence.")

    @staticmethod
    def get_aspect_indices(aspect, sentence):

        number_words_in_aspect = len(aspect)
        number_words_in_sentence = len(sentence)

        for i in range(number_words_in_sentence):

            if aspect[0] == sentence[i]:
                return list(range(i, i + number_words_in_aspect))

        raise Exception("Aspect ", aspect, " is not in the sentence ",
                        sentence)

    def compute_all_embeddings(self):

        word_dictionary = {}

        with open(self.config.glove_embeddings, 'r', encoding="utf8") as f:
            for line in f:
                word_embedding = line.strip().split()
                word_dictionary[word_embedding[0]] = list(
                    map(float, word_embedding[1:]))

        return word_dictionary

    def compute_word_embeddings(self, sentence):

        number_words_in_sentence = len(sentence)
        word_embeddings = np.random.normal(0, 0.05,
                                           [number_words_in_sentence, 300])

        for word_index in range(number_words_in_sentence):

            if sentence[word_index] in self.word_dictionary:
                word_embeddings[word_index] = self.word_dictionary[
                    sentence[word_index]]

        return word_embeddings.tolist()

    @staticmethod
    def process_characters(sentence):

        number_words_in_sentence = len(sentence)
        processed_sentence = []

        punctuation_and_numbers = [
            '(', ')', '?', ':', ';', ',', '.', '!', '/', '"', '*', '$', '&',
            '%', '@', '#', '^', '!', '0', '1', '2', '3', '4', '5', '6', '7',
            '8', '9'
        ]
        alphabet = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ''
        ]
        punctuation_to_be_replaced = {'–': '-', '’': '\''}

        for word_index in range(number_words_in_sentence):

            list_of_word = list(sentence[word_index].lower())

            for char_index in range(len(list_of_word) - 1):

                if list_of_word[char_index] in punctuation_to_be_replaced:
                    list_of_word[char_index] = punctuation_to_be_replaced[
                        list_of_word[char_index]]

                if list_of_word[char_index] in alphabet and list_of_word[
                        char_index + 1] in punctuation_and_numbers:
                    list_of_word[char_index + 1] = ''
                elif list_of_word[
                        char_index] in punctuation_and_numbers and list_of_word[
                            char_index + 1] in alphabet:
                    list_of_word[char_index] = ''

            word = "".join(list_of_word)
            if word == '.' and sentence[word_index - 1] == '.':
                pass
            else:
                if word == '.......' or word == '....' or word == '.....' or word == '......' or word == '..':
                    word = '...'
                processed_sentence.append(word)
        return processed_sentence

    def lemmatize_and_pos_tagging(self, sentence, aspect_indices):

        punctuations = [
            '–', '(', ')', '?', ':', ';', ',', '.', '!', '/', '"', '’', '*',
            '$', '&', '%', '@', '#', '^', '!', '\'', '-'
        ]

        parses = self.core_nlp_dependency_parser.parse(sentence)
        dependencies = [[(governor, dep, dependent)
                         for governor, dep, dependent in parse.triples()]
                        for parse in parses][0]

        wordnet_lemmatizer = nltk.WordNetLemmatizer()
        part_of_speech_sentence = list(range(len(sentence)))
        lemmatized_sentence = list(range(len(sentence)))
        sentiments = list(range(len(sentence)))
        aspects_dependencies = [['no'] * len(sentence)
                                for i in range(len(aspect_indices))]

        backup_sentence = sentence.copy()
        interesting_translates = {
            '-LRB-': '(',
            '-RRB-': ')',
            '2\xa01/2': '2 1/2',
            "''": '"',
            ':-RRB-': ':)'
        }

        sentence_negations = []

        for dependency in dependencies:

            words = [dependency[0][0], dependency[2][0]]
            part_of_speech = [dependency[0][1], dependency[2][1]]

            if words[0] in interesting_translates:
                words[0] = interesting_translates[words[0]]
            if words[1] in interesting_translates:
                words[1] = interesting_translates[words[1]]

            range_list = [0, 1]
            if words[0] in sentence:
                index_of_word1 = sentence.index(words[0])
                sentence[index_of_word1] = ''
            else:
                index_of_word1 = backup_sentence.index(words[0])
                range_list = [1]

            if words[1] in sentence:
                index_of_word2 = sentence.index(words[1])
                sentence[index_of_word2] = ''
            else:
                index_of_word2 = backup_sentence.index(words[1])
                range_list = [0]

            word_indices = [index_of_word1, index_of_word2]

            if dependency[1] == 'neg':
                sentence_negations.append(word_indices)

            for aspect_index in range(len(aspect_indices)):

                if index_of_word1 in aspect_indices[aspect_index] and index_of_word2 not in \
                        aspect_indices[aspect_index]:
                    aspects_dependencies[aspect_index][
                        index_of_word2] = dependency[1]
                elif index_of_word1 not in aspect_indices[aspect_index] and index_of_word2 in \
                        aspect_indices[aspect_index]:
                    aspects_dependencies[aspect_index][
                        index_of_word1] = dependency[1]
                elif index_of_word1 in aspect_indices[
                        aspect_index] and index_of_word2 in aspect_indices[
                            aspect_index]:
                    if aspects_dependencies[aspect_index][
                            index_of_word1] == 'no':
                        aspects_dependencies[aspect_index][
                            index_of_word1] = dependency[1]
                    else:
                        aspects_dependencies[aspect_index][
                            index_of_word2] = dependency[1]

            for i in range_list:

                if part_of_speech[i].startswith('V'):  # Verb
                    part_of_speech_sentence[word_indices[i]] = [1, 0, 0, 0, 0]
                    word = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(word, wordnet.VERB)
                    sentiments[word_indices[i]] = self.get_sentiment_of_word(
                        word, lemma, wordnet.VERB)
                    lemmatized_sentence[word_indices[i]] = lemma.lower()
                elif part_of_speech[i].startswith('J'):  # Adjective
                    part_of_speech_sentence[word_indices[i]] = [0, 1, 0, 0, 0]
                    word = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(word, wordnet.ADJ)
                    sentiments[word_indices[i]] = self.get_sentiment_of_word(
                        word, lemma, wordnet.ADJ)
                    lemmatized_sentence[word_indices[i]] = lemma.lower()
                elif part_of_speech[i].startswith('R'):  # Adverb
                    part_of_speech_sentence[word_indices[i]] = [0, 0, 1, 0, 0]
                    word = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(word, wordnet.ADV)
                    sentiments[word_indices[i]] = self.get_sentiment_of_word(
                        word, lemma, wordnet.ADV)
                    lemmatized_sentence[word_indices[i]] = lemma.lower()
                elif part_of_speech[i].startswith('N'):  # Noun
                    part_of_speech_sentence[word_indices[i]] = [0, 0, 0, 1, 0]
                    word = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(word, wordnet.NOUN)
                    sentiments[word_indices[i]] = self.get_sentiment_of_word(
                        word, lemma, wordnet.NOUN)
                    lemmatized_sentence[word_indices[i]] = lemma.lower()
                else:  # Otherwise
                    part_of_speech_sentence[word_indices[i]] = [0, 0, 0, 0, 1]
                    if words[i] not in punctuations:
                        words[i] = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(words[i])
                    sentiments[word_indices[i]] = [0, 0, 1]
                    lemmatized_sentence[word_indices[i]] = lemma.lower()

        return lemmatized_sentence, part_of_speech_sentence, aspects_dependencies, sentence_negations, sentiments

    @staticmethod
    def get_sentiment_of_word(word, lemma, pos):

        synsets = wordnet.synsets(word, pos=pos)

        if len(synsets) != 0:

            memorized_synset_01 = None
            check_boolean_01 = False

            memorized_synset_rest = None
            check_boolean_rest = False

            list_of_numbers = [
                '04', '02', '03', '05', '06', '07', '08', '09', '10', '11',
                '12'
            ]

            for synset in synsets:
                synset_split = synset.name().split(".")
                if synset_split[0] == lemma:
                    swn_synset = sentiwordnet.senti_synset(synset.name())
                    pos_score = swn_synset.pos_score()
                    neg_score = swn_synset.neg_score()

                    if pos_score > neg_score:
                        return [1, 0, 0]
                    elif neg_score > pos_score:
                        return [0, 1, 0]
                    else:
                        return [0, 0, 1]
                if synset_split[2] == '01' and not check_boolean_01:
                    memorized_synset_01 = synset
                    check_boolean_01 = True
                elif synset_split[
                        2] in list_of_numbers and not check_boolean_rest:
                    memorized_synset_rest = synset
                    check_boolean_rest = True
            if check_boolean_01:
                synset = memorized_synset_01
            else:
                synset = memorized_synset_rest

            swn_synset = sentiwordnet.senti_synset(synset.name())
            pos_score = swn_synset.pos_score()
            neg_score = swn_synset.neg_score()

            if pos_score > neg_score:
                return [1, 0, 0]
            elif neg_score > pos_score:
                return [0, 1, 0]
            else:
                return [0, 0, 1]
        return [0, 0, 1]

    def annotate(self, text, properties=None):
        assert isinstance(text, str)
        if properties is None:
            properties = {}
        else:
            assert isinstance(properties, dict)

        # Checks that the Stanford CoreNLP server is started.
        try:
            requests.get(self.server_url)
        except requests.exceptions.ConnectionError:
            raise Exception(
                'Check whether you have started the CoreNLP server e.g.\n'
                '$ cd stanford-corenlp-full-2018-02-27/ \n'
                '$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer'
            )
        data = text.encode()
        r = requests.post(self.server_url,
                          params={'properties': str(properties)},
                          data=data,
                          headers={'Connection': 'close'})
        output = r.text

        char_index1 = output.index("sentimentDistribution")
        char_index2 = output.index("sentimentTree")
        distribution = output[(char_index1 - 1):(char_index2 - 2)]

        new_distribution = []
        word = []

        for char_index in range(len(distribution)):

            if distribution[char_index].isnumeric():
                word.append(distribution[char_index])
            elif distribution[char_index] == ',' and len(word) == 1:
                word.append('.')
            elif (distribution[char_index] == ','
                  or distribution[char_index] == ']') and len(word) != 1:
                number = float("".join(word))
                new_distribution.append(number)
                word = []

        return new_distribution
Example #16
# Parse tokenized text.
print("\nParse tokenized text")
# print(list(parser.parse('What is the airspeed of an unladen swallow ?'.split())))
print(list(parser.parse(sentence.split())))
# [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]

print("\nRaw string")
# Parse raw string.
print(list(parser.raw_parse(sentence)))
# [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]

# Neural Dependency Parser
print("\nNeural Dependency Parser")
from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse(sentence.split())
# [[(governor, dep, dependent) for governor, dep, dependent in parse.triples()] for parse in parses]
# [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]

# Tokenizer
parser = CoreNLPParser(url='http://localhost:9000')
print("\nTokenizer")
print(list(parser.tokenize(sentence)))
# ['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?']

# POS Tagger
print("\nPOS Tagger")
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
print(list(pos_tagger.tag(sentence.split())))
# [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]