Example no. 1
    def syntax_tree_parser(self):
        """
        get syntax tree
        :return: syntax tree
        """
        if self.syntax_tree is not None:
            return self.syntax_tree
        parser = CoreNLPParser(url='http://localhost:8999')
        self.syntax_tree = list(parser.parse(nltk.word_tokenize(self.text)))[0]
        return self.syntax_tree
def nltk_stuff(data):
    parser = CoreNLPParser(url='http://localhost:9000')
    num_nodes = 0
    num_s = 0
    height = 0
    subset = dict(random.sample(list(data.items()), 5000))  # sample() needs a sequence, not a dict view
    get_lexicon_stats(subset)
    print("\n")
    for _, value in subset.items():
        parsed = list(parser.parse(value.split()))
        height += parsed[0].height()
        curr_num_nodes, curr_num_s = count_nodes(parsed[0][0], 0, 0)
        num_nodes += curr_num_nodes
        num_s += curr_num_s
    print("Num clauses: {}".format(num_s))
    print("Num nodes: {}".format(num_nodes))
    print("Total height: {}".format(height))
    print("Avg nodes per tweet: {}".format(num_nodes / len(subset.items())))
    print("Avg height of tweet: {}".format(height / len(subset.items())))
Example no. 3
        if len(children) != len(temp.leaves()):
            for j in children:
                if j != ' '.join(tree.label().split()[1:]) and (j not in [
                        i[1] for i in deplist
                ]):
                    deplist.append([' '.join(tree.label().split()[1:]), j])
    return deplist


def sorting(elem):
    return elem[1].split()[2]


parser = CoreNLPParser(url='http://localhost:9000')
#Read the sentence and form the parse tree
parser_output_splitted = parser.parse(sentence.split())

for j in parser_output_splitted:
    temp = j

#Part 1
parser_output = traverse_tree(temp)
print(parser_output)

#Part 2
precedence_list = [
    'VP', 'VB', 'VBG', 'VBZ', r'V*', 'NP', 'NN', r'N*', 'ADVP', 'JJ', 'P',
    'RB', 'PRP$', 'PRP'
]
deptree = dep_tree(parser_output)
print(deptree)
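
# traverse_tree() and dep_tree() are defined elsewhere. Purely to illustrate how a
# precedence list like the one above can drive head selection, here is a
# hypothetical helper (not part of the original code); entries ending in '*' are
# treated as label prefixes:
def pick_head(children_labels, precedence=precedence_list):
    for pattern in precedence:
        prefix = pattern.rstrip('*')
        for label in children_labels:
            if label == pattern or (pattern.endswith('*') and label.startswith(prefix)):
                return label
    return None

# e.g. pick_head(['NP', 'VBZ', 'PP']) returns 'VBZ'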
Example no. 4
def read_data():
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_sentences = []
    sentences = []
    for _, _, file in os.walk("../../data/parsing_corpus"):
        for filename in file:
            with open("../../data/parsing_corpus/" + filename, "r") as f:
                contents = f.read()
                contents = contents.split("\n")
                for i in range(len(contents)):
                    temp_tokenized_sentence = tokenizer.tokenize(contents[i])
                    # skip empty lines so sentences stays aligned with tree_sentences below
                    if temp_tokenized_sentence and len(temp_tokenized_sentence) <= 50:
                        tokenized_sentences.append(temp_tokenized_sentence)
                        sentences.append(contents[i])
    return tokenized_sentences, sentences

tokenized_sentences, sentences = read_data()

parser = CoreNLPParser(url = 'http://localhost:9000')
tree_sentences = []

for i in range(len(tokenized_sentences)):
    if(tokenized_sentences[i]):
        a = list(parser.parse(tokenized_sentences[i]))
        tree_sentences.append(a)
with open('./CFG trees.txt', 'w') as filehandle:
    for i in range(len(tree_sentences)):
        filehandle.write(sentences[i])
        filehandle.write("\n")
        filehandle.write(str(tree_sentences[i]))
        filehandle.write("\n\n")
Example no. 5
                if f"{token['text']}-{token['tag']}" not in all_words:
                    all_words.add(f"{token['text']}-{token['tag']}")
                    # If this (word, tag) pair has not been seen yet, look up all its synsets
                    pos = None
                    synsets = wn.synsets(token["text"], pos=pos)
                    for syn in synsets:
                        for lem in syn.lemmas():
                            # Collect the lemma names (synonyms)
                            token_synonyms.append(lem.name())
                        for hyp in syn.hypernyms():
                            # Collect the hypernym synset names
                            token_hypernyms.append(hyp.name())
                    lookup_data.append((token['text'], token['tag'], token_synonyms, token_hypernyms, synsets))
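
# For reference, a small hedged illustration of the WordNet calls used in the
# fragment above (assumes the 'wordnet' corpus has been downloaded via nltk):
from nltk.corpus import wordnet as wn

for syn in wn.synsets('dog')[:2]:
    print(syn.name(),
          [lem.name() for lem in syn.lemmas()],     # lemma names = synonyms
          [hyp.name() for hyp in syn.hypernyms()])  # hypernym synsets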
                                  
from nltk.parse import CoreNLPParser
import io
import time

#
# You have to start the server first, using the
#    $ bash ./start_server.sh
# command
#
parser = CoreNLPParser(url='http://localhost:9069')

with open('wordnet_trees.txt', mode='a') as results_file:
    # Go through all pairs in the corpus
    for i, pair in enumerate(corpus):
        input_text = pair.text
        tokens_text = [word for word in nltk.word_tokenize(input_text) if word.isalnum()]
        for tree in parser.parse(tokens_text):
            tree.pretty_print(stream=results_file)
Example no. 6
def reorder_syntactic_tokenized_sentence_regex(logger=None,
                                               lst_sentence=None,
                                               use_stanford_parser=True,
                                               verbose=True):

    #### WARNING: str_structure must be wrapped between start_nest and end_nest so the tree has a root point (as a nested string)
    def parse_nested_structure_with_regex(str_structure,
                                          logger=None,
                                          verbose=True,
                                          start_nest='[',
                                          end_nest=']'):

        if logger is not None:
            logger.info(
                ' ******** input nested structure : {}'.format(str_structure))

        lst_structure = list(str_structure)

        ####
        ######## STEP 1: detect positions of nest start-symbols and end-symbols
        #### ...we get the positions of start nest and end nest as boolean lists...
        lst_bool_start_nest_pos = [
            True if lst_structure[i_chr] == start_nest else False
            for i_chr in range(len(lst_structure))
        ]
        lst_bool_end_nest_pos = [
            True if lst_structure[i_chr] == end_nest else False
            for i_chr in range(len(lst_structure))
        ]

        df_nested_structure = pd.DataFrame({
            'symbol_start': [],
            'symbol_end': [],
            'layer_x': [],
            'layer_depth': [],
            'father_layer_x': [],
            'father_layer_depth': [],
            'pos_char_start': [],
            'pos_char_end': [],
            'string_fragment': []
        })

        ####
        ######## STEP 2: detect nested substructures
        #lst_ctrl = lst_structure
        #ctrl = True
        while True:
            #### ...initial positions of root-tree...
            current_layer_x = 0
            current_layer_depth = -1
            #### ...initial positions on lists of detected parenthesis...
            current_start_pos = 0
            current_end_pos = 0

            #### ...iterate over the matched start parentheses...
            for i_start in range(len(lst_bool_start_nest_pos)):
                #print('*** character {0} - position {1}'.format(lst_structure[i_start], i_start))

                if lst_bool_start_nest_pos[i_start]:
                    #print('*** start parenthesis found: {}'.format(lst_structure[i_start]))
                    current_start_pos = i_start
                    current_layer_depth = current_layer_depth + 1

                    for i_end in range(i_start + 1,
                                       len(lst_bool_start_nest_pos)):
                        #print('*** searching end parenthesis found: {}'.format(lst_structure[i_end]))
                        if lst_bool_start_nest_pos[i_end]:
                            break
                        if lst_bool_end_nest_pos[i_end]:
                            current_end_pos = i_end
                            break

                    if current_end_pos > 0:
                        if logger is not None and verbose:
                            logger.info('**** nested parenthesis found!! ****')
                            logger.info(
                                'start pos: {0} - end pos: {1} - expression: {2}'
                                .format(
                                    current_start_pos, current_end_pos,
                                    ''.join(lst_structure[
                                        current_start_pos:current_end_pos +
                                        1])))
                            logger.info(
                                'layer_x: {0} - layer_depth: {1}'.format(
                                    current_layer_x, current_layer_depth))

                        #### ...define current_layer_x...
                        if len(df_nested_structure[
                                df_nested_structure['layer_depth'] ==
                                current_layer_depth]) > 0:
                            df_nested_x = df_nested_structure[
                                df_nested_structure['layer_depth'] ==
                                current_layer_depth]
                            #df_nested_x = df_nested_x.sort_values(by=['pos_char_start'])
                            current_layer_x = df_nested_x['layer_x'].max() + 1

                        #### ...add substructure found in output dataframe...
                        lst_row_to_append = [
                            start_nest, end_nest, current_layer_x,
                            current_layer_depth, 0, 0, current_start_pos,
                            current_end_pos, ''.join(lst_structure[
                                current_start_pos:current_end_pos + 1])
                        ]
                        row_to_append = pd.Series(
                            lst_row_to_append,
                            index=df_nested_structure.columns)
                        df_nested_structure = df_nested_structure.append(
                            row_to_append, ignore_index=True)

                        #### ...remove parenthesis found...
                        lst_bool_start_nest_pos[current_start_pos] = False
                        lst_bool_end_nest_pos[current_end_pos] = False
                        break

            #### ...once all parentheses have been consumed, finish the while loop...
            if True not in lst_bool_start_nest_pos or True not in lst_bool_end_nest_pos:
                if True not in lst_bool_start_nest_pos and True not in lst_bool_end_nest_pos:
                    #ctrl = False
                    break
                else:
                    if logger is not None:
                        logger.error(
                            ' - troubles with nested string: {0}'.format(
                                str_structure))
                    raise Exception

        ####
        ######## STEP 3: set father node/tree (setting its indexes layer_x and layer_depth)

        #### ...we define the substructures length; with this we can get which structure contains which...
        df_nested_structure['substring_length'] = df_nested_structure[
            'pos_char_end'] - df_nested_structure['pos_char_start']

        #### ...for each nested structure found, we get the shortest structure containing it...
        for i_nest, row in df_nested_structure.iterrows():
            pos_start = df_nested_structure.iloc[i_nest]['pos_char_start']
            pos_end = df_nested_structure.iloc[i_nest]['pos_char_end']

            df_nest_aux = df_nested_structure[
                (df_nested_structure['pos_char_start'] < pos_start)
                & (df_nested_structure['pos_char_end'] > pos_end)]
            df_nest_aux = df_nest_aux[df_nest_aux['substring_length'] ==
                                      df_nest_aux['substring_length'].min()]

            if len(df_nest_aux) > 0:
                df_nested_structure.loc[
                    i_nest, 'father_layer_x'] = df_nest_aux.iloc[0]['layer_x']
                df_nested_structure.loc[
                    i_nest,
                    'father_layer_depth'] = df_nest_aux.iloc[0]['layer_depth']

        df_nested_structure = df_nested_structure.drop('substring_length',
                                                       axis=1)

        return df_nested_structure

    try:
        #### ...to keep track of the words already used...
        lst_sentence_ctrl = list(lst_sentence)
        #### ...to store output...
        sentence_reordered = list(lst_sentence)

        parser = None
        if use_stanford_parser:
            parser = CoreNLPParser(url='http://localhost:9000')
        else:
            if logger is not None:
                logger.warning('the Stanford CoreNLP server must be up and running')

        ####
        ####
        lst_root_trees = []
        for tree in parser.parse(sentence_reordered):
            lst_root_trees.append(tree)
        #### ...if we get a list of trees, we pick the first one...
        root_tree = lst_root_trees[0]
        print(root_tree)

        #### ...we apply parsing...
        p = list(parser.parse(sentence_reordered))
        #### ...converted to a string so it can be processed with regex...
        str_tree = str(p)
        df_tree_structure = parse_nested_structure_with_regex(str_tree,
                                                              logger=logger,
                                                              verbose=True)

        print(df_tree_structure)

        if logger is not None:
            logger.info('reordered sentence: {}'.format(sentence_reordered))

        #### ...warn if, due to some problem, len(input sentence) != len(output sentence)
        if logger is not None and len(lst_sentence) != len(sentence_reordered):
            logger.warning('input and output sentences have different lengths')

    except Exception as e:
        if logger is not None:
            logger.exception(
                'ERROR reordering phrase: {}'.format(lst_sentence))
        raise e

    return sentence_reordered
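
# The regex/DataFrame walk above rebuilds the nesting of str(tree) by hand. As a
# hedged alternative (illustrative, not a drop-in replacement for the DataFrame
# output), NLTK's Tree API exposes the same structure directly via treepositions():
from nltk.tree import Tree

def tree_nesting_table(root_tree):
    # For every subtree, record its position, parent position, depth, label and
    # the text fragment it spans, without any string parsing.
    rows = []
    for pos in root_tree.treepositions():
        node = root_tree[pos]
        if not isinstance(node, str):  # skip leaf tokens
            rows.append({
                'position': pos,
                'parent_position': pos[:-1] if pos else None,
                'depth': len(pos),
                'label': node.label(),
                'fragment': ' '.join(node.leaves()),
            })
    return rows

# Usage sketch (assumes a CoreNLP server on port 9000):
# root = next(CoreNLPParser(url='http://localhost:9000').parse('the old man sat alone'.split()))
# for row in tree_nesting_table(root):
#     print(row)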
Example no. 7
from nltk.parse import CoreNLPParser
from nltk.corpus import treebank
from printModule import *

sentence1 = "Patient with HGM value greater than 55 g/L"
sentence = "When did princes Diana die?"

# Lexical Parser
parser = CoreNLPParser(url='http://localhost:9000')

# Parse tokenized text.
print("\nParse tokenized text")
# print(list(parser.parse('What is the airspeed of an unladen swallow ?'.split())))
print(list(parser.parse(sentence.split())))
# [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]

print("\nRaw string")
# Parse raw string.
print(list(parser.raw_parse(sentence)))
# [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]

# Neural Dependency Parser
print("\nNeural Dependency Parser")
from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse(sentence.split())
# [[(governor, dep, dependent) for governor, dep, dependent in parse.triples()] for parse in parses]
# [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]

# Tokenizer
parser = CoreNLPParser(url='http://localhost:9000')
Example no. 8
from nltk.parse import CoreNLPParser
from nltk.tree import *

parser = CoreNLPParser(url='http://localhost:9000')
ls = list(
    parser.parse("Alfonso XIII of Spain's birth place is Madrid.".split()))
ls = ParentedTree.fromstring(str(ls[0]))


def getSubject(t):
    for s in t.subtrees(lambda t: t.label() == 'NP'):
        for n in s.subtrees(lambda n: n.label().startswith('NN')):
            return (n[0], getAttributes(n))


def getPredicate(t):
    v = None

    for s in t.subtrees(lambda t: t.label() == 'VP'):
        for n in s.subtrees(lambda n: n.label().startswith('VB')):
            v = n
            return (v[0], getAttributes(v))


def getObject(t):
    for s in t.subtrees(lambda t: t.label() == 'VP'):
        for n in s.subtrees(lambda n: n.label() in ['NP', 'PP', 'ADJP']):
            if n.label() in ['NP', 'PP']:
                for c in n.subtrees(lambda c: c.label().startswith('NN')):
                    return (c[0], getAttributes(c))
            else:
Example no. 9
from nltk.parse import CoreNLPParser
from nltk.tree import Tree

parser = CoreNLPParser(url='http://localhost:9000')
f = open("../Fragments_for_testing/text2", "r")
sentence = f.read()
list_tree = str(list(parser.parse(sentence.split())))
list_tree = list_tree.replace('Tree','')
list_tree = list_tree.replace('\'','')
list_tree = list_tree.replace(',','')
list_tree = list_tree.replace('[','')
list_tree = list_tree.replace(']','')
list_tree = list_tree.replace('(. .)','')
list_tree = list_tree.replace('(. !)','')
list_tree = list_tree.replace('ROOT','S1')
tree = Tree.fromstring(list_tree)
tree.draw()
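
# A hedged alternative to the string-replace chain above: do the same clean-up on
# the Tree object itself (sketch only; assumes the same parser and sentence):
from nltk.tree import Tree

def strip_punct(tree):
    # Recursively rebuild the tree without punctuation ('.') subtrees.
    if not isinstance(tree, Tree):
        return tree
    kept = [strip_punct(child) for child in tree
            if not (isinstance(child, Tree) and child.label() == '.')]
    return Tree(tree.label(), kept)

# cleaned = strip_punct(next(parser.parse(sentence.split())))
# cleaned.set_label('S1')   # same ROOT -> S1 relabelling as above
# cleaned.draw()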
Example no. 10
from nltk import word_tokenize, sent_tokenize, bigrams, trigrams
from nltk.tag import StanfordPOSTagger
from nltk.parse import CoreNLPParser
from docx import Document
#cd Documents\Repos\investigacion\source\


# Initial configuration of the Stanford POS tagger (check that the paths are correct, otherwise it will not work)
tagger = r'C:\Users\lau_9\Documents\Repos\investigacion\stanford\stanford-postagger\models\spanish.tagger'
parser = r'C:\Users\lau_9\Documents\Repos\investigacion\stanford\stanford-corenlp\spanishPCFG.ser.gz'
jar1 = r'C:\Users\lau_9\Documents\Repos\investigacion\stanford\stanford-postagger\stanford-postagger.jar'
jar2 = r'C:\Users\lau_9\Documents\Repos\investigacion\stanford\stanford-corenlp\stanford-corenlp-3.9.1.jar'
etiquetador = StanfordPOSTagger(tagger,jar1)
parseador = CoreNLPParser(url='http://localhost:9000')
print(list(parseador.parse('El viejo hombre se sentó solo sobre la montaña a observar el horizonte'.split())))
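
# The StanfordPOSTagger configured above (etiquetador) is never used in this
# snippet; a hedged usage line, assuming the model/jar paths are valid and Java
# is on the PATH:
print(etiquetador.tag('El viejo hombre se sentó solo sobre la montaña'.split()))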
Example no. 11
import nltk
from nltk.parse import CoreNLPParser

# Creates object for CoreNLPParser
parser = CoreNLPParser(url='http://localhost:9000')

# Reading sentences from input file
file = open('../data/input.txt', 'r')
sentences = [line.strip().split() for line in file.readlines()]
file.close()

# Creating output file to write result
with open('../output.txt', 'w') as out:
    for sent in sentences:
        parsetree = list(parser.parse(sent))
        out.write(" ".join(sent) + "\n\n")
        out.write(str(parsetree[0]))
        out.write(
            "\n\n=================================================================================\n\n"
        )
Example no. 12
def reorder_syntactic_tokenized_sentence(logger=None,
                                         lst_sentence=None,
                                         use_stanford_parser=True,
                                         verbose=True):

    try:

        #### ...to keep track of the words already used...
        lst_sentence_ctrl = list(lst_sentence)
        #### ...to store output...
        sentence_reordered = list(lst_sentence)

        parser = None
        if use_stanford_parser:
            parser = CoreNLPParser(url='http://localhost:9000')
        else:
            if logger is not None:
                logger.warning('the Stanford CoreNLP server must be up and running')

        #### ...we apply parsing...
        #p = list(parser.parse(sentence_reordered))
        #### ...just in case.... as string to treat with regex...
        #str_tree = str(p)
        #print(str_tree)

        #### ...we get the first syntactic root tree (just in case we have several trees)...
        lst_root_trees = []
        for tree in parser.parse(sentence_reordered):
            lst_root_trees.append(tree)
            #print(tree[0])
            #print(len(tree[0]))
            #print(tree[0,0])

        #### ...if we get a list of trees, we pick the first one...
        root_tree = lst_root_trees[0]

        #### ...we get all subtrees and load in a list...
        lst_subtrees = []
        for subtree in root_tree.subtrees():
            lst_subtrees.append(subtree)

        #### ...selecting leaves roughly from bottom to top...
        #### ...subtrees are traversed and loaded from top to bottom, so we reverse the list...
        lst_subtrees.reverse()
        sentence_reordered.clear()

        #### ...in the while loop we take the leaves of each subtree and remove them from lst_sentence_ctrl (input)
        #### ...the loop stops when all subtrees have been visited or all words of the original phrase have been used
        if logger is not None:
            logger.info(
                'syntactic reorder in input phrase: {}'.format(lst_sentence))

        i = 0
        while i < len(lst_subtrees):
            subtree = lst_subtrees[i]

            leaves = subtree.leaves()
            label = subtree.label()

            for leave in leaves:
                if leave in lst_sentence_ctrl:
                    sentence_reordered = sentence_reordered + [leave]
                    #### ...delete used word from lst_sentence_ctrl
                    lst_sentence_ctrl.remove(leave)

            if logger is not None and verbose:
                logger.info('subtree {} from {}'.format(
                    i + 1, len(lst_subtrees)))
                logger.info(subtree)
                logger.info('tree label: {}'.format(label))
                logger.info('input phrase: {}'.format(lst_sentence))
                logger.info('tree leaves: {}'.format(leaves))
                logger.info('reordered words: {}'.format(sentence_reordered))
                logger.info(
                    'remaining words to order: {}'.format(lst_sentence_ctrl))

            i = i + 1

            #### ...end the loop once lst_sentence_ctrl is empty (all words used)
            if len(lst_sentence_ctrl) == 0:
                break

        if logger is not None:
            logger.info('reordered sentence: {}'.format(sentence_reordered))

        #### ...warn if, due to some problem, len(input sentence) != len(output sentence)
        if logger is not None and len(lst_sentence) != len(sentence_reordered):
            logger.warning('input and output sentences have different lengths')

    except Exception as e:
        if logger is not None:
            logger.exception(
                'ERROR reordering phrase: {}'.format(lst_sentence))
        raise e

    return sentence_reordered
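
# A minimal hedged usage sketch of the function above, assuming a CoreNLP server
# on port 9000 and Python's standard logging module for the optional logger:
import logging

logging.basicConfig(level=logging.INFO)
reordered = reorder_syntactic_tokenized_sentence(
    logger=logging.getLogger('reorder'),
    lst_sentence='the old man sat alone on the mountain'.split(),
    use_stanford_parser=True,
    verbose=False)
print(reordered)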
Example no. 13
                o_tree = o_tree[0]
            res = res + o_tree.label() + '(' + ' '.join(subtree[i].leaves()) + ')'
        else:
            res = res + subtree[i].label() + '(' + ' '.join(subtree[i].leaves()) + ')'
    return res
    #
    # if type(subtree) == ParentedTree and len(subtree) > 1:
    #     return subtree
    # else:
    #     return None

if __name__ == '__main__':
    res=[]
    parser = CoreNLPParser(url='http://localhost:9000')
    # dataset=pd.read_csv("Result.csv",sep=',',header=None)
    sentence = [bala for bala in parser.parse("right with something thing white in his hand".split())]
    sentence = ParentedTree.convert(sentence[0])
    print(sentence)
    # res.append(result(sentence))
    # tree = parser.parse("old lady with glasses holding teddy bear")
    # sentence = ParentedTree.convert(tree)
    # print(result(sentence))

    # for i in range(0,dataset.shape[0]):
    # # for i in range(0,1):
    #     sentence = [bala for bala in parser.parse(dataset[1][i].split())]
    #     sentence = ParentedTree.convert(sentence[0])
    #     res.append(result(sentence))
    #     print(i)
    # dataset[3] = res
    # print(dataset)
Example no. 14
class custom_parse_handler:
    corenlp_host = 'http://localhost:9000'  # CoreNLP server host
    main_categories = ['geography', 'music',
                       'movies']  # Categories utilized in the project

    def __init__(self, input_file, output_file, dbConnector):
        self.ip_file = input_file  # Input statements
        self.op_file = output_file  # Generated output streamed to this file apart from the command prompt

        self.parser = CoreNLPParser(
            url=self.corenlp_host
        )  # Initializing the connection with CoreNLP parse
        self.testParserConnection()

        # Setting up the word2vec model
        corpusFilePath = os.path.dirname(os.path.realpath(
            __file__)) + os.path.sep + "tools" + os.path.sep + "word2vec"
        corpusFileName = "GoogleNews-vectors-negative300.bin"
        self.filePath = corpusFilePath  # file path
        self.fileName = corpusFilePath + os.path.sep + corpusFileName  # Constructing the full path for the file name
        self.model = KeyedVectors.load_word2vec_format(self.fileName,
                                                       binary=True)

        self.stopWords = nltk.corpus.stopwords.words('english')
        self.fileNewLine = "\n"

        self.dbConnector = dbConnector

    def testParserConnection(self):
        str = "This is a test statement"
        try:
            list(self.parser.parse(str.split()))
        except Exception as e:
            print("Error while connecting to CoreNLP server. Exiting.")
            sys.exit()

    def getParseTree(self, sentence):
        return list(self.parser.parse(sentence.split()))

    def displayConstructedParseTree(self, parseTree, fileObj=None):
        for entry in parseTree:
            if fileObj is None:
                entry.pretty_print()
            else:
                entry.pretty_print(stream=fileObj)

    def updatePredictedCategoryForWord(self, entry, category_sum):
        for i in range(len(self.main_categories)):
            try:
                sim_val = self.model.similarity(entry, self.main_categories[i])
                category_sum[self.main_categories[i]] += sim_val
            except KeyError:
                pass
        return category_sum

    def getCategoryWithMaxVoting(self, categoryMap):
        max_val = None
        max_category = None
        for entry in categoryMap:
            val = categoryMap[entry]
            if max_val is None or val > max_val:
                max_val = val
                max_category = entry
        return max_category

    def assignCategory(self, statement):
        # Special case: Statements associated with Geography are misclassified when beginning with "where is"
        if statement.lower().startswith('where is'):
            return 'geography'
        tokens = list(self.parser.tokenize(statement))
        filtered_words = [w for w in tokens if not w in self.stopWords]
        filtered_words_lower = [w.lower() for w in filtered_words]
        category_sum = {}
        for entry in self.main_categories:
            # Exclude 'geography' when birth-related words are present, and exclude the other categories when 'capital' appears
            if (entry != 'geography' and 'capital' not in filtered_words_lower
                ) or (entry == 'geography'
                      and not ('born' in filtered_words_lower
                               or 'birth' in filtered_words_lower)):
                category_sum[entry] = 0
        for entry in filtered_words:
            category_sum = self.updatePredictedCategoryForWord(
                entry, category_sum)
        return self.getCategoryWithMaxVoting(category_sum)

    # Direct the output to the default output stream and an output file.
    def outputGenerator(self, statement, query, answer, opFileObj=None):
        if opFileObj is not None:
            opFileObj.write("<QUESTION> " + statement)
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
            if query is not None:
                opFileObj.write("<QUERY> " + query)
            else:
                opFileObj.write("<QUERY> ")
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
            opFileObj.write("<ANSWER> " + answer)
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
        print("<QUESTION> ", statement, "\n")
        if query is not None:
            print("<QUERY> ", query, "\n")
        else:
            print("<QUERY>\n")
        print("<ANSWER> ", answer, "\n\n")

    # Process the parse tree to extract projections and perform translation into SQL queries
    def extractProjections(self, parseTree, queryObj, category):
        # Starts off the entire tree recursion process
        for entry in parseTree:
            # entry.pretty_print()
            self.processRecurse(None, entry, queryObj, category)

    # Recursively traverse the parse tree using DFS and generate transitions for each parent, child node-pair
    def processRecurse(self, parent, treeObj, queryObj, category):
        if type(treeObj) != nltk.tree.Tree:  # leaf node
            transition_obj = transition(parent, treeObj, None, queryObj,
                                        category)
            return treeObj, transition_obj
        if "." == treeObj.label() or "DT" == treeObj.label(
        ):  # do not handle determiners or punctuation
            return "", None
        str_transition = treeObj.label() + " " + "->"
        current_children = []
        for i in range(len(treeObj)):
            label, transition_obj_inter = self.processRecurse(
                treeObj.label(), treeObj[i], queryObj, category)
            if transition_obj_inter is not None:
                current_children.append(transition_obj_inter)
                str_transition += " " + label
        if treeObj.label() != 'ROOT':
            transition_obj_fin = transition(parent, str_transition,
                                            current_children, queryObj,
                                            category)
        else:  # Root of the tree is encountered
            transition_obj_fin = None
        return treeObj.label(), transition_obj_fin

    def parseInputFile(
        self
    ):  # Parse the statements in the input file sequentially and perform semantic transformation
        ipFileObj = open(self.ip_file, "r")
        opFileObj = open(self.op_file, "w")
        try:
            for entry in ipFileObj:
                question = entry.strip()
                if not question.startswith('--'):
                    queryObj = queryForm()
                    parseTree = self.getParseTree(
                        question)  # Generate parse tree
                    category = self.assignCategory(
                        question)  # Assign probable category
                    self.extractProjections(
                        parseTree, queryObj, category
                    )  # Extract the projections and generate the query object
                    # queryObj.printComponents()
                    queryObj.constructQuery()  # Construct the final query
                    results = self.dbConnector.getResults(
                        queryObj, category
                    )  # Execute the query in the database and generate results
                    self.outputGenerator(question, queryObj.getQueryStr(),
                                         results, opFileObj)
        except Exception as e:
            print("Error while processing.")
            print(e)
        finally:
            ipFileObj.close()
            opFileObj.close()
Example no. 15
    def __init__(self, text):
        self.text = text
        self.tokens = nltk.word_tokenize(text)
        parser = CoreNLPParser(url='http://localhost:8999')
        self.syntax_tree = list(parser.parse(nltk.word_tokenize(self.text)))[0]
# TODO: make this test report whether the output matches the expected parse.

from nltk.parse import CoreNLPParser

# Lexical Parser
parser = CoreNLPParser(url='http://localhost:9000')

# Parse tokenized text.
print(
    list(parser.parse('What is the airspeed of an unladen swallow ?'.split())))
print(
    "\nExpected: [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]\n"
)

# Parse raw string.
print(list(parser.raw_parse('What is the airspeed of an unladen swallow ?')))
print(
    "\nExpected: [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]\n"
)

# Neural Dependency Parser
from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse(
    'What is the airspeed of an unladen swallow ?'.split())
print([[(governor, dep, dependent)
        for governor, dep, dependent in parse.triples()] for parse in parses])
print(
    "\nExpected: [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]\n"
)
Example no. 17
    return len(rhs) == 1 and isinstance(rhs[0], str)


parser = CoreNLPParser(url="http://localhost:9000")

sentences = brown.sents()

# FILTER SHORT AND LONG SENTENCES
filter_sentences = []
for sentence in tqdm(sentences):
    nb_words = number_of_words(sentence)
    if nb_words >= 5 and nb_words <= 10:
        filter_sentences.append(sentence)

# PARSE SENTENCES
productions = []
for sentence in tqdm(filter_sentences):
    parse_tree = next(iter(parser.parse(sentence)))
    productions += parse_tree.productions()

unique_productions = list(set(productions))

# REMOVE TERMINAL SYMBOLS
productions_wo_term = []
for prod in unique_productions:
    if not is_rhs_terminal(prod):
        productions_wo_term.append(prod)

grammar = CFG(start=Nonterminal("ROOT"), productions=productions_wo_term)
pickle.dump(grammar, open("brown_grammar.pickle", "wb"))
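
# A hedged follow-up: the pickled grammar can be re-loaded and inspected later.
# Note the induced rules contain only nonterminals, since terminal (lexical)
# productions were stripped above.
import pickle
from nltk.grammar import Nonterminal

with open("brown_grammar.pickle", "rb") as fh:
    grammar = pickle.load(fh)

print(grammar.start())                                  # ROOT
print(len(grammar.productions()))                       # number of induced rules
print(grammar.productions(lhs=Nonterminal("NP"))[:5])   # a few NP rewrite rules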
Example no. 18
            trees = ne_chunk(sent)
            for tree in trees:
                if hasattr(tree, 'label'):
                    if tree.label() in labels:
                        entities.append(' '.join(
                            [child[0].lower() for child in tree]))
    return entities


# To run this you have to connect to the CoreNLP API:
# go to the stanford-corenlp-full-2018-02-27 directory and
# type the two lines below in the terminal as a single command:
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
# -preload tokenize,ssplit,pos,lemma,ner,parse,depparse -status_port 9000 -port 9000 -timeout 15000 &

from nltk.parse import CoreNLPParser
parser = CoreNLPParser(url='http://localhost:9000')
list(parser.parse(doc))  # for a pre-tokenized document (a list of tokens)
list(parser.raw_parse(doc))  # for a raw, untokenized document

# on tokenized list of words
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
list(pos_tagger.tag(doc))

ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
list(ner_tagger.tag(doc))

from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
list(dep_parser.parse(doc))
Example no. 19
        senid += 1
        # stopline = 1899
        # if senid != stopline:
        #     continue
        # senid = stopline
        fe_alignment, ef_alignment = get_alignments(fe_phrase, ef_phrase)
        alignment = do_alignment(fe_alignment, ef_alignment, len(ef_phrase[0]),
                                 len(fe_phrase[0]))
        # fe_phrase = fe_phrases[id]
        # ef_phrase = ef_phrases[id]
        BP, BP_pos = phrase_extraction(fe_phrase[0], ef_phrase[0],
                                       alignment)  # fe_phrase[0] is the e-side sentence
        f_sen = ' '.join(ef_phrase[0])

        try:
            p_parse_trees = list(parser.parse(parser.tokenize(f_sen)))
        except ValueError:
            print('parsing fail')
            exception_sen.append(senid)
            p_parse_trees = [Tree.fromstring('(S (NULL ERROR))')
                             ]  # we simply give a dummy tree

        # create a dict to keep all phrase in different categories
        p_phrase_dict = {}
        for tag in phrase_tag:
            p_phrase_dict[tag] = []

        for one_tree in p_parse_trees:
            # print(one_tree)
            traverse(one_tree, p_phrase_dict, phrase_tag)
    parser = CoreNLPParser(url='http://localhost:9001')
    print('parser generated!')
    exception_sen = []
    tree_list = []
    p_phrase_trees = None
    f_input = open(args.input, mode='r', encoding='utf-8')

    f_output = open(args.output, mode='w', encoding='utf-8')
    # f_output = open(args.output, 'wt')
    for senid, line in enumerate(f_input):
        print(senid)
        # if senid > 100:
        #     break
        try:
            p_parse_trees = list(parser.parse(parser.tokenize(line)))
        except ValueError:
            print('parsing fail')
            exception_sen.append(senid)
            p_parse_trees = [Tree.fromstring('(S (NULL ERROR))')
                             ]  # we simply give a dummy tree
        f_output.write('%d\n' % len(p_parse_trees))
        for sub_tree in p_parse_trees:
            f_output.write(str(sub_tree))
            f_output.write('\n|||\n')
        # str_tree = ' '.join(p_parse_trees)
        # f_output.write(str_tree)
        # f_output.write('\n')

        # tree_list.append(p_parse_trees)
    f_input.close()
Example no. 21
        for sentence in sentences:
            l += len(tokenize(sentence))
        print("part:{}, label:{}, mean token count: {}".format(part, label, str(l/len(df))))

        l = set()
        for i in premise:
            l |= set(tokenize(i))

for part in parts:
    if part == "all":
        continue
    sentences = data[part].to_list()
    f = open("{}_{}".format(data_file, part), "a")
    for sentence in sentences:
        p = list(parser.parse(sentence.split()))
        for w in p:
            f.write(' '.join(str(w).split()))
        f.write("\n")
    f.close()
hypothesis = []
f = open("{}_hypothesis".format(data_file), 'r')
for i in f:
    hypothesis.append(i)
premise = []
f = open("{}_premise".format(data_file), 'r')
for i in f:
    premise.append(i)
tags_to_results = defaultdict(list)
df = pd.DataFrame([], index=list(tags_to_results.keys()))
def log(tag, is_correct, label):
    tags_to_results[tag].append((is_correct, label))
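
# A hedged sketch of how the tags_to_results structure filled by log() might be
# summarized afterwards (the original summary code is not shown):
def summarize(tags_to_results):
    # Per-tag accuracy: fraction of (is_correct, label) pairs that were correct.
    return {tag: sum(1 for is_correct, _ in results if is_correct) / len(results)
            for tag, results in tags_to_results.items() if results}

# e.g. print(summarize(tags_to_results))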