    def init_server(self):
        # Assumes: import os; from nltk.parse.corenlp import CoreNLPServer, CoreNLPParser
        STANFORD = os.path.join("stanford-corenlp-full-2018-10-05")
        self.server = CoreNLPServer(
            os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
            os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"))
        self.server.start()
        self.parser = CoreNLPParser()
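The snippet starts the CoreNLP server in the background but never shuts it down in the code shown; a minimal teardown sketch (the stop_server name is hypothetical, only CoreNLPServer.stop() is taken from NLTK) might look like:

    def stop_server(self):
        # Hypothetical companion to init_server: release the background Java process.
        if getattr(self, "server", None) is not None:
            self.server.stop()
            self.server = None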
Example #2
def pos_tagger_lemma(document, listterms):
    # Assumes `nlp` is a loaded spaCy Spanish model and a Spanish CoreNLP server on port 9003.
    print('Definition by POS tagger and lemma, searching 3-, 2- and 1-grams')
    text = str()
    definiendums = list()
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    for i in document:
        if (len(i) > 1):
            tag = pos_tagger.tag(i.split(' '))
            for t in tag:
                if (t[1] == 'VERB'):
                    doc = nlp(t[0])
                    for tok in doc:
                        l = tok.lemma_
                        if (l == 'ser'):
                            text = i
                            indverb = i.index(t[0])
                            front = i[indverb:]
                            back = i[:indverb + len(t[0]) + 1]
                            tagfront = pos_tagger.tag(front.split(' '))
                            tagback = pos_tagger.tag(back.split(' '))
                            definiendum_definition(t[0], text, listterms)

                elif (t[1] == 'NOUN' and t[0] != '=RRB='):
                    text = i
                    if (len(t[0]) > 1):
                        #definiendum_definition(t[0], text, listterms)
                        pass

    return (text)
Example #3
def _parseSentences(sentences, parseCacheFile):
    def cacheKey(text):
        # shelve keys must be str, not bytes, under Python 3.
        return text.strip()

    cache = shelve.open(parseCacheFile)

    toParse = []
    for sentence in sentences:
        if cacheKey(sentence) not in cache:
            toParse.append(sentence)

    if toParse:
        # Assumes a thread-based Pool (e.g. multiprocessing.dummy); process pools cannot pickle the lambda below.
        p = Pool(10)

        parser = CoreNLPParser(
            url=os.getenv("CORENLP_HOST", "http://localhost:9000"))

        parseIterator = p.imap(lambda s: parser.parse_one(s.split()), toParse)

        progress = ProgressBar(len(toParse))
        for i, parse in enumerate(parseIterator):
            cache[cacheKey(toParse[i])] = parse
            progress.done(i)
        progress.complete()

    # Materialize the parses before closing the shelf; a lazy map would read from a closed cache.
    parses = [cache[cacheKey(s)] for s in sentences]
    cache.close()

    return parses
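_parseSentences relies on a ProgressBar helper that is not shown; a minimal stand-in consistent with the done(i)/complete() calls above (purely an assumption about the original) could be:

import sys


class ProgressBar:
    """Minimal stand-in: constructed with a total, updated with done(i), finished with complete()."""

    def __init__(self, total):
        self.total = total

    def done(self, i):
        # Rewrite the same terminal line with the current progress count.
        sys.stdout.write("\rparsed %d/%d" % (i + 1, self.total))
        sys.stdout.flush()

    def complete(self):
        sys.stdout.write("\n")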
def check_triples_by_pos(triples):
    pos_tagger = CoreNLPParser(url='http://39.98.186.125:9000', tagtype='pos')
    ret_triples = []
    for triple in triples:
        source = triple[0]
        relation = triple[1]
        target = triple[2]
        source_pos = ",".join(
            [e[1] for e in pos_tagger.tag(source.split(" "))])
        relation_pos = ",".join(
            [e[1] for e in pos_tagger.tag(relation.split(" "))])
        target_pos = ",".join(
            [e[1] for e in pos_tagger.tag(target.split(" "))])

        if "VB" in source_pos or "VB" in target_pos:
            continue
        if "NN" not in source_pos or "NN" not in target_pos:
            continue
        if "NN" in relation_pos:
            if " at" in relation.lower():
                relation = "at"
            elif "of" not in relation.split(" ") and len(
                    relation.split(" ")) > 1:
                continue

        ret_triples.append([source, relation, target])

    return ret_triples
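A usage sketch for check_triples_by_pos, assuming the CoreNLP server at the URL above (or a local one) is reachable; the example triples are made up:

triples = [
    ["the cat", "sat on", "the mat"],   # noun source/target, verb-ish relation: likely kept
    ["running", "is", "fun"],           # verb-tagged source: likely filtered out
]
print(check_triples_by_pos(triples))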
Example #5
def PVD(document):
    # Assumes `nlp` is a loaded spaCy Spanish model and a Spanish CoreNLP server on port 9003.
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    sentence = sent_tokenize(document)
    word = 'se'
    lemma = 'definir'
    for i in range(len(sentence)):
        #print('--------------------')
        pattern = list()
        postaglist = list()
        tokens = nltk.word_tokenize(sentence[i])
        tag = pos_tagger.tag(tokens)
        for t in tag:
            if ('se' in tokens):
                pos = tokens.index('se')
                front = tokens[pos + 1:pos + 2]
                tag = pos_tagger.tag(front)

                doc = nlp(t[0])
                lemlist = [tok.lemma_ for tok in doc]
                #lem=''.join(lemlist)
                #lemmas_list.append(lem)
                #print(lemma, '-', lemlist)
                if ('definir' in lemlist or 'entender' in lemlist
                        or 'denominar' in lemlist):
                    #print(sentence[i])
                    front = tokens[pos + 2:pos + 5]
            if (t[1] == 'PUNCT'):
                pos = tokens.index(t[0])
                print(t[0], pos, tag[pos + 1])
            # '''if(t[1]=='AUX'):  (the snippet is truncated here in the source)
Example #6
def dependency_parse(raw_data):
    from nltk.parse.corenlp import CoreNLPServer

    # The server needs to know the location of the following files:
    #   - stanford-corenlp-X.X.X.jar
    #   - stanford-corenlp-X.X.X-models.jar
    STANFORD = os.path.join("..", "stanford-corenlp-full-2020-04-20")

    # Create the server
    server = CoreNLPServer(
        os.path.join(STANFORD, "stanford-corenlp-4.0.0.jar"),
        os.path.join(STANFORD, "stanford-corenlp-4.0.0-models.jar"),
    )

    # Start the server in the background
    server.start()
    from nltk.parse import CoreNLPParser
    parser = CoreNLPParser()

    new_data = []
    for example in raw_data:
        sentence, features_seq = example[0], example[-1]
        parse = next(parser.raw_parse(sentence))
        # get a few "important" neighboring words

    server.stop()
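The function above starts and stops the server around the parsing loop; a hedged variant (not the original code) that guarantees the background JVM is stopped even if parsing raises could look like:

import os

from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPServer


def dependency_parse_safe(raw_data):
    # Sketch only: same setup as dependency_parse, but the server is always stopped.
    STANFORD = os.path.join("..", "stanford-corenlp-full-2020-04-20")
    server = CoreNLPServer(
        os.path.join(STANFORD, "stanford-corenlp-4.0.0.jar"),
        os.path.join(STANFORD, "stanford-corenlp-4.0.0-models.jar"),
    )
    server.start()
    try:
        parser = CoreNLPParser()
        return [next(parser.raw_parse(example[0])) for example in raw_data]
    finally:
        server.stop()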
def pos_tag_text(text):
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None

    #print("ORIGINAL TEXT ---------------",text)
    #tagged_text = tag(text)
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    tagged_text = pos_tagger.tag(text.split())

    #tagged_text = nltk.pos_tag(text)

    #print("TAGGED TEXT ---------",tagged_text)

    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in tagged_text]

    return tagged_lower_text
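A usage sketch for pos_tag_text; it assumes `from nltk.corpus import wordnet as wn` at module level and a CoreNLP server on localhost:9000:

tags = pos_tag_text("The quick brown fox jumps over the lazy dog")
print(tags)
# roughly: [('the', None), ('quick', 'a'), ('brown', 'a'), ('fox', 'n'), ('jumps', 'v'), ...]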
Example #8
    def __init__(self, w2v_path, corpus_dict_path, port=9000):
        # corenlp client
        self.parser = CoreNLPParser(url='http://localhost:' + str(port))
        self.dep_parser = CoreNLPDependencyParser(url='http://localhost:' +
                                                  str(port))
        # w2v
        # Load the vectors from the path that was passed in, not a hard-coded location.
        self.word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
            w2v_path, binary=True)
        print('w2v model loaded')
        # training corpus for one hot features
        corpus_dict = pickle.load(open(corpus_dict_path, 'rb'))

        self.dep_tuple_vectorizer = DictVectorizer(sparse=False)
        self.dep_tuple_vectorizer = self.dep_tuple_vectorizer.fit(
            corpus_dict['dep_tuple'])

        self.unigram_vectorizer = DictVectorizer(sparse=False)
        self.unigram_vectorizer = self.unigram_vectorizer.fit(
            corpus_dict['unigram'])

        self.bigram_vectorizer = DictVectorizer(sparse=False)
        self.bigram_vectorizer = self.bigram_vectorizer.fit(
            corpus_dict['bigram'])

        self.trigram_vectorizer = DictVectorizer(sparse=False)
        self.trigram_vectorizer = self.trigram_vectorizer.fit(
            corpus_dict['trigram'])

        self.lexical_vectorizer = DictVectorizer(sparse=False)
        self.lexical_vectorizer = self.lexical_vectorizer.fit(
            corpus_dict['lexical'])
Example #9
def parseSentenceStructure(data):

    #Tokenize sent.
    tokens = nltk.word_tokenize(data)

    #Tag sent.
    tagged = nltk.pos_tag(tokens)

    #Parser
    parser = CoreNLPParser(
        url='http://localhost:9000'
    )  #https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk/51981566#51981566
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

    #Parse w/ Stanford
    tree = parser.raw_parse(data)
    #print(list(tree))

    #list(tree)[0].pretty_print()
    #print(list(tree))

    #Provide N-V-N relationships w/ all N combinations

    #Traverse for NP root
    tree_recurse_find(list(tree)[0])
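parseSentenceStructure calls a tree_recurse_find helper that is not included in the snippet; a hypothetical depth-first version that reports NP subtrees (an assumption, not the original helper) might be:

def tree_recurse_find(tree):
    # Leaves of an NLTK parse tree are plain strings, so stop there.
    if not hasattr(tree, "label"):
        return
    if tree.label() == "NP":
        print(" ".join(tree.leaves()))
    for child in tree:
        tree_recurse_find(child)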
Example #10
    def __init__(self, config):
        self.ontology_tagging = OntologyTagging()
        self.config = config
        self.word_dictionary = self.compute_all_embeddings()
        self.server_url = 'http://localhost:9000'
        self.parser = CoreNLPParser(url=self.server_url)
        self.core_nlp_dependency_parser = CoreNLPDependencyParser(
            url=self.server_url)
Example #11
def pruebas():
    # Assumes `nlp` is a loaded spaCy Spanish model and a Spanish CoreNLP server on port 9003.
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    tag = pos_tagger.tag(
        'tengo que ir Por el contrato de compra y venta uno de los contratantes se obliga a entregar una cosa determinada y el otro a pagar por ella un precio cierto, en dinero o signo que lo represente'
        .split(' '))
    doc = nlp('considerará')
    lemlist = [tok.lemma_ for tok in doc]
    print(lemlist)
def scorer(title, speaker):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    pos_tag_list = list(pos_tagger.tag(title[0].split()))
    s = 0
    for i in pos_tag_list:
        if 'NN' in i[1] or 'NP' in i[1]:
            s += 1
    return -s + abs(title[1] - speaker[1]) * 0.3
Example #13
    def __init__(self,
                 host='services.loadbalancer.api.questo.ai',
                 port=9000,
                 separator='|'):
        # self.parser = CoreNLPParser(url=f'http://{host}:{port}')
        self.parser = CoreNLPParser(
            url='http://services.loadbalancer.api.questo.ai:9000')
        self.separator = separator
Example #14
    def clear_data(self):
        self.parser = CoreNLPParser(url='http://localhost:9000')
        self.first_NP = ''
        self.first_VP = ''
        self.parse_tree = None
        self.subject = RDF_Triple.RDF_SOP('subject')
        self.predicate = RDF_Triple.RDF_SOP('predicate', 'VB')
        self.Object = RDF_Triple.RDF_SOP('object')
Example #15
def parse(text):
    parser = CoreNLPParser("http://localhost:9000")
    result = parser.raw_parse(text.lower())
    trees = [tree for tree in result]
    for tree in trees:
        tree.chomsky_normal_form()
        tree.collapse_unary(collapseRoot=True, collapsePOS=True)
    trees = [ParentedTree.convert(tree) for tree in trees]
    return trees
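A usage sketch for parse, assuming a CoreNLP server on localhost:9000:

for tree in parse("The old man the boat."):
    # The trees come back binarized (Chomsky normal form) with unary chains collapsed.
    tree.pretty_print()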
Example #16
File: tree.py Project: tTeha/MRMARS
def parse(text):
    parser = CoreNLPParser(CORENLP_SERVER)
    result = parser.raw_parse(text)
    trees = [tree for tree in result]
    for tree in trees:
        tree.chomsky_normal_form()
        tree.collapse_unary(collapseRoot=True, collapsePOS=True)
    trees = [ParentedTree.convert(tree) for tree in trees]
    return trees
def verb_stats(data):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    verb_count = 0
    for _, value in data.items():
        pos = list(pos_tagger.tag(value.split()))
        for _, second in pos:
            if second.startswith("V"):
                verb_count += 1
    print(verb_count)
Example #18
    def syntax_tree_parser(self):
        """
        get syntax tree
        :return: syntax tree
        """
        if self.syntax_tree is not None:
            return self.syntax_tree
        parser = CoreNLPParser(url='http://localhost:8999')
        self.syntax_tree = list(parser.parse(nltk.word_tokenize(self.text)))[0]
        return self.syntax_tree
def get_probable_title(titles):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    score = []
    for title in titles:
        pos_tag_list = list(pos_tagger.tag(title[0].split()))
        s = 0
        for i in pos_tag_list:
            if 'NN' in i[1] or 'NP' in i[1]:
                s += 1
        score.append((s, len(title[0]), title[0], title[1]))
    return max(score)
Example #20
    def __init__(self):
        """
        Initialize the SVO Methods
        """
        self.noun_types = ["NN", "NNP", "NNPS", "NNS", "PRP"]
        self.verb_types = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
        self.adjective_types = ["JJ", "JJR", "JJS"]
        self.pred_verb_phrase_siblings = None
        # Leftover paths from a StanfordPOSTagger setup; apparently unused by the CoreNLP-based parser below.
        jar = r'D:\data'
        model = r'your_path/stanford-postagger-full-2016-10-31/models/english-left3words-distsim.tagger'
        self.parser = CoreNLPParser(url='http://localhost:9000')
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
Example #21
def get_speaker_salutaion(text, persons, speakers):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    text = text.split()
    pos_tag_list = list(pos_tagger.tag(text))
    for i in range(len(pos_tag_list) - 1):
        if pos_tag_list[i][1] == 'NNP' and pos_tag_list[i][0] not in persons:
            # Compare the token (index 0), not the whole (word, tag) tuple.
            if pos_tag_list[i + 1][0] in persons:
                for j, speak in enumerate(speakers):
                    if pos_tag_list[i + 1][0] in speak:
                        # Update the list entry; rebinding the loop variable alone would be lost.
                        speakers[j] = pos_tag_list[i][0] + ' ' + speak
                        break
    return speakers
Example #22
def get_stanford_pos_tags(line):
    """
    Get part of speech tags using the Stanford POS tagger
    """

    st_pos = CoreNLPParser(url="http://localhost:9000", tagtype="pos")
    tokenized_line = cnf.TOKENIZER.tokenize(line)
    line_tagged_initial = st_pos.tag(tokenized_line)
    line_tagged_output = []

    for item in line_tagged_initial:
        line_tagged_output.append((item[0], item[1]))

    return line_tagged_output
Example #23
    def createGrammar(self, userMessages, ctx):
        parser = CoreNLPParser(url='http://localhost:9000')
        parse_trees = []
        for message in userMessages:
            tokenized = nltk.sent_tokenize(message)
            for sentence in tokenized:
                parse_trees.append(list(parser.raw_parse(sentence))[0])
        grammar_rules = set()
        for tree in parse_trees:
            for production in tree.productions():
                grammar_rules.add(production)
        start = nltk.Nonterminal('S')
        # Note: deduplicating productions into a set means induce_pcfg sees each rule once,
        # so the estimated probabilities ignore how often a rule actually occurred.
        grammar = nltk.induce_pcfg(start, grammar_rules)
        return (' '.join((self.generate_sentence(grammar))))
Example #24
def get_entities(tweets):
    entities = []
    # Create the tagger once instead of once per sentence.
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    for atweet in tweets:
        for sent in atweet:
            sent = list(pos_tagger.tag(normalize(sent)))
            #             sent = pos_tag(normalize(sent))
            trees = ne_chunk(sent)
            for tree in trees:
                if hasattr(tree, 'label'):
                    if tree.label() in labels:
                        entities.append(' '.join(
                            [child[0].lower() for child in tree]))
    return entities
Example #25
def sfNERTagger(rawText):
    '''(sf = Stanford) take raw text and return a list of [word, NER-tag] pairs using the Stanford NER annotator'''
    parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    tupleList = list(parser.tag(rawText.split()))
    # convert the list of tuples to a list of lists, so the tags we don't need can be rewritten
    NERList = [list(pair) for pair in tupleList]

    # collapse the tag set to the labels we care about
    for item in NERList:
        if item[1] == 'COUNTRY': item[1] = 'COU'
        elif item[1] == 'PERSON': item[1] = 'PER'
        elif item[1] == 'CITY': item[1] = 'CIT'
        elif item[1] == 'ORGANIZATION': item[1] = 'ORG'
        else: item[1] = ''

    return NERList
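A usage sketch for sfNERTagger, assuming a CoreNLP server with the NER annotator on localhost:9000:

print(sfNERTagger("Angela Merkel visited Paris in 2015"))
# roughly: [['Angela', 'PER'], ['Merkel', 'PER'], ['visited', ''], ['Paris', 'CIT'], ['in', ''], ['2015', '']]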
class PosTaggingCoreNLP(PosTagging):
    """
    Concrete class of PosTagging using a CoreNLP server 
    Provides a faster way to process several documents using since it doesn't require to load the model each time.
    """

    def __init__(self, host='localhost', port=9000, separator='|'):
        self.parser = CoreNLPParser(url=f'http://{host}:{port}')
        self.separator = separator

    def pos_tag_raw_text(self, text, as_tuple_list=True):
        # Unfortunately, for the moment there is no method to do sentence splitting + POS tagging in nltk.parse.corenlp.
        # Only raw_tag_sents is available, but it assumes a list of str (i.e. the sentences are already split).
        # We create a small custom function, highly inspired by raw_tag_sents, to do both.

        def raw_tag_text():
            """
            Perform tokenization, sentence splitting and POS tagging, and keep the
            sentence-split structure
            """
            properties = {'annotators': 'tokenize,ssplit,pos'}
            tagged_data = self.parser.api_call(text, properties=properties)
            for tagged_sentence in tagged_data['sentences']:
                yield [(token['word'], token['pos']) for token in tagged_sentence['tokens']]

        tagged_text = list(raw_tag_text())

        if as_tuple_list:
            return tagged_text
        return '[ENDSENT]'.join(
            [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent]) for sent in tagged_text])
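A usage sketch for PosTaggingCoreNLP, assuming a CoreNLP server on localhost:9000 and NLTK's tuple2str helper imported at module level:

tagger = PosTaggingCoreNLP()
print(tagger.pos_tag_raw_text("It works. Mostly."))
# roughly: [[('It', 'PRP'), ('works', 'VBZ'), ('.', '.')], [('Mostly', 'RB'), ('.', '.')]]
print(tagger.pos_tag_raw_text("It works. Mostly.", as_tuple_list=False))
# roughly: 'It|PRP works|VBZ .|.[ENDSENT]Mostly|RB .|.'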
def get_stanford_nps(line):
    """
	Get noun phrases using the Stanford tagger
	"""

    noun_phrase_list = []
    parser = CoreNLPParser(url="http://localhost:9000")

    tag_list = list(parser.raw_parse(line))
    for item in tag_list:
        for subtree in item.subtrees():
            if subtree.label() == "NP":
                noun_phrase = " ".join(subtree.leaves())
                noun_phrase_list.append((noun_phrase))

    return noun_phrase_list
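A usage sketch for get_stanford_nps, assuming a CoreNLP server on localhost:9000:

print(get_stanford_nps("The quick brown fox jumped over the lazy dog"))
# roughly: ['The quick brown fox', 'the lazy dog']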
Example #28
def ed_rip(word: str):
    # Assumes `nlp` is a stanza-style pipeline (see the .sentences[0].words[0].lemma access below)
    # and a CoreNLP POS server on localhost:9000.
    NV = False
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    nlpinfo = nlp(word.lower())
    ripword = nlpinfo.sentences[0].words[0].lemma
    # Recapitalization
    if re.search('^[A-Z]', word) is not None:
        ripword = ripword.capitalize()
    # Return information needed to determine NV Passive.
    riptoken = nltk.word_tokenize(ripword)
    riptag = pos_tagger.tag(riptoken)[0][1]
    print(riptoken, riptag)
    if riptag.startswith('V'):
        NV = False
    elif riptag.startswith('N'):
        NV = True
    return (ripword, NV)
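A usage sketch for ed_rip; `nlp` is assumed to be a stanza.Pipeline('en') (or a similar lemmatizer exposing .sentences[0].words[0].lemma), with a CoreNLP POS server on localhost:9000:

print(ed_rip("Painted"))  # e.g. ('Paint', False) if the lemma is tagged as a verb
print(ed_rip("Fenced"))   # e.g. ('Fence', True) if the lemma is tagged as a noun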
Example #29
def punctuation_funct(document, listterms):
    print(
        'Definition by punctuation: ":" or a term followed by a comma, ending with a comma before a verb'
    )
    text = str()
    definiendum = str()
    definiendums = list()
    # Create the tagger once rather than inside the nested loop below.
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    for i in document:
        for j in listterms:
            term = j[:-1]
            if (len(i) > 1):
                if (term + ':' in i):
                    ind = i.index(':')
                    after = i[ind + 1:]
                    if (len(after) > 1 and term not in definiendums):
                        definiendum = term
                        definition = after
                        definiendums.append(definiendum)
                        print(definiendum, '---->', definition)

                elif (term + ',' in i):
                    indterm = i.index(term)
                    if (',' in i[indterm + len(term):indterm + len(term) + 1]):
                        #print('-')
                        front = i[indterm:-1]
                        tag = pos_tagger.tag(i.split(' '))
                        for t in tag:
                            if (t[1] == 'VERB'):
                                #print(front)
                                if (t[0] in i):
                                    #print(t[0])
                                    indverb = i.index(t[0])
                                    if (i[indverb - 2] == ','):
                                        definiendum = term
                                        definition = i[indterm + len(term) +
                                                       1:indverb]
                                        if (len(definiendum) > 1
                                                and len(definition) > 1
                                                and definiendum
                                                not in definiendums):
                                            definiendums.append(definiendum)
                                            print(definiendum, '---->',
                                                  definition)
Example #30
    def __init__(self):

        # users need to download the Stanford Parser, NER and POS tagger from the Stanford website
        #parserJar = "/home/rav/Development/Semantic-Textual-Similarity/monolingualWordAligner/stanfordModels/stanford-parser-full-2017-06-09/stanford-parser.jar"
        #parserModel = "/home/rav/Development/Semantic-Textual-Similarity/monolingualWordAligner/stanfordModels/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar"

        self.fullModule = CoreNLPParser(url='http://localhost:9000')
        self.customProperties = {
            'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse'
        }

        #self.constituent_parse_tree = CoreNLPParser(url='http://localhost:9000')

        # StanfordParser(path_to_jar=parserJar, path_to_models_jar=parserModel)
        # users need to set it as an environment variable
        #self.stanford_dependency =  CoreNLPDependencyParser(url='http://localhost:9000')
        #StanfordDependencyParser(path_to_jar=parserJar, path_to_models_jar=parserModel)
        # users need to set it as an environment variable
        self.lemma = WordNetLemmatizer()

        ##self.ner = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
        #self.pos_tag = CoreNLPParser(url='http://localhost:9000', tagtype='pos')

        #self.home = '/home/ramesh'
        #user needs to download stanford packages and change directory
        #nerModelPath = "/home/rav/Development/Semantic-Textual-Similarity/monolingualWordAligner/stanfordModels/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz"
        #nerJarPath = "/home/rav/Development/Semantic-Textual-Similarity/monolingualWordAligner/stanfordModels/stanford-ner-2017-06-09/stanford-ner.jar"
        #self.ner = StanfordNERTagger(model_filename = nerModelPath,path_to_jar=nerJarPath)
        #posModelPath = "/home/rav/Development/Semantic-Textual-Similarity/monolingualWordAligner/stanfordModels/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger"
        #posJarPath = "/home/rav/Development/Semantic-Textual-Similarity/monolingualWordAligner/stanfordModels/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar"
        #self.pos_tag = StanfordPOSTagger(model_filename = posModelPath,path_to_jar=posJarPath)

        self.CharacterOffsetEnd = 0
        self.CharacterOffsetBegin = 0

        # this try block just checks that the server is up and running by
        # parsing a random sentence
        try:
            self.fullModule.raw_parse("A random sentence")
        except requests.ConnectionError as err:
            print(
                "The server doesn't seem to be running, make sure you start it"
            )
            raise SystemExit