Code Example #1
def main():
    TRAINING_INPUT_FILE = 'data/positive_negative_reviews_sentiment_2k.csv'
    OUTPUT_FILE = 'data/positive_negative_trigrams_2k.csv'
    # Note: 'csv' here appears to be a project-local helper module, not the standard library csv module.
    rows = csv.getRows(TRAINING_INPUT_FILE)
    cols = csv.getHeader(TRAINING_INPUT_FILE)
    cols.append('trigrams')
    for row in rows:
        row.append('dummy data')
    csv.writeFile(OUTPUT_FILE, rows, cols)
    print(cols)

    # parser = stanford.StanfordParser(model_path="/location/of/the/englishPCFG.ser.gz")
    parser = StanfordParser(
        model_path=
        "/Users/rohankohli/Documents/workspace/CoreNLP/models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )
    sentences = parser.raw_parse_sents(
        ("Hello, My name is Melroy.", "What is your name?"))
    print(sentences)
    print(next(sentences))
    return

    EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."
    print(sent_tokenize(EXAMPLE_TEXT))
    return

    # text = 'Punkt knows that the periods in Mr. Smith and Johann S. Bach do not mark sentence boundaries. And sometimes sentences can start with non-capitalized words.  i is a good variable name.'
    # sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    # print('\n-----\n'.join(sent_detector.tokenize(text.strip())))

    return
Code Example #2
File: parser.py Project: ptravers/centi
def parse_sentences(raw_sentences):
    parser = StanfordParser()

    raw_trees = parser.raw_parse_sents(raw_sentences)

    # Converts messy iterables into simple list of trees
    return [raw_tree[0] for sublist in raw_trees for raw_tree in sublist]
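A note on the return value: in NLTK 3.x, raw_parse_sents yields one inner iterator of Tree objects per input sentence, which is why the examples above either flatten the result or call next() on each inner iterator. A minimal sketch of that pattern, assuming the parser and model jars are already unpacked; the paths below are placeholders, not taken from any of the projects listed here:

import os
from nltk.parse.stanford import StanfordParser

# Placeholder locations; point these at wherever the Stanford parser is unpacked.
os.environ['STANFORD_PARSER'] = '/path/to/stanford-parser-full'
os.environ['STANFORD_MODELS'] = '/path/to/stanford-parser-full'

parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

sentences = ["The cat sat on the mat.", "What is your name?"]
# One inner iterator per sentence; next() takes the best-scoring parse.
best_trees = [next(tree_iter) for tree_iter in parser.raw_parse_sents(sentences)]
for tree in best_trees:
    print(tree)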
Code Example #3
File: question.py Project: abiraja2004/NLP_Project
def getSentence(file_name, num_sentence):
    HOME_PATH = "E:/CMU/Natural Language Processing/StanfordNLP/stanford-parser-full-2017-06-09"
    # HOME_PATH = "/home/stanford-parser-full/stanford-parser-full-2017-06-09"
    os.environ['STANFORD_PARSER'] = HOME_PATH
    os.environ['STANFORD_MODELS'] = HOME_PATH
    # ENG_Parser = StanfordParser('stanford-parser-full-2017-06-09/stanford-parser.jar','stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar')
    ENG_Parser = StanfordParser(model_path=HOME_PATH + "/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    # check version of Python in current environment
    if sys.version_info < (3, 0):
        txt_file = open(file_name, 'r').read().decode('utf-8', 'ignore')
    else:
        txt_file = open(file_name, 'r', encoding='utf-8').read()
    sent_tokenize_list = sent_tokenize(txt_file)
    TITLE = sent_tokenize_list[0].split("\n")[0]
    print("Parsing the whole article, may take up to several minutes......")
    parsed_sentences = [sentence for line in ENG_Parser.raw_parse_sents(sent_tokenize_list) for sentence in line]
    index = 0
    where_index = 0
    for parse_tree in parsed_sentences:
        try:
            if index == num_sentence:
                break
            valid_parse_tree = checkNPVP(parse_tree)
            if valid_parse_tree:
                who_question = genWhoQuestion(valid_parse_tree)
                what_question = genWhatQuestion(valid_parse_tree)
                yn_question_right = genYesNoQuestion(valid_parse_tree, False)
                yn_question_wrong = genYesNoQuestion(valid_parse_tree, True)
                # why_question = genWhyQuestion(valid_parse_tree)
                if where_index < 10:
                    where_question = getWhereQuestion(" ".join(valid_parse_tree.leaves()))
                    if where_question:
                        where_index += 1
                        index += 1
                        print("Question " + str(index) + ": " + where_question)
                if who_question:
                    index += 1
                    print("Question " + str(index) + ": " + who_question)
                if what_question:
                    index += 1
                    print("Question " + str(index) + ": " + what_question)
                if yn_question_right:
                    index += 1
                    print("Question " + str(index) + ": " + yn_question_right)
                if yn_question_wrong:
                    index += 1
                    print("Question " + str(index) + ": " + yn_question_wrong)
                    # if why_question:
                    #     index += 1
                    #     print("Question " + str(index) + ": " + why_question)
        except:
            continue

    while index < num_sentence:
        index += 1
        print("Question " + str(index) + ": No more questions in this article...")
Code Example #4
File: advanced.py Project: hajoki/EliseMichon_HW3
def dependencies():
    #english_parser = StanfordParser('stanford-parser.jar', 'stanford-parser-3.6.0-models.jar')
    #english_parser.raw_parse_sents(("this is the english parser test", "the parser is from stanford parser"))                         
    # Note: model_path normally points at the englishPCFG.ser.gz grammar rather than the models jar itself.
    parser = StanfordParser(model_path=r"C:\Python27\stanford-parser-full-2015-12-09\stanford-parser-3.6.0-models.jar")
    sentences = parser.raw_parse_sents(("IBlood B cells secrete PROTX1  ( s )   upon stimulation via the PROTX2.", "Furthermore ,  blocking PROTX0 or PROTX0 had no effect on the levels of PROTX2 released in response to the anti -  PROTX1 mAb."))
    print(sentences)

    # GUI
    for line in sentences:
        for sentence in line:
            sentence.draw()
Code Example #5
File: NLTK.py Project: Jashgada/Summer-Research-2019
def parseComment(comment):
    parser = StanfordParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    sentenceList = sent_tokenize(comment['body'])
    try:
        parsedList = list(parser.raw_parse_sents(sentenceList))
    except:
        print("Could not parse comment", end=' ')
        print(comment['comment_id'])
    else:
        parsedstring = ''.join(
            [' '.join([str(c) for c in lst]) for lst in parsedList])
        isQuest = questionIdentification(parsedstring)
        return isQuest
Code Example #6
File: ProtoFile.py Project: daivikswarup/WLP-Parser
    def __gen_parse_trees(self):
        p_cache = os.path.join(cfg.PARSE_PICKLE_DIR, self.protocol_name + '.p')
        try:
            parse_trees = pickle.load(open(p_cache, 'rb'))

        except(pickle.UnpicklingError, EOFError, FileNotFoundError):

            parser = StanfordParser(path_to_jar=feat_cfg.STANFORD_PARSER_JAR,
                                    path_to_models_jar=feat_cfg.STANFORD_PARSER_MODEL_JAR,
                                    java_options="-mx3000m")
            temp_trees = list(parser.raw_parse_sents(self.lines[1:]))
            parse_trees = [next(trees) for trees in temp_trees]
            os.makedirs(os.path.dirname(p_cache), exist_ok=True)
            pickle.dump(parse_trees, open(p_cache, 'wb'))

        return parse_trees
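The caching idea above generalizes well, since each call to the Stanford parser launches a JVM and can take minutes on a long document (as Example #3 warns). A rough helper following the same parse-once-then-pickle pattern; the function name and arguments are illustrative, not part of the project above, and it assumes the jar locations are supplied via parser_kwargs or the STANFORD_PARSER/STANFORD_MODELS environment variables:

import os
import pickle
from nltk.parse.stanford import StanfordParser

def parse_with_cache(sentences, cache_path, parser_kwargs=None):
    """Return one best-parse Tree per sentence, reusing a pickle cache when present."""
    try:
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        pass  # no usable cache: parse from scratch below
    parser = StanfordParser(**(parser_kwargs or {}))
    trees = [next(tree_iter) for tree_iter in parser.raw_parse_sents(sentences)]
    os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
    with open(cache_path, 'wb') as f:
        pickle.dump(trees, f)
    return trees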
Code Example #7
def sdfprocess(rvdata, partidx):
    parser=StanfordParser(path_to_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar', path_to_models_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar', model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', java_options='-mx15000m')
    sdfdata=[]
    cnn = 1
    for eg in rvdata:
        if cnn%100 == 0: print "%f%% of document %d finished" % (cnn*100*1.0/len(rvdata), partidx+1)
        cmt = eg[3].decode('utf-8') #3 is the idx of comment
        sentences = nltk.sent_tokenize(cmt)
        sdfparsed = parser.raw_parse_sents(sentences)
        sdfdata.append(eg[:3]+[sdfparsed])
        # print cnn
        pprint(sdfparsed[2])
        # print sdfdata
        cnn += 1        
        if cnn > 5: break
    return sdfdata
Code Example #8
File: advanced.py Project: hajoki/EliseMichon_HW3
def dependencies():
    #english_parser = StanfordParser('stanford-parser.jar', 'stanford-parser-3.6.0-models.jar')
    #english_parser.raw_parse_sents(("this is the english parser test", "the parser is from stanford parser"))
    # Note: model_path normally points at the englishPCFG.ser.gz grammar rather than the models jar itself.
    parser = StanfordParser(
        model_path=
        r"C:\Python27\stanford-parser-full-2015-12-09\stanford-parser-3.6.0-models.jar"
    )
    sentences = parser.raw_parse_sents((
        "IBlood B cells secrete PROTX1  ( s )   upon stimulation via the PROTX2.",
        "Furthermore ,  blocking PROTX0 or PROTX0 had no effect on the levels of PROTX2 released in response to the anti -  PROTX1 mAb."
    ))
    print(sentences)

    # GUI
    for line in sentences:
        for sentence in line:
            sentence.draw()
Code Example #9
def get_stanford_nounphrases(sentences):
    global parser
    if not parser:
        print('Instantiate stanford parser...')
        parser = StanfordParser('./utils/stanford-parser.jar',
                                './utils/stanford-parser-3.6.0-models.jar')
    sents = list(map(lambda s: s.sent, sentences))
    trees = list(parser.raw_parse_sents(sents))
    noun_phrases = set()
    for tree in trees:
        tree = list(tree)[0]
        # print(tree)
        for subtree in tree.subtrees():
            if subtree.label() == 'NP':
                phrase = ' '.join(subtree.leaves())
                noun_phrases.add(phrase)
    return list(noun_phrases)
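The NP collection above can also be written with the filter argument of Tree.subtrees, which avoids checking the label inside the loop. A sketch under the same assumptions as the example; the jar locations under ./utils/ are placeholders, and the function name is illustrative:

from nltk.parse.stanford import StanfordParser

def stanford_noun_phrases(sentences,
                          jar='./utils/stanford-parser.jar',
                          models_jar='./utils/stanford-parser-3.6.0-models.jar'):
    """Collect the distinct NP strings from the best parse of each sentence."""
    parser = StanfordParser(jar, models_jar)
    noun_phrases = set()
    for tree_iter in parser.raw_parse_sents(sentences):
        tree = next(tree_iter)
        for np in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            noun_phrases.add(' '.join(np.leaves()))
    return sorted(noun_phrases)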
Code Example #10
File: sdfpreprocess.py Project: cosmozhang/satire
def sdfprocess(tp, path, filenamels, docid):
    parser=StanfordParser(path_to_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar', path_to_models_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar', model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', java_options='-mx5000m')
    sdfdata = []
    for i in range(len(filenamels)):
        if (i+1)%100 == 0: print "%f%% of document %d of %s finished" % ((i+1)*100*1.0/len(filenamels), docid, tp) 
        filename = filenamels[i]
        h = open(path + filename, 'r')
        lines = h.readlines()
        h.close()
        headraw, bodyraw = preprocess(lines[0]), preprocess(lines[1])

        sentences = [headraw] + nltk.sent_tokenize(bodyraw)
        sdfparsed = parser.raw_parse_sents(sentences)
        sdfdata.append(sdfparsed)
        # print sdfparsed
        # print sdfdata      
        # if i > 5: break
    return sdfdata
Code Example #11
class SentenceCompress:
    def __init__(self,
                 omega=0.001,
                 alpha=20,
                 beta=100,
                 path_to_jar=None,
                 path_to_models_jar=None,
                 word_bank=None):
        """ Initialize syntactic parser and parameters for word significance and
			desired sentence length."""
        self.parser = StanfordParser(path_to_jar=path_to_jar,
                                     path_to_models_jar=path_to_models_jar)
        self.omega = omega  # Proper noun importance
        self.alpha = alpha  # min sentence length in characters
        self.beta = beta  # max sentence length in characters
        self.parsed_sentences = None
        self.word_bank = word_bank

    def syntax_parse(self, sentences):
        """ Take list of Sentence objects and get syntactic parse trees. """
        self.parsed_sentences = self.parser.raw_parse_sents(
            [s.sentence for s in sentences])  # only testing w/ first 10

    def compress(self):
        """ Apply rules in set 0 and set 1. """
        compressed_sentences = []
        for list_iter in self.parsed_sentences:
            for t in list_iter:
                original = self.tree_to_sentence(t)
                # print('ORIGINAL')
                # print(t)
                # print(original)
                min_len = self.min_length(original)
                max_len = self.max_length(original)
                if len(original) >= min_len:
                    self.set_0(
                        t
                    )  # probably not good that this relies on side effects
                    t = self.set_1(t, max_len, min_len)
                    s = self.tree_to_sentence(
                        t
                    )  # could check if this is above min and desired max length
                    compressed_sentences.append(s)
                    # print('TRIMMED')
                    # print(t)
                    # print(s)
        return compressed_sentences

    # input might be Ji hann's word class, which might include POS tag, named entity, that sort of thing
    # this should be a word object
    def word_significance(self, w):  # I_j(w_i)
        if w in self.word_bank.word_dict:
            word = self.word_bank.word_dict[w]
            if w[0].islower():  # for now. should be if common noun
                return word.tf * word.idf  # tf_ij x idf_i if w_i is verb or common noun
            elif w[0].isupper():  # for now. should if proper noun
                return word.tf * word.idf + self.omega  # tf_ij x idf_i + omega if w_i is proper noun
        return 0  # 0 otherwise

    def information_density_measurement(self):
        # TODO: implement (if needed)
        pass

    def min_length(self, sentence):
        """ Desired minimum length of sentence. """
        return min(len(sentence), self.alpha)

    def max_length(self, sentence):
        """ Desired maximum length of sentence, depending on length of original sentence. """
        orig_length = len(sentence)
        if orig_length > self.beta:
            return self.beta + sqrt(orig_length - self.beta)
        return orig_length

    def traverse_tree_set_0(self, tree, phrases):
        """ Trim elements matching phrase types in 'phrases'.
			Should this be iterative? """
        clause_sig = 0
        for index, node in enumerate(tree):  # iterate backwards?
            if type(node) == Tree:
                # can I immediately ignore some clauses
                sig = self.traverse_tree_set_0(
                    node, phrases
                )  # if subtree is too significant, don't remove. But what is too significant?
                # assign importance to clause, based on returned importance and importance of clause types
                clause_sig += sig
                # remove adverbs, parenthetical statements, and fragments
                if clause_sig < 0.01 and node.label(
                ) in phrases:  # should check that adverb is not negative
                    tree[index] = None
                elif clause_sig >= 0.01 and node.label() in phrases:
                    print("not getting rid of: ", self.tree_to_sentence(node))
            else:  # word string
                # return word_significance
                word_sig = self.word_significance(
                    node
                )  # I need to have a fast way of looking up word object
                if word_sig > self.omega:
                    clause_sig += word_sig
        return clause_sig

    def set_0(self, tree):
        """ Get rid of clauses that very likely arne't important. No need for iteration. """
        phrases = ['ADVP', 'PRN', 'FRAG', 'INTJ']
        self.traverse_tree_set_0(tree, phrases)

    def set_1_find_xp_levels(self, tree, decl_clause, level, found_xp):
        """ Get number of levels of outermost XP pattern.
			Pattern is [XP [XP ...] ... ] where XP is NP, VP, or S. """
        max_levels = level
        for index, node in enumerate(tree):
            if type(node) == Tree:
                if index == 0 and node.label() == decl_clause:
                    found_xp = True
                    levels = self.set_1_find_xp_levels(node, decl_clause,
                                                       level + 1, found_xp)
                    max_levels = max(levels, max_levels)
                elif not found_xp:  # shouldn't traverse if found outer level XP pattern already. just return max levels
                    levels = self.set_1_find_xp_levels(node, decl_clause,
                                                       level, found_xp)
        return max_levels

    def set_1_remove_outer_xp(self, tree, decl_clause):
        """ remove outermost tree in XP pattern. Find first subtree of type decl_clause and return.
			Iterate left to right, because if there's multiple options then return the leftmost subtree.
		"""
        for index, node in enumerate(tree):
            if type(node) == Tree:
                if node.label() == decl_clause:
                    # remove outer S by returning child.
                    for index2, child_node in enumerate(node):
                        if type(child_node) == Tree and child_node.label(
                        ) == decl_clause:
                            return node[index2]
                    # return tree[index,0] # not necessarily at 0 index...
                else:  # keep going down the tree...
                    subtree = self.set_1_remove_outer_xp(node, decl_clause)
                    if subtree is not None:
                        return subtree
        return None  # return self? idk

    def set_1_trailing(self, tree, phrase_type):
        """ Get rid of first trailing (deepest rightmost) PP or SBAR. Iteration is reversed so
			rightmost elements will be looked at first. """
        for index, node in reversed(list(enumerate(tree))):
            if type(node) == Tree:
                if index == len(tree) - 1 and node.label() == phrase_type:
                    tree[index] = None
                    return True
                else:
                    found = self.set_1_trailing(node, phrase_type)
                    if found:
                        return True
        return False

    def set_1(self, tree, max_len, min_len):
        """ Iteratively remove clauses and phrases in an attempt to reduce sentence to
			less than max_len. """
        XPs = ['S', 'NP', 'VP']
        for clause in XPs:
            current_sentence_len = len(self.tree_to_sentence(tree))
            if (current_sentence_len < max_len):
                break
            levels = self.set_1_find_xp_levels(tree, clause, 0, False)
            while levels > 1:
                current_sentence_len = len(self.tree_to_sentence(tree))
                if (current_sentence_len < max_len):
                    break
                tree = self.set_1_remove_outer_xp(tree, clause)
                levels = self.set_1_find_xp_levels(tree, clause, 0, False)
        trailing = ['PP', 'SBAR']
        for phrase in trailing:
            current_sentence_len = len(self.tree_to_sentence(tree))
            if (current_sentence_len < max_len):
                break
            self.set_1_trailing(tree, phrase)
        return tree

    def tree_to_sentence_helper(self, tree, sentence_str):
        """ Recursive helper to convert nltk tree, which may have nodes with value 'None',
			to sentence """
        for index, node in enumerate(tree):
            if type(node) == Tree:
                sentence_str = self.tree_to_sentence_helper(node, sentence_str)
            elif node != None:
                if node[0] in string.punctuation:
                    return sentence_str + node
                else:
                    return sentence_str + ' ' + node
        return sentence_str

    def tree_to_sentence(self, tree):
        """ convert nltk tree, which may have nodes with value 'None', to sentence. """
        s = self.tree_to_sentence_helper(tree, '').strip()
        if len(s) == 0:
            return s
        if s[0] in string.punctuation:
            s = s.lstrip(string.punctuation)
        if len(s) == 0:
            return s
        if s[0].islower():
            s = s[0].upper() + s[1:]
        if s[-1] not in string.punctuation:
            s = s + '.'
        return s
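For reference, the scoring rule inside word_significance is tf x idf for verbs and common nouns and tf x idf + omega for proper nouns, with omega the small proper-noun bonus set in the constructor. A toy, self-contained illustration of just that rule, using a made-up WordStats record in place of the class's word bank:

from collections import namedtuple

WordStats = namedtuple('WordStats', ['tf', 'idf'])

def word_significance(word, word_dict, omega=0.001):
    """tf*idf for lowercase (common) words, tf*idf + omega for capitalized (proper) words."""
    if word not in word_dict:
        return 0.0
    stats = word_dict[word]
    score = stats.tf * stats.idf
    return score + omega if word[0].isupper() else score

word_dict = {'stanford': WordStats(tf=2, idf=1.5), 'Stanford': WordStats(tf=1, idf=2.0)}
print(word_significance('stanford', word_dict))   # 3.0
print(word_significance('Stanford', word_dict))   # 2.001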
Code Example #12
File: nlp.py Project: elainemartin/text-analytics
from nltk.tokenize import sent_tokenize
from nltk.tag.stanford import NERTagger
from nltk.parse.stanford import StanfordParser
from corenlp import StanfordCoreNLP

wsj = open('wsj_0063.txt')

#extract named entities
nerTagger=NERTagger('stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz', 'stanford-ner-2014-08-27/stanford-ner.jar')
ner = []
for line in wsj:
	ner.append(nerTagger.tag(unicode(line,errors='ignore').split()))

#parse sentences
wsj.seek(0)  # rewind: the NER loop above has already exhausted the file iterator
paragraph = ""
for line in wsj:
	paragraph += line.replace('\n',' ')
sentences = sent_tokenize(paragraph)
parser = StanfordParser('stanford-parser-full-2014-10-31/stanford-parser.jar','stanford-parser-full-2014-10-31/stanford-parser-3.5.0-models.jar')
parsed = parser.raw_parse_sents(sentences)

#coreference
corenlp_dir = "stanford-corenlp-full-2014-08-27"
corenlp = StanfordCoreNLP(corenlp_dir)
corenlp.batch_parse(paragraph)

wsj.close()
Code Example #13
    'D:\\SPJAIN\\NLP\\stanford-postagger-full-2017-06-09\\stanford-postagger-full-2017-06-09\\stanford-postagger.jar'
)
english_postagger.tag(
    'this is stanford postagger in nltk for python users'.split())

#Parser installation

#import nltk.tag.stanford
from nltk.parse.stanford import StanfordParser
from nltk import *

english_parser = StanfordParser(
    'D:\\SPJAIN\\NLP\\stanford-parser-full-2017-06-09\\stanford-parser-full-2017-06-09\\stanford-parser.jar',
    'D:\\SPJAIN\\NLP\\stanford-parser-full-2017-06-09\\stanford-parser-full-2017-06-09\\stanford-parser-3.8.0-models.jar'
)
sentences = english_parser.raw_parse_sents(
    ('this is the english parser test', 'the parser is from stanford parser'))

for myListiterator in sentences:
    for t in myListiterator:
        print(t)

english_parser = StanfordParser(
    'D:\\SPJAIN\\NLP\\stanford-parser-full-2017-06-09\\stanford-parser-full-2017-06-09\\stanford-parser.jar',
    'D:\\SPJAIN\\NLP\\stanford-parser-full-2017-06-09\\stanford-parser-full-2017-06-09\\stanford-parser-3.8.0-models.jar'
)
sentences = english_parser.raw_parse_sents(
    ('I am Debjyoti Das and I am studying in SpJAIN',
     'the parser is from stanford parser'))

for myListiterator in sentences:
    for t in myListiterator:
        print(t)
Code Example #14
class SyntacticExtractor(SentenceExtractor):
    """ Tries to split sentences into sub-sentences so that each of them
        contains only one LU
    """

    splitter = None
    parser = None
    token_to_lemma = None
    all_verbs = None

    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)
        self.parser = StanfordParser(
            path_to_jar='dev/stanford-corenlp-3.6.0.jar',
            path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
            java_options=' -mx2G -Djava.ext.dirs=dev/')

        self.token_to_lemma = {}
        for lemma, tokens in self.lemma_to_token.iteritems():
            for t in tokens:
                self.token_to_lemma[t] = lemma
        self.all_verbs = set(self.token_to_lemma.keys())

    def extract_from_item(self, item):
        extracted = []
        bio = item.get(self.document_key, '').lower()
        url = item.get('url')
        if not bio or not url:
            logger.warn('skipping item without url or bio')
            return

        try:
            roots = self.parser.raw_parse_sents(self.splitter.split(bio))
        except (OSError, UnicodeDecodeError):
            logger.exception('cannot parse biography, skipping')
            return

        for root in roots:
            root = root.next()
            try:
                sub_sents = self.find_sub_sentences(root)
            except:
                logger.exception('cannot find sub-sentences')
                continue

            for sub in sub_sents:
                try:
                    text = ' '.join(chunk
                                    for _, chunk in self.find_terminals(sub))
                    logger.debug('processing text ' + text)
                    verbs = set(chunk
                                for _, chunk in self.find_terminals(sub, 'V'))
                except:
                    logger.exception('cannot extract verbs or parse sentence')
                    continue

                found = verbs.intersection(self.all_verbs)

                if len(found) == 0:
                    logger.debug('No matching verbs found in sub sentence')
                elif len(found) == 1:
                    extracted.append({
                        'lu': self.token_to_lemma[found.pop()],
                        'text': text,
                        'url': url,
                    })
                else:
                    logger.debug(
                        'More than one matching verbs found in sentence %s: %s',
                        text, repr(found))

        if extracted:
            logger.debug("%d sentences extracted...", len(extracted))
            return item, extracted
        else:
            logger.debug("No sentences extracted. Skipping the whole item ...")

    def find_sub_sentences(self, tree):
        # sub-sentences are the lowest S nodes in the parse tree
        if not isinstance(tree, Tree):
            return []

        s = reduce(lambda x, y: x + y, map(self.find_sub_sentences,
                                           iter(tree)), [])
        if tree.label() == 'S':
            return s or [tree]
        else:
            return s

    def find_terminals(self, tree, label=None):
        # finds all terminals in the tree with the given label prefix
        if len(tree) == 1 and not isinstance(tree[0], Tree):
            if label is None or tree.label().startswith(label):
                yield (tree.label(), tree[0])
        else:
            for child in tree:
                for each in self.find_terminals(child, label):
                    yield each
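The find_sub_sentences method above keeps the lowest S nodes of the parse. The same idea can be exercised on a hand-written tree without running the parser at all; the helper name below is illustrative, not part of the project:

from nltk.tree import Tree

def lowest_s_nodes(tree):
    """Return the S subtrees that contain no further S subtree (the 'lowest' ones)."""
    if not isinstance(tree, Tree):
        return []
    below = [s for child in tree for s in lowest_s_nodes(child)]
    if tree.label() == 'S':
        return below or [tree]
    return below

t = Tree.fromstring("(ROOT (S (S (NP (PRP He)) (VP (VBD left))) (CC and) (S (NP (PRP she)) (VP (VBD stayed)))))")
for s in lowest_s_nodes(t):
    print(' '.join(s.leaves()))   # "He left", then "she stayed"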
Code Example #15
class TextProcessing:


	def __init__(self):
		
		# print "Inside ntlk util"
		self.constituent_parse_tree = StanfordParser()
		self.stanford_dependency = StanfordDependencyParser()
		self.lemma = WordNetLemmatizer()
		self.home = '/home/ramesh/Documents/mas_course/second_semester/rnd/rnd_submission_cd'
		self.ner = StanfordNERTagger(self.home + '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',self.home + '/stanford-ner-2017-06-09/stanford-ner.jar')
		self.pos_tag = StanfordPOSTagger(self.home + '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger',self.home + '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar')
		self.CharacterOffsetEnd = 0 
		self.CharacterOffsetBegin = 0
		self.contractions = {"'nt":"not", "'ll": " will", "'re":"are", "'ve":"have", "'m":"am"}
		

	'''
	Input: sentence
	Returns: 
	'''


	def parser(self,sentence):


		# self.parseResult = {'parseTree':[], 'text':[], 'dependencies':[],'words':[] }
		self.parseResult = {'text':[], 'dependencies':[],'words':[] }

		# sentence = re.sub(r'\..', '.', sentence)

		parseText, sentences = self.getParseText(sentence)
		# print "sentences ", sentences
		# if source/target sent consist of 1 sentence 
		if len(sentences) == 1:
			return parseText
		
		wordOffSet = 0 # offset is number of words in first sentence 

		# if source/target sentence has more than 1 sentence

		for i in xrange(len(parseText['text'])):
			if i > 0:

				for j in xrange(len(parseText['dependencies'][i])):
					# [root, Root-0, dead-4]
					for k in xrange(1,3):
						tokens = parseText['dependencies'][i][j][k].split('-')

						if tokens[0] == 'Root':
							newWordIndex = 0

						else:
							if not tokens[len(tokens)-1].isdigit():
								continue 

							newWordIndex = int(tokens[len(tokens)-1]) + wordOffSet

						if len(tokens) == 2:
							parseText['dependencies'][i][j][k] = tokens[0] + '-'

							#original one
							# parseText['dependencies'][i][j][k] = tokens[0]+ '-' + str(newWordIndex)
												
						else:
							w = ''
							for l in xrange(len(tokens)-1):
								w += tokens[l]
								if l<len(tokens)-2:
									w += '-'

							parseText['dependencies'][i][j][k] = w + '-'
							#original one
							# parseText['dependencies'][i][j][k] = w + '-' + str(newWordIndex)

			wordOffSet += len(parseText['words'][i])


		return parseText


	'''
	Using Stanford POS Tagger
	Input: parserResult 
	Returns: [[charBegin,charEnd], wordIndex(starts from 1), word, word_POS]] 
	'''


	def combine_lemmaAndPosTags(self,parserResult):

		res = []
		
		wordIndex = 1
		for i in xrange(len(parserResult['words'])):
			
			for j in xrange(len(parserResult['words'][i])):
				
				tag = [[parserResult['words'][i][j][1]['CharacterOffsetBegin'], \
					parserResult['words'][i][j][1]['CharacterOffsetEnd']], \
					wordIndex,parserResult['words'][i][j][0], \
					parserResult['words'][i][j][1]['Lemma'], \
						parserResult['words'][i][j][1]['PartOfSpeech'] ]
				wordIndex += 1
				res.append(tag)

		return res


	'''
	Input: parserResult
	Returns: ([charOffsetBegin,charOffsetEnd], wordindex,word, NER ])
	'''


	def nerWordAnnotator(self,parserResult):

		res = []
		
		wordIndex = 1
		for i in xrange(len(parserResult['words'])):
			
			for j in xrange(len(parserResult['words'][i])):
				
				tag = [ [parserResult['words'][i][j][1]['CharacterOffsetBegin'], parserResult['words'][i][j][1]['CharacterOffsetEnd']], wordIndex,parserResult['words'][i][j][0] ,parserResult['words'][i][j][1]['NamedEntityTag'] ]
				# print "tag ", tag
				wordIndex += 1
				# if there is valid named entity then add in list
				if tag[3] != 'O':

					res.append(tag)

		return res


	'''
	Input : ParserResult
	Returns : list containing NamedEntites
	1. Group words in same list if they share same NE (Location), 
    2. Save other words in list that have any entity
	'''


	def get_ner(self,parserResult):


		nerWordAnnotations = self.nerWordAnnotator(parserResult) #[[ [charbegin,charEnd], wordIndex, word, NE ]]
		namedEntities = []
		currentWord = []
		currentCharacterOffSets = []
		currentWordOffSets = []

		for i in xrange(len(nerWordAnnotations)):

			if i == 0:

				currentWord.append(nerWordAnnotations[i][2]) # word having NE
				currentCharacterOffSets.append(nerWordAnnotations[i][0]) # [begin,end]
				currentWordOffSets.append(nerWordAnnotations[i][1]) # Word Index
				# if there is only one ner Word tag
				if (len(nerWordAnnotations) == 1):
					namedEntities.append([ currentCharacterOffSets, currentWordOffSets, \
						currentWord, nerWordAnnotations[i-1][3] ])
					# print "named Entities ", namedEntities
					break 
				continue
			# if consecutive tags have same NER Tag, save them in one list
			if nerWordAnnotations[i][3] == nerWordAnnotations[i-1][3] and \
					nerWordAnnotations[i][1] == nerWordAnnotations[i-1][1] + 1:
				
				currentWord.append(nerWordAnnotations[i][2]) # word having NE
				currentCharacterOffSets.append(nerWordAnnotations[i][0]) # [begin,end]
				currentWordOffSets.append(nerWordAnnotations[i][1]) # Word Index

				if i == (len(nerWordAnnotations) - 1):
					namedEntities.append([ currentCharacterOffSets, \
						currentWordOffSets, currentWord, nerWordAnnotations[i][3] ])
			# if consecutive tags do not match
			else:

				namedEntities.append([ currentCharacterOffSets, \
						currentWordOffSets, currentWord, nerWordAnnotations[i-1][3] ])
				currentWord = [nerWordAnnotations[i][2]]
				# remove everything from currentCharacterOffSets and currentWordOffSets
				currentCharacterOffSets = []
				currentWordOffSets = []
				# add charac offsets and currentWordOffSets of current word
				currentCharacterOffSets.append(nerWordAnnotations[i][0])
				currentWordOffSets.append(nerWordAnnotations[i][1])

				# if it is last iteration then update named Entities
				if i == len(nerWordAnnotations)-1:
					namedEntities.append([ currentCharacterOffSets, currentWordOffSets, \
							currentWord, nerWordAnnotations[i][3] ])
		#sort out according to len of characters in ascending order
		namedEntities = sorted(namedEntities, key=len)

		return namedEntities


	'''
	Input: Word(Word whose NE is not found), NE(word already have NE Tag) 
	Returns: Boolean; True if word is acronym
					False if word is not acronym
	'''


	def is_Acronym(self,word,NE):


		queryWord = word.replace('.','')
		# If all words of queryWord is not capital or length of word != 
				#length of NE(word already have NE Tag) or 
		   #  if word is 'a' or 'i' 
		if not queryWord.isupper() or len(queryWord) != len(NE) or queryWord.lower() in ['a', 'i']:
			return False

		acronym = True

		#we run for loop till length of query word(i.e 3)(if word is 'UAE')
		#Compare 1st letter(U) of query word with first letter of first element in named entity(U = U(united))
		# again we take second letter of canonical word (A) with second element in named entity(Arab)
		# and so on 
		for i in xrange(len(queryWord)):
			# print "queryword[i], NE ", queryWord, NE
			if queryWord[i] != NE[i][0]:
				acronym = False
				break

		return acronym


	'''
	Input: sentence
	Returns: parse(	{ParseTree, text, Dependencies, 
	  'word : [] NamedEntityTag, CharacterOffsetEnd, 
	  		CharacterOffsetBegin, PartOfSpeech, Lemma}']}) 
	  		sentence and
	'''


	def getParseText(self,sentence):

		self.count = 0
		self.length_of_sentence = [] # stores length of each sentence
		
		sentence = re.sub(r'([a-z]\.)([\d])', r'\1 \2', sentence)
		sentence = re.sub(r'(\)\.)([\d])', r'\1 \2', sentence)
		sentence = re.sub(r'([\d]\.)([\d])', r'\1 \2', sentence)
		sentence = re.sub(r'([a-z]\.)([A-Z])', r'\1 \2', sentence)
		sentence = re.sub(r'(\.)([A-Z]|[a-z])', r'\1 \2', sentence)
		sentence = re.sub(r'([*]|[+]|[-]|[=])([A-Z]|[a-z])', r'\1 \2', sentence)
		sentence = re.sub(r'([A-Z]|[a-z])([*]|[+]|[-]|[=])', r'\1 \2', sentence)
		sentence = re.sub(r'([*]|[+]|[-]|[=])([\d])', r'\1 \2', sentence)
		sentence = re.sub(r'([\d])([*]|[+]|[-]|[=])', r'\1 \2', sentence)

		if '[' in sentence:
			sentence = sentence.replace('[', ' [ ')

		if ']' in sentence:
			sentence = sentence.replace(']', ' ] ')

		if '/' in sentence:
			sentence = sentence.replace('/' , ' / ')

		if '//' in sentence:
			sentence = sentence.replace('//' , ' // ')

		if '{' in sentence:
			# print "came {"
			sentence = sentence.replace('{', ' { ')

		if '}' in sentence:
			# print "came }"
			sentence = sentence.replace('}', ' } ')

		if '(' in sentence:
			sentence = sentence.replace('(', ' ( ')

		if ')' in sentence:
			sentence = sentence.replace(')', ' ) ')

		if '$' in sentence:

			sentence = sentence.replace('$','')

		if '\\' in sentence:
			sentence = sentence.replace('\\',' ')

		if '|' in sentence:
			sentence = sentence.replace('|',' ')

		if 'times' in sentence:
			sentence = sentence.replace('times','x')

		if 'lambda' in sentence:
			sentence = sentence.replace('lambda', ' lambda ')



		tokenized_sentence = sent_tokenize(sentence)
		# print "len of tokenized ",len(tokenized_sentence)
		if (len(tokenized_sentence) == 1):
			self.count += 1
			for i in tokenized_sentence:
				parse = self.getCombineWordsParam(i)
		else:
			tmp = 0
			for i in tokenized_sentence:
				self.count += 1
				parse = self.getCombineWordsParam(i)
				s = len(i) + tmp
				self.length_of_sentence.append(s)
				tmp = s

		return parse,tokenized_sentence
		

	'''
	Input: sentences
    Return: constituency tree that represents relations between sub-phrases in sentences
    Not using for ASAG
	'''


	def getConstituencyTree(self, sentence):
		
		sentence = sent_tokenize(sentence)
		constituency_parser = self.constituent_parse_tree.raw_parse_sents(sentence)
		for parser in constituency_parser:
			for sent in parser:
				tree = str(sent)
		parse_string = ' '.join(str(tree).split()) 
        
		return parse_string


	'''
	Input: sentence
	returns: relation between words with their index
	'''	


	def getDependencies(self, sentence):

		#if first letter of sentence is '-', take rest sentence except hyphen

		if '#' in sentence:
			sentence = sentence.replace('#','')

 
		dependency_tree = []
		dependency_parser = StanfordDependencyParser().raw_parse(sentence)
		token = word_tokenize(sentence)
		parsetree = list(StanfordDependencyParser().raw_parse(sentence))[0]
		# Find root(head) of the sentence 
		for k in parsetree.nodes.values():
			# print "k ", k
			if k["head"] == 0:

				dependency_tree.append([str(k["rel"]), "Root-", str(k["word"] + "-") 
					 ])	    	

		# Find relation between words in sentences
		for dep in dependency_parser:
			# print "dep ", dep.triples()
			for triple in dep.triples():
				
				dependency_tree.append([str(triple[1]),str(triple[0][0]) + "-" ,\
							 str(triple[2][0]) + "-"])


		return dependency_tree


	'''
	Input: sentence, word(of which offset to determine)
	Return: [CharacterOffsetEnd,CharacterOffsetBegin] for each word
	'''


	def getCharOffSet(self,sentence, word):

		CharacterOffsetBegin = sentence.find(word)
		CharacterOffsetEnd = CharacterOffsetBegin + len(word)
		
		return [CharacterOffsetEnd,CharacterOffsetBegin]


	'''
	Input: sentence
	Returns: dictionary: 
	{ParseTree, text, Dependencies, 
	  #'word : [] NamedEntityTag, CharacterOffsetEnd, CharacterOffsetBegin, PartOfSpeech, Lemma}']}
	'''


	def getCombineWordsParam(self, sentence):
		

		# print " tokenized sentence in nltkUtil", sentence

		if sentence[0] == '-':
			sentence = sentence.split('-', 1)[1]

		

		words_list = [] 
		tokenized_words = word_tokenize(sentence)
		# print "tokenized words ", tokenized_words
		sentence = []
		#expand contractions 
		for i in tokenized_words:
			if i in self.contractions:
				sentence.append(self.contractions[i])
			else:
				sentence.append(i)
		sentence = " ".join(sentence)
		
		tokenized_words = word_tokenize(sentence)

		posTag = self.pos_tag.tag(tokenized_words)
		# print "pos Tag ", posTag
		ner = self.ner.tag(tokenized_words)
		# print "ner ", ner
		# if source sentence/target sentence has one sentence
		if (self.count == 1):
			for i in xrange(len(tokenized_words)):
				word_lemma = str()
				word = tokenized_words[i]
				name_entity = ner[i]
				word_posTag = posTag[i][-1]  # access tuple [(United, NNP),..]
				# print "word and pos tag ", word, word_posTag[0]	
				#wordNet lemmatizer needs pos tag with words else it considers noun
				if (word_posTag[0] == 'V'):
					word_lemma = self.lemma.lemmatize(word.lower(), wordnet.VERB)

				elif (word_posTag[0] == 'J'):
					word_lemma = self.lemma.lemmatize(word.lower(), wordnet.ADJ)

				elif (word_posTag[0:1] == 'RB'):
					word_lemma = self.lemma.lemmatize(word.lower(), wordnet.ADV)

				else:
					if (word == 'I'):
						# doing this because stanford lemmatize of 'I' is not present in stopwords
						word_lemma = self.lemma.lemmatize(word)
					else:
						word_lemma = self.lemma.lemmatize(word.lower())

				self.CharacterOffsetEnd, self.CharacterOffsetBegin = self.getCharOffSet(sentence,word)
				words_list.append([word, {"NamedEntityTag" : str(name_entity[1]),
					"CharacterOffsetEnd" : str(self.CharacterOffsetEnd), "CharacterOffsetBegin" : str(self.CharacterOffsetBegin) 
					,"PartOfSpeech" : str(word_posTag) , "Lemma" : str(word_lemma)}])

			# self.parseResult['parseTree'] = [self.getConstituencyTree(sentence)]
			self.parseResult['text'] = [sentence]
			self.parseResult['dependencies'] = [self.getDependencies(sentence)]
			self.parseResult['words'] = [words_list]

		else:
			for i in xrange(len(tokenized_words)):
				word = tokenized_words[i]
				name_entity = ner[i] 
				word_posTag = posTag[i][-1]

				if (word_posTag[0] == 'V'):
					word_lemma = self.lemma.lemmatize(word.lower(), wordnet.VERB)

				elif (word_posTag[0] == 'J'):
					word_lemma = self.lemma.lemmatize(word.lower(), wordnet.ADJ)

				elif (word_posTag[0:1] == 'RB'):
					word_lemma = self.lemma.lemmatize(word.lower(), wordnet.ADV)

				else:
					if (word == 'I'):
						# doing this because stanford lemmatize of 'I' is not present in stopwords
						word_lemma = self.lemma.lemmatize(word)
					else:
						word_lemma = self.lemma.lemmatize(word.lower())

				end, begin = self.getCharOffSet(sentence,word)
				end = end + self.length_of_sentence[self.count-2] + 1
				begin = begin + self.length_of_sentence[self.count-2] + 1	
				words_list.append([word, {"NamedEntityTag" : str(name_entity[1]),
					"CharacterOffsetEnd" : str(end), "CharacterOffsetBegin" : str(begin) 
					,"PartOfSpeech" : str(word_posTag) , "Lemma" : str(word_lemma)}])
				
			self.parseResult['text'].append(sentence)
			self.parseResult['dependencies'].append(self.getDependencies(sentence))
			self.parseResult['words'].append(words_list)

		return self.parseResult
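The contraction handling inside getCombineWordsParam is a plain token-level substitution against the small map defined in __init__. A compact standalone equivalent (it requires the NLTK punkt tokenizer data); the mapping is adapted from the one above, keyed by the "n't"-style tokens that word_tokenize actually produces:

from nltk.tokenize import word_tokenize

CONTRACTIONS = {"n't": "not", "'ll": "will", "'re": "are", "'ve": "have", "'m": "am"}

def expand_contractions(sentence):
    """Re-join a sentence with common contraction tokens spelled out."""
    tokens = word_tokenize(sentence)
    return ' '.join(CONTRACTIONS.get(tok, tok) for tok in tokens)

print(expand_contractions("I'm sure they'll agree"))  # I am sure they will agree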
Code Example #16
# -*- coding: utf-8 -*-
"""
Created on 2018/6/24

@author: Samuel
@Desc: 
@dependence: Noting
"""
from nltk.parse.stanford import StanfordParser

english_parser = StanfordParser('stanford-parser.jar',
                                'stanford-parser-3.8.0-models.jar')
# raw_parse_sents expects an iterable of sentences, not a single string.
english_parser.raw_parse_sents(['This is the english parser test.'])
Code Example #17
class QuestionGenerator(object):
    def __init__(self):
        self.english_parser = StanfordParser(
            'StanfordCoreNLP/stanford-parser.jar',
            'StanfordCoreNLP/stanford-parser-3.4.1-models.jar')
        self.wordID = WordIdentity()

    """
    VB  Verb, base form
    VBD Verb, past tense
    VBG Verb, gerund or present participle
    VBN Verb, past participle
    VBP Verb, non-3rd person singular present
    VBZ Verb, 3rd person singular present
    MD model auxiliary
    have has had
    beingVerb
    How to prune?
    """
    """
    pruning
    """

    def generateYesNoQuestionNVN(self, sent):
        tree = self.english_parser.raw_parse_sents((sent, ))
        root = tree[0]
        NPTree, VPTree = self.splitNVNTree(root)
        rephrasedNPVPQ = self.rephraseQuestion(NPTree, VPTree)
        return rephrasedNPVPQ

    def rephraseQuestion(self, NPTree, VPTree):
        VP_POS = VPTree.pos()
        VPLeaves = VPTree.leaves()
        NPLeaves = NPTree.leaves()
        VPEnd = 0
        while is_verb(VP_POS[VPEnd][1]):
            VPEnd = VPEnd + 1
        if self.wordID.isBeingVerb(VP_POS[0][0]) or VP_POS[0][1] == 'MD':
            prefix = VP_POS[0][0]
            NP = ' '.join(NPLeaves)
            VP = ' '.join(VPLeaves[1:])
            return prefix + ' ' + NP + ' ' + VP
        elif VPEnd == 1:
            if VP_POS[0][1] == 'VBP':
                prefix = 'do'
            elif VP_POS[0][1] == 'VBD':
                prefix = 'did'
            elif VP_POS[0][1] == 'VBZ':
                prefix = 'does'
            NP = ' '.join(NPLeaves)
            VP1 = WordNetLemmatizer().lemmatize(VPLeaves[0], 'v')
            VP2 = ' '.join(VPLeaves[1:])
            return prefix + ' ' + NP + ' ' + VP1 + ' ' + VP2
        elif VPEnd > 1:
            prefix = VP_POS[0][0]
            NP = ' '.join(NPLeaves)
            VP = ' '.join(VPLeaves[1:])
            return prefix + ' ' + NP + ' ' + VP
        else:
            return "Not yet implement"

    def splitNVNTree(self, root):
        if len(root) == 1 and (root.label() == 'ROOT' or root.label() == 'S'):
            return self.splitNVNTree(root[0])
        else:
            if root[0].label() == 'NP' and root[1].label() == 'VP':
                return root[0], root[1]
            else:
                return None, None

    def getleftMostVBLabel(self, root):
        if root.label() == 'VP':
            return self.getleftMostVBLabel(root[0])
        else:
            return root.label()

    def testAsking(self):
        # q1 = "it is the country's principal political, cultural, commercial, industrial, and transportation centre, sometimes described as the primate city of Hungary"
        q1 = "the cat is sleeping"
        print 'question:', q1
        print 'answer:', self.generateYesNoQuestionNVN(q1)
Code Example #18
File: nltkUtil.py Project: ChenRIT/Essentia
class Text_processing:
    def __init__(self):

        # user need to download Stanford Parser, NER and POS tagger from stanford website
        self.constituent_parse_tree = StanfordParser(
        )  #user need to set as environment variable
        self.stanford_dependency = StanfordDependencyParser(
        )  #user need to set as environment variable
        self.lemma = WordNetLemmatizer()
        self.home = '/home/ramesh'
        #user needs to download stanford packages and change directory
        self.ner = StanfordNERTagger(
            self.home +
            '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
            self.home + '/stanford-ner-2017-06-09/stanford-ner.jar')
        self.pos_tag = StanfordPOSTagger(
            self.home +
            '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger',
            self.home +
            '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar')
        self.CharacterOffsetEnd = 0
        self.CharacterOffsetBegin = 0

    '''
	Input: sentence
	Returns: 
	'''

    def parser(self, sentence):

        self.parseResult = {
            'parseTree': [],
            'text': [],
            'dependencies': [],
            'words': []
        }
        parseText, sentences = self.get_parseText(sentence)
        # print "sentences ", sentences
        # if source/target sent consist of 1 sentence
        if len(sentences) == 1:
            return parseText

        wordOffSet = 0  # offset is number of words in first sentence

        # if source/target sentence has more than 1 sentence

        for i in xrange(len(parseText['text'])):
            if i > 0:

                for j in xrange(len(parseText['dependencies'][i])):
                    # [root, Root-0, dead-4]
                    for k in xrange(1, 3):
                        tokens = parseText['dependencies'][i][j][k].split('-')

                        if tokens[0] == 'Root':
                            newWordIndex = 0

                        else:
                            if not tokens[len(tokens) - 1].isdigit():
                                continue

                            newWordIndex = int(
                                tokens[len(tokens) - 1]) + wordOffSet

                        if len(tokens) == 2:

                            parseText['dependencies'][i][j][
                                k] = tokens[0] + '-' + str(newWordIndex)

                        else:
                            w = ''
                            for l in xrange(len(tokens) - 1):
                                w += tokens[l]
                                if l < len(tokens) - 2:
                                    w += '-'

                            parseText['dependencies'][i][j][k] = w + '-' + str(
                                newWordIndex)

            wordOffSet += len(parseText['words'][i])

        return parseText

    '''
	Input: parserResult 
	Returns: [[charBegin,charEnd], wordIndex(starts from 1), word, word_lemma]] 
	'''

    def get_lemma(self, parserResult):

        res = []
        wordIndex = 1
        for i in xrange(len(parserResult['words'])):

            for j in xrange(len(parserResult['words'][i])):

                tag = [[
                    parserResult['words'][i][j][1]['CharacterOffsetBegin'],
                    parserResult['words'][i][j][1]['CharacterOffsetEnd']
                ], wordIndex, parserResult['words'][i][j][0],
                       parserResult['words'][i][j][1]['Lemma']]
                wordIndex += 1
                res.append(tag)

        return res

    '''
	Using Stanford POS Tagger
	Input: parserResult 
	Returns: [[charBegin,charEnd], wordIndex(starts from 1), word, word_POS]] 
	'''

    def combine_lemmaAndPosTags(self, parserResult):

        res = []

        wordIndex = 1
        for i in xrange(len(parserResult['words'])):

            for j in xrange(len(parserResult['words'][i])):

                tag = [[
                    parserResult['words'][i][j][1]['CharacterOffsetBegin'],
                    parserResult['words'][i][j][1]['CharacterOffsetEnd']
                ], wordIndex, parserResult['words'][i][j][0],
                       parserResult['words'][i][j][1]['Lemma'],
                       parserResult['words'][i][j][1]['PartOfSpeech']]
                wordIndex += 1
                res.append(tag)

        return res

    '''
	Input: parserResult
	Returns: ([charOffsetBegin,charOffsetEnd], wordindex,word, NER ])
	'''

    def nerWordAnnotator(self, parserResult):

        res = []

        wordIndex = 1
        for i in xrange(len(parserResult['words'])):

            for j in xrange(len(parserResult['words'][i])):

                tag = [[
                    parserResult['words'][i][j][1]['CharacterOffsetBegin'],
                    parserResult['words'][i][j][1]['CharacterOffsetEnd']
                ], wordIndex, parserResult['words'][i][j][0],
                       parserResult['words'][i][j][1]['NamedEntityTag']]
                # print "tag ", tag
                wordIndex += 1
                # if there is valid named entity then add in list
                if tag[3] != 'O':

                    res.append(tag)

        return res

    '''
	Input : ParserResult
	Returns : list containing NamedEntites
	1. Group words in same list if they share same NE (Location), 
    2. Save other words in list that have any entity
	'''

    def get_ner(self, parserResult):

        nerWordAnnotations = self.nerWordAnnotator(
            parserResult)  #[[ [charbegin,charEnd], wordIndex, word, NE ]]
        namedEntities = []
        currentWord = []
        currentCharacterOffSets = []
        currentWordOffSets = []

        for i in xrange(len(nerWordAnnotations)):

            if i == 0:

                currentWord.append(nerWordAnnotations[i][2])  # word having NE
                currentCharacterOffSets.append(
                    nerWordAnnotations[i][0])  # [begin,end]
                currentWordOffSets.append(
                    nerWordAnnotations[i][1])  # Word Index
                # if there is only one ner Word tag
                if (len(nerWordAnnotations) == 1):
                    namedEntities.append([ currentCharacterOffSets, currentWordOffSets, \
                     currentWord, nerWordAnnotations[i-1][3] ])
                    # print "named Entities ", namedEntities
                    break
                continue
            # if consecutive tags have same NER Tag, save them in one list
            if nerWordAnnotations[i][3] == nerWordAnnotations[i-1][3] and \
              nerWordAnnotations[i][1] == nerWordAnnotations[i-1][1] + 1:

                currentWord.append(nerWordAnnotations[i][2])  # word having NE
                currentCharacterOffSets.append(
                    nerWordAnnotations[i][0])  # [begin,end]
                currentWordOffSets.append(
                    nerWordAnnotations[i][1])  # Word Index

                if i == (len(nerWordAnnotations) - 1):
                    namedEntities.append([ currentCharacterOffSets, \
                     currentWordOffSets, currentWord, nerWordAnnotations[i][3] ])
            # if consecutive tags do not match
            else:

                namedEntities.append([ currentCharacterOffSets, \
                  currentWordOffSets, currentWord, nerWordAnnotations[i-1][3] ])
                currentWord = [nerWordAnnotations[i][2]]
                # remove everything from currentCharacterOffSets and currentWordOffSets
                currentCharacterOffSets = []
                currentWordOffSets = []
                # add charac offsets and currentWordOffSets of current word
                currentCharacterOffSets.append(nerWordAnnotations[i][0])
                currentWordOffSets.append(nerWordAnnotations[i][1])

                # if it is last iteration then update named Entities
                if i == len(nerWordAnnotations) - 1:
                    namedEntities.append([ currentCharacterOffSets, currentWordOffSets, \
                      currentWord, nerWordAnnotations[i][3] ])
        #sort out according to len of characters in ascending order
        namedEntities = sorted(namedEntities, key=len)

        return namedEntities

    '''
	Input: Word(Word whose NE is not found), NE(word already have NE Tag) 
	Returns: Boolean; True if word is acronym
					False if word is not acronym
	'''

    def is_Acronym(self, word, NE):

        queryWord = word.replace('.', '')
        # If all words of queryWord is not capital or length of word !=
        #length of NE(word already have NE Tag) or
        #  if word is 'a' or 'i'
        if not queryWord.isupper() or len(queryWord) != len(
                NE) or queryWord.lower() in ['a', 'i']:
            return False

        acronym = True

        #we run for loop till length of query word(i.e 3)(if word is 'UAE')
        #Compare 1st letter(U) of query word with first letter of first element in named entity(U = U(united))
        # again we take second letter of canonical word (A) with second element in named entity(Arab)
        # and so on
        for i in xrange(len(queryWord)):
            # print "queryword[i], NE ", queryWord, NE
            if queryWord[i] != NE[i][0]:
                acronym = False
                break

        return acronym

    '''
	Input: sentence
	Returns: parse(	{ParseTree, text, Dependencies, 
	  'word : [] NamedEntityTag, CharacterOffsetEnd, 
	  		CharacterOffsetBegin, PartOfSpeech, Lemma}']}) 
	  		sentence and
	'''

    def get_parseText(self, sentence):

        self.count = 0
        self.length_of_sentence = []  # stores length of each sentence
        tokenized_sentence = sent_tokenize(sentence)
        # print "len of tokenized ",len(tokenized_sentence)
        if (len(tokenized_sentence) == 1):
            self.count += 1
            for i in tokenized_sentence:
                parse = self.get_combine_words_param(i)
        else:
            tmp = 0
            for i in tokenized_sentence:
                self.count += 1
                parse = self.get_combine_words_param(i)
                s = len(i) + tmp
                self.length_of_sentence.append(s)
                tmp = s

        return parse, tokenized_sentence

    '''
	Input: sentences
    Return: constituency tree that represents relations between sub-phrases in sentences
	'''

    def get_constituency_Tree(self, sentence):

        sentence = sent_tokenize(sentence)
        constituency_parser = self.constituent_parse_tree.raw_parse_sents(
            sentence)
        for parser in constituency_parser:
            for sent in parser:
                tree = str(sent)
        parse_string = ' '.join(str(tree).split())

        return parse_string

    '''
	Input: sentence
	returns: relation between words with their index
	'''

    def get_dependencies(self, sentence):

        dependency_tree = []
        dependency_parser = self.stanford_dependency.raw_parse(sentence)
        token = word_tokenize(sentence)
        parsetree = list(self.stanford_dependency.raw_parse(sentence))[0]
        # Find root(head) of the sentence
        for k in parsetree.nodes.values():
            if k["head"] == 0:

                dependency_tree.append([
                    str(k["rel"]), "Root-" + str(k["head"]),
                    str(k["word"]) + "-" + str(k["address"])
                ])
        # Find relation between words in sentence
        for dep in dependency_parser:
            for triple in dep.triples():
                index_word = token.index(
                    triple[0][0]) + 1  # because index starts from 0
                index2_word = token.index(triple[2][0]) + 1
                dependency_tree.append([str(triple[1]),str(triple[0][0]) + "-" + str(index_word),\
                    str(triple[2][0]) + "-" + str(index2_word)])

        return dependency_tree

    '''
	Input: sentence, word(of which offset to determine)
	Return: [CharacterOffsetEnd,CharacterOffsetBegin] for each word
	'''

    def get_charOffset(self, sentence, word):

        # word containing '.' causes problem in counting

        CharacterOffsetBegin = sentence.find(word)
        CharacterOffsetEnd = CharacterOffsetBegin + len(word)

        return [CharacterOffsetEnd, CharacterOffsetBegin]

    '''
	Input: sentence
	Returns: dictionary: 
	{ParseTree, text, Dependencies, 
	  #'word : [] NamedEntityTag, CharacterOffsetEnd, CharacterOffsetBegin, PartOfSpeech, Lemma}']}
	'''

    def get_combine_words_param(self, sentence):

        words_in_each_sentence = []
        words_list = []
        tokenized_words = word_tokenize(sentence)
        posTag = self.pos_tag.tag(tokenized_words)
        ner = self.ner.tag(tokenized_words)

        # if source sentence/target sentence has one sentence
        if (self.count == 1):
            for i in xrange(len(tokenized_words)):
                word_lemma = str()
                word = tokenized_words[i]
                name_entity = ner[i]
                word_posTag = posTag[i][-1]  # access tuple [(United, NNP),..]
                # print "word and pos tag ", word, word_posTag[0]
                #wordNet lemmatizer needs pos tag with words else it considers noun
                if (word_posTag[0] == 'V'):
                    word_lemma = self.lemma.lemmatize(tokenized_words[i],
                                                      wordnet.VERB)

                elif (word_posTag[0] == 'J'):
                    word_lemma = self.lemma.lemmatize(tokenized_words[i],
                                                      wordnet.ADJ)

                elif (word_posTag[0:2] == 'RB'):
                    word_lemma = self.lemma.lemmatize(tokenized_words[i],
                                                      wordnet.ADV)

                else:
                    word_lemma = self.lemma.lemmatize(tokenized_words[i])

                self.CharacterOffsetEnd, self.CharacterOffsetBegin = self.get_charOffset(
                    sentence, tokenized_words[i])

                words_list.append([
                    word, {
                        "NamedEntityTag": str(name_entity[1]),
                        "CharacterOffsetEnd": str(self.CharacterOffsetEnd),
                        "CharacterOffsetBegin": str(self.CharacterOffsetBegin),
                        "PartOfSpeech": str(word_posTag),
                        "Lemma": str(word_lemma)
                    }
                ])

            self.parseResult['parseTree'] = [
                self.get_constituency_Tree(sentence)
            ]
            self.parseResult['text'] = [sentence]
            self.parseResult['dependencies'] = [
                self.get_dependencies(sentence)
            ]
            self.parseResult['words'] = [words_list]

        else:

            for i in xrange(len(tokenized_words)):
                word = tokenized_words[i]
                name_entity = ner[i]
                word_posTag = posTag[i][-1]

                if (word_posTag[0] == 'V'):
                    word_lemma = self.lemma.lemmatize(tokenized_words[i],
                                                      wordnet.VERB)

                elif (word_posTag[0] == 'J'):
                    word_lemma = self.lemma.lemmatize(tokenized_words[i],
                                                      wordnet.ADJ)

                elif (word_posTag[0:2] == 'RB'):
                    word_lemma = self.lemma.lemmatize(tokenized_words[i],
                                                      wordnet.ADV)

                else:
                    word_lemma = self.lemma.lemmatize(tokenized_words[i])

                end, begin = self.get_charOffset(sentence, tokenized_words[i])
                end = end + self.length_of_sentence[self.count - 2] + 1
                begin = begin + self.length_of_sentence[self.count - 2] + 1
                words_list.append([
                    word, {
                        "NamedEntityTag": str(name_entity[1]),
                        "CharacterOffsetEnd": str(end),
                        "CharacterOffsetBegin": str(begin),
                        "PartOfSpeech": str(word_posTag),
                        "Lemma": str(word_lemma)
                    }
                ])
            self.parseResult['parseTree'].append(
                self.get_constituency_Tree(sentence))
            self.parseResult['text'].append(sentence)
            self.parseResult['dependencies'].append(
                self.get_dependencies(sentence))
            self.parseResult['words'].append(words_list)

        return self.parseResult
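
For orientation, here is a hedged sketch of what the combined result built above looks like for a single short sentence. The enclosing class and its constructor are not shown in this snippet, so the wrapper object below is assumed, and the exact tags depend on the loaded POS/NER/parser models:

# Assumed usage; 'wrapper' stands for an instance of the class defined above.
# result = wrapper.get_combine_words_param("John loves Mary.")
# result['text']          -> ["John loves Mary."]
# result['parseTree']     -> ["(ROOT (S (NP (NNP John)) (VP (VBZ loves) (NP (NNP Mary))) (. .)))"]
# result['dependencies']  -> [[['root', 'Root-0', 'loves-2'],
#                              ['nsubj', 'loves-2', 'John-1'],
#                              ['dobj', 'loves-2', 'Mary-3']]]
# result['words']         -> one list per sentence of
#                            [word, {NamedEntityTag, CharacterOffsetBegin,
#                                    CharacterOffsetEnd, PartOfSpeech, Lemma}]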
Code Example #19
0
__author__ = 'laceyliu'

parser_path = '/Users/laceyliu/Documents/workspace/WikiQA/stanford-parser-full'
which_java = '/Library/Java/JavaVirtualMachines/jdk1.8.0_11.jdk/Contents/HOME/bin/java'
import os
from nltk.parse.stanford import StanfordParser
os.environ['JAVAHOME'] = which_java
os.environ['CLASSPATH'] = parser_path
os.environ['STANFORD_MODELS'] = parser_path
sentence = "hello world"
sp = StanfordParser()

sentences = [
    'Clinton Drew \"Clint\" Dempsey (born March 9, 1983) is an American soccer player who plays for Tottenham Hotspur and the United States national team.',
    'Growing up in Nacogdoches, Texas, Dempsey played for one of the top youth soccer clubs in the state, the Dallas Texans, before playing for Furman University\'s men\'s soccer team. ',
    'In 2004, Dempsey was drafted by Major League Soccer club New England Revolution, where he quickly integrated himself into the starting lineup. ',
    'Hindered initially by a jaw injury, he would eventually score 25 goals in 71 appearances with the Revolution.',
    'Between 2007 and 2012, Dempsey played for Premier League team Fulham and is the club\'s highest Premier League goalscorer of all time.',
    'Dempsey first represented the United States at the 2003 FIFA World Youth Championship in the United Arab Emirates. He made his first appearance with the senior team on November 17, 2004, against Jamaica; he was then named to the squad for the 2006 World Cup and scored the team\'s only goal of the tournament. ',
    'In the 2010 FIFA World Cup, Dempsey scored against England, becoming the second American, after Brian McBride, to score goals in multiple World Cup tournaments.'
]

ss2 = []
for s in sentences:
    if s.count(' ') < 20 and s.count(' ') > 7:
        ss2.append(s.decode('utf-8').encode('ascii', 'ignore'))
trees = sp.raw_parse_sents(ss2)
for t in trees:
    print list(t)
Code Example #20
0
#     for c in node:
#         if isinstance(c, Tree):
#             for p in c.productions():
#                 p = str(p)
#                 if '\'' in p:
#                     p = p.replace('\'', '')
#                     if not p in lexicons:
#                         lexicons[p] = 0
#                     lexicons[p] += 1
#                     print("[%s]", p)
#                 else:
#                     if not p in productions:
#                         productions[p] = 0
#                     productions[p] += 1
#                     print("[%s]", p)
#             if c.height() > 2:
#                 collect_productions(c)
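# The snippet above is truncated: the imports, the counters, and the 'def' line
# of collect_productions are missing. A minimal runnable reconstruction, assumed
# from the commented-out body above and from how it is used below:
from nltk.parse.stanford import StanfordParser
from nltk.tree import Tree

productions = {}
lexicons = {}

def collect_productions(node):
    # Walk the parse tree and count lexical rules (which contain quoted
    # terminals) separately from non-lexical grammar productions.
    for c in node:
        if isinstance(c, Tree):
            for p in c.productions():
                p = str(p)
                if '\'' in p:
                    p = p.replace('\'', '')
                    lexicons[p] = lexicons.get(p, 0) + 1
                else:
                    productions[p] = productions.get(p, 0) + 1
            if c.height() > 2:
                collect_productions(c)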

parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
with open("../temp/examples.sen") as f:
    trees = parser.raw_parse_sents(f.read().split("\n"))  # @UndefinedVariable
    for t in trees:
        t = t.next()
#         print("_".join(t.leaves()))
        collect_productions(t)

for k in sorted(productions):
    print(k + " " + str(productions[k]))
for k in sorted(lexicons):
    print(k + " " + str(lexicons[k]))
Code Example #21
0
File: main.py Project: 5aurabhpathak/all-I-ve-done
from nltk.tokenize import sent_tokenize
from nltk import download
from nltk.tree import ParentedTree
from nltk.parse.stanford import StanfordParser
from nltk.internals import find_jars_within_path
import os

#download('punkt', quiet=True)
#download('names', quiet=True)

os.environ['CLASSPATH'] = os.getenv('CLASSPATH', '') + ':' + os.getcwd() + '/data/stanford-parser-full-2015-12-09/stanford-parser.jar:' + os.getcwd() + '/data/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'

parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
parser._classpath = find_jars_within_path(os.getcwd() + '/data/stanford-parser-full-2015-12-09')

text = input('Enter some text:')

tlist = [ParentedTree.fromstring(str(list(parsetree)[0])) for parsetree in parser.raw_parse_sents(sent_tokenize(text))]

tlist2 = [tree.copy(True) for tree in tlist]
from hobbs import *
from lappinleasse import *

print('Input text was:\n', text)
def resolve(ls, algo):
    print('\nResolving with', algo)
    i = -1
    for parsetree in ls:
        i += 1
        print("processing sentence {}...".format(i+1))
        if algo == "Hobb's algorithm": hobbs(parsetree, i, ls)
        else: lappinleasse(parsetree, i)
Code Example #22
0
class SyntacticExtractor(SentenceExtractor):
    """ Tries to split sentences into sub-sentences so that each of them
        contains only one LU
    """

    splitter = None
    parser = None
    token_to_lemma = None
    all_verbs = None

    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)
        self.parser = StanfordParser(path_to_jar='dev/stanford-corenlp-3.6.0.jar',
                                     path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
                                     java_options=' -mx2G -Djava.ext.dirs=dev/')

        self.token_to_lemma = {}
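        # Invert the lemma -> tokens mapping so that each surface token can be
        # mapped back to its lemma when matching verbs later on.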
        for lemma, tokens in self.lemma_to_token.iteritems():
            for t in tokens:
                self.token_to_lemma[t] = lemma
        self.all_verbs = set(self.token_to_lemma.keys())

    def extract_from_item(self, item):
        extracted = []
        bio = item.get(self.document_key, '').lower()
        url = item.get('url')
        if not bio or not url:
            logger.warn('skipping item without url or bio')
            return

        try:
            roots = self.parser.raw_parse_sents(self.splitter.split(bio))
        except (OSError, UnicodeDecodeError):
            logger.exception('cannot parse biography, skipping')
            return

        for root in roots:
            root = root.next()
            try:
                sub_sents = self.find_sub_sentences(root)
            except:
                logger.exception('cannot find sub-sentences')
                continue

            for sub in sub_sents:
                try:
                    text = ' '.join(chunk for _, chunk in self.find_terminals(sub))
                    logger.debug('processing text ' + text)
                    verbs = set(chunk for _, chunk in self.find_terminals(sub, 'V'))
                except:
                    logger.exception('cannot extract verbs or parse sentence')
                    continue

                found = verbs.intersection(self.all_verbs)

                if len(found) == 0:
                    logger.debug('No matching verbs found in sub sentence')
                elif len(found) == 1:
                    extracted.append({
                        'lu': self.token_to_lemma[found.pop()],
                        'text': text,
                        'url': url,
                    })
                else:
                    logger.debug('More than one matching verbs found in sentence %s: %s',
                                 text, repr(found))

        if extracted:
            logger.debug("%d sentences extracted...", len(extracted))
            return item, extracted
        else:
            logger.debug("No sentences extracted. Skipping the whole item ...")

    def find_sub_sentences(self, tree):
        # sub-sentences are the lowest S nodes in the parse tree
        if not isinstance(tree, Tree):
            return []

        s = reduce(lambda x, y: x + y, map(self.find_sub_sentences, iter(tree)), [])
        if tree.label() == 'S':
            return s or [tree]
        else:
            return s

    def find_terminals(self, tree, label=None):
        # finds all terminals in the tree with the given label prefix
        if len(tree) == 1 and not isinstance(tree[0], Tree):
            if label is None or tree.label().startswith(label):
                yield (tree.label(), tree[0])
        else:
            for child in tree:
                for each in self.find_terminals(child, label):
                    yield each
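
To see what the lowest-S heuristic in find_sub_sentences does, here is a small self-contained sketch on a hand-written tree (a toy illustration, not taken from the original project; it exercises only the tree logic, not the Stanford parser):

from nltk.tree import Tree

def lowest_s_nodes(tree):
    # Same idea as SyntacticExtractor.find_sub_sentences: keep only the
    # lowest 'S' nodes, i.e. an 'S' that has no 'S' descendants.
    if not isinstance(tree, Tree):
        return []
    found = []
    for child in tree:
        found += lowest_s_nodes(child)
    if tree.label() == 'S':
        return found or [tree]
    return found

t = Tree.fromstring(
    "(ROOT (S (S (NP (PRP He)) (VP (VBD sang))) (CC and)"
    " (S (NP (PRP She)) (VP (VBD danced)))))")
for sub in lowest_s_nodes(t):
    print ' '.join(sub.leaves())   # -> "He sang" and "She danced"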
Code Example #23
0
    "stanford-pos-tagger/stanford-postagger-full-2018-10-16/" + \
    "stanford-postagger.jar"
cale_parser = os.environ['STANFORD_PARSER']
cale_modele = os.environ['STANFORD_MODELS']

tagger = StanfordPOSTagger(cale_model, cale_jar_tagger)
parser = StanfordParser(model_path="/home/t3rtius/stanford-parser/" +
                        "stanford-parser-full-2018-10-17/englishPCFG.ser.gz")
dependency_parser = StanfordDependencyParser(path_to_jar=cale_parser,
                                             path_to_models_jar=cale_modele)

propsIn = open("2-props.txt", "r")
propsOut = open("2-props-out.txt", "w")
propsInText = propsIn.read()
sents = nltk.sent_tokenize(propsInText)
parsed = parser.raw_parse_sents(sents)

count = 1
constituenti = []
dependente = []

for propL in parsed:
    for prop in propL:
        constituenti.append(str(prop))

for prop in sents:
    deps = dependency_parser.raw_parse(str(prop))
    for dep in deps:
        dependente.append(str(list(dep.triples())))

for prop in sents:
Code Example #25
0
split = ["trial", "train", "test_annotated"]
for s in split:
    f = open("SICK_" + s + ".txt", "r")
    lines = f.readlines()
    sentences = []
    labels = []
    for i in range(1, len(lines)):
        a = lines[i].split("\t")
        sentences.extend([a[1], a[2]])
        labels.extend([a[3], a[3]])

    parser = StanfordParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    stanford_dir = parser._classpath[0].rpartition('/')[0]
    parser._classpath = tuple(find_jars_within_path(stanford_dir))
    parser.java_options = '-mx5000m'  # To increase the amount of RAM it can use.
    #a=[parse.tree()._pformat_flat("","()",False) for parse in parser.raw_parse("The young boys are playing outdoors and the man is smiling nearby")]
    a = [[parse for parse in dep_graphs]
         for dep_graphs in parser.raw_parse_sents(sentences)]
    file = open("SICK_cons_parse_" + s + ".txt", "w")
    for i in range(len(a)):
        for j in range(len(a[i])):
            a[i][j].chomsky_normal_form(horzMarkov=1)
            a[i][j].collapse_unary(collapsePOS=True)
            d = a[i][j]._pformat_flat("", "()", False)
            sent1 = d.replace("ROOT", labels[i], 1)
            file.write(sent1 + "\n")
    file.close()
    f.close()
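
As a quick reference for the two tree transformations used above, here is a minimal self-contained sketch on a toy tree (illustrative only; the exact node labels produced by the binarization depend on the NLTK version):

from nltk.tree import Tree

t = Tree.fromstring("(ROOT (S (NP (DT the) (NN boy)) (VP (VBZ runs) (ADVP (RB fast)))))")
t.chomsky_normal_form(horzMarkov=1)   # binarize n-ary nodes, keeping 1 sibling of horizontal context
t.collapse_unary(collapsePOS=True)    # merge unary chains such as ROOT -> S into ROOT+S
print t._pformat_flat("", "()", False)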
Code Example #26
0
File: parse.py Project: michal-au/article-prediction
    stanford_parser_dir + '/slf4j-api.jar', stanford_parser_dir +
    '/slf4j-simple.jar'
])

for r, ds, fs in os.walk(heldout_raw_path):
    ds.sort()
    fs.sort()
    file_counter = 0
    already_parsed = os.listdir(heldout_parse_path)
    files = [
        f for f in fs if f[:1] in ('E', 'F', 'G') and f not in already_parsed
    ]

    files_count = len(files)
    for f in files:
        file_counter += 1
        print f, file_counter / float(files_count)

        in_path = os.path.join(r, f)
        with codecs.open(in_path, 'r', 'utf-8') as fl:
            sents = [l for l in fl if len(l.split()) <= MAXLENGTH]

        trees = parser.raw_parse_sents(sents)

        out_path = os.path.join(heldout_parse_path, f)
        utils.create_dir_for_file(out_path)
        with codecs.open(out_path, 'w', 'utf-8') as fl:
            for t in trees:
                for t_ in t:
                    print >> fl, ' '.join(unicode(t_).split())
Code Example #27
0
        input_pos = nltk.pos_tag(input_bag)  # List of (word, POS) tuples

        # Remove stop words
        input_sw_removed = [
            w for w in input_bag if w.lower() not in stop_words
        ]

        # Stem (As feature) & Lemmatize (As feature)
        input_stemmed = []
        input_lemmatized = []
        for word in input_bag:
            input_stemmed.append(stemmer.stem(word))
            input_lemmatized.append(lemmatizer.lemmatize(word))

        # Tree Parse (As feature)
        input_tree = parser.raw_parse_sents(input_sents)

        # WordNet hypernymns, hyponyms, meronyms, AND holonyms (As feature)
        input_hypernymns = []
        input_hyponyms = []
        input_meronyms = []
        input_holonyms = []
        input_bag_counter = Counter(input_sw_removed)
        for word in input_bag_counter.keys():
            synsets = wn.synsets(word)
            if synsets:
                max_cos = 0.0
                target_synset = None
                for synset in synsets:
                    definition = synset.definition()
                    cos = get_cosine(Counter(input_bag), Counter(definition))