Example #1
class DependencyParser():
    def __init__(self):

        path2jar = '/home/bendan0617/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar'
        path2model = '/home/bendan0617/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar'
        self.dep_parser = StanfordDependencyParser(path_to_jar=path2jar, path_to_models_jar=path2model, java_options='-mx100g')

    def parse_sents(self, sents):
        """
        Parameters:
        sents: list of strings

        Returns: list of lists of triples
        """
        parsed_sents = self.dep_parser.raw_parse_sents(sents)
        return [[list(parse.triples()) for parse in parsed_sent] for parsed_sent in parsed_sents]

    def get_SVOM(self, sents):
        parsed_sents = self.parse_sents(sents)
        output=[]
        for sent in parsed_sents:
            tmp={'V':('<empty>','<empty>'), 'S':('<empty>','<empty>'),
                    'O':('<empty>','<empty>'), 'M':('<empty>','<empty>')}
            for triple in sent[0]:
                t1, t2, t3 = triple[0], triple[1], triple[2]
                if t2[0:5]=='nsubj' and t1[1][0]=='V':
                    if tmp['V'][0]=='<empty>' and t1[1][0] =='V': tmp['V']=t1
                    if tmp['S'][0]=='<empty>': tmp['S']=t3
                elif t2=='nsubj' and t1[1][0] in 'VJNP':
                    if tmp['O'][0]=='<empty>': tmp['O']=t1
                    if tmp['S'][0]=='<empty>': tmp['S']=t3
                elif t2=='cop':
                    if tmp['O'][0]=='<empty>': tmp['O']=t1
                    if tmp['V'][0]=='<empty>': tmp['V']=t3
                elif t2=='dobj':
                    if tmp['V'][0]=='<empty>': tmp['V']=t1
                    if tmp['O'][0]=='<empty>': tmp['O']=t3
                elif t2=='ccomp' or t2=='iobj' or t2=='pobj' or t2=='xcomp':
                    #if tmp['S'][0]=='<empty>':
                       # tmp['S']=t3
                    if tmp['M'][0]=='<empty>':
                        tmp['M']=t3
                elif t2 == 'auxpass':
                    if tmp['V'][0]=='<empty>': tmp['V']=t1
                    if tmp['S'][0]!='<empty>':
                        tmp['O']=tmp['S']
                        tmp['S']=('<empty>','<empty>')
                #elif t2[0:3] == 'acl':
                #    if tmp['S'][0]=='<empty>': tmp['S']=t1
                elif t2[0:4] == 'nmod':
                   # if tmp['V'][0]=='<empty>' and t1[1][0] =='V': tmp['V']=t1
                    if tmp['O'][0]=='<empty>': tmp['O']=t3
                elif t2 == 'dep':
                    if tmp['S'][0]=='<empty>' and t1[1][0] != 'V' : tmp['M']=t1
                #elif t2 == 'xcomp':
                 #   if tmp['S'][0]=='<empty>' and t1[1][0] != 'V' : tmp['S']=t1
                else:
                    continue
            output.append([tmp['S'], tmp['V'], tmp['O'], tmp['M']])
        return output, parsed_sents
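# A minimal, hypothetical usage sketch for the DependencyParser wrapper above;
# it assumes the Stanford jar paths hard-coded in __init__ exist on this machine.
dp = DependencyParser()
svom, parsed = dp.get_SVOM(["The cat chased the mouse."])
# svom[0] is [S, V, O, M]; each slot is a (word, POS) pair or ('<empty>', '<empty>')
print(svom[0])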
def findDependencies_batched(sentences):
	all_pos_tagging = []
	all_roots = []
	all_dependencyList = []
	all_Words = []
	try :
		dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
		results = dependency_parser.raw_parse_sents(sentences)
		results = list(results)
		if(len(results) != len(sentences)):
			print("#######WARNINING: Len(results) != Len(sentences) - ",len(results), len(sentences))
		for parsetree in results:
		    pos_tagging, roots, dependencyList, Words = findDependencies(list(parsetree)[0])
		    all_pos_tagging.append(pos_tagging)
		    all_roots.append(roots)
		    all_dependencyList.append(dependencyList)
		    all_Words.append(Words)
	except :
	   print("Error in parsing the tree")
		# exit(-1)

	if len(all_pos_tagging) != len(sentences):
	    print("#####WARNINING: Len(all_pos_tagging) < Len(sentences) - ",len(all_pos_tagging), len(sentences))

	while(len(all_pos_tagging) < len(sentences)):
	    all_pos_tagging.append([])
	    all_roots.append([])
	    all_dependencyList.append([])
	    all_Words.append([])

	return all_pos_tagging, all_roots, all_dependencyList, all_Words
def get_depG():
    """
    This function is used for testing purposes.

    Returns:
        (dependency graph) for testing
    """
    os.environ[
        'CLASSPATH'] = '/Users/zarzen/Development/stanford-parser-full-2015-12-09'
    dep_parser = StanfordDependencyParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

    tt = '''
    The programs that come standard with the Leopard running system are enough for the average person to run all the basics.
    '''
    tt = tt.strip()
    tt = tt.replace('\n', ' ')
    sents = seg_text(tt)
    depgs = dep_parser.raw_parse_sents(sents)
    depgs = list(depgs)
    depG = []
    g = depgs[-1]
    g = list(g)
    g = g[0]
    for node_idx in g.nodes:
        node_dict = g.nodes[node_idx]
        depG.append(node_dict)
    return depG
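# A hedged sketch of how the node dictionaries returned by get_depG() could be
# inspected; NLTK DependencyGraph nodes carry keys such as 'address', 'word',
# 'tag', 'head' and 'rel'.
for node in get_depG():
    if node.get('word') is not None:  # skip the artificial root node
        print(node['address'], node['word'], node['tag'], node['rel'], node['head'])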
Example #4
    def bodt_features(self, texts):
        path_to_jar = 'stanford-parser-full-2017-06-09/stanford-parser.jar'
        path_to_models_jar = 'stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'
        dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

        t = [s for (s1, s2) in texts for s in (s1, s2)]
        dependency_triples = []

        # for s in t:
        #     print(s)
        #     print(dependency_parser.raw_parse(s))

        for res in dependency_parser.raw_parse_sents(t):
            dependency_triples += next(res).triples(),

        if "bodt.tfidf" not in self.model:
            vec = TfidfVectorizer(lowercase=False,
                                  analyzer=self._dependency_triple_analyzer)
            s = vec.fit_transform(dependency_triples)
            self.model["bodt.tfidf"] = vec
            pickle.dump(self.model, open("model.pkl", "wb"))
        else:
            vec = self.model["bodt.tfidf"]
            s = vec.transform(dependency_triples)

        return s
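# The analyzer passed to TfidfVectorizer above is not shown; a hypothetical
# version, sketched here as a standalone function, might flatten each
# ((head, head_pos), relation, (dep, dep_pos)) triple into one string token.
def dependency_triple_analyzer(triples):
    return ['{}|{}|{}'.format(head[0], rel, dep[0]) for head, rel, dep in triples]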
Example #5
def depParser(sentence):
    word = ''.join(sentence)

    english_parser = StanfordDependencyParser(
        './resources/stanford-parser-3.4.1-models.jar')
    # raw_parse_sents expects an iterable of sentence strings and yields, for
    # each sentence, an iterator over DependencyGraph objects
    result = [
        list(parse.triples())
        for parsed_sent in english_parser.raw_parse_sents([word])
        for parse in parsed_sent
    ]

    return result
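# Hypothetical call to depParser above with a raw sentence string; the models
# jar under ./resources must exist for this to run.
triples = depParser('The quick brown fox jumps over the lazy dog.')
print(triples[0])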
Example #6
    def extract_events2(self, tweet_sentences):
        path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
        path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
        path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
        path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'

        sentence_preprocessor = Preprocessor(['remove_non_letters'])
        ner_tagger = StanfordNERTagger(path_to_ner_model, path_to_ner_tagger)
        dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

        events = []

        chunks = list(
            self.utilities.chunkify_list(data_list=tweet_sentences,
                                         items_per_chunk=1000))

        for chunk in chunks:
            created_ats = []
            sentences = []
            for chunk_item in chunk:
                created_ats.append(chunk_item[0])
                sentences.append(
                    sentence_preprocessor.preprocess(chunk_item[1]))

            chunk_sent_dependencies = dependency_parser.raw_parse_sents(
                sentences)
            chunk_sent_ner_tags = ner_tagger.tag_sents(
                [sentence.split() for sentence in sentences])

            for sent_dependencies, sent_ner_tags, created_at in zip(
                    chunk_sent_dependencies, chunk_sent_ner_tags, created_ats):
                dependencies = [
                    list(parse.triples()) for parse in sent_dependencies
                ]

                if len(dependencies) > 0 and dependencies[0] is not None:
                    sentence_events = self.extract_events_from_stanford_dependencies(
                        dependencies[0], sent_ner_tags)
                    if len(sentence_events) > 0:
                        for sentence_event in sentence_events:
                            events.append((created_at, sentence_event))

        return events
Example #7
def write_dependency_rule_by_line(file_name):
	from nltk.parse.stanford import StanfordDependencyParser
	jar = 'lib/stanford-parser-full-2015-12-09/stanford-parser.jar'
	models_jar = 'lib/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'
	dependency_parser = StanfordDependencyParser(path_to_jar = jar, path_to_models_jar = models_jar, java_options='-mx3000m')

	all_relations = read_data_utf8(file_name)

	print( 'len of all relations: %d' % (len(all_relations)) )
	sentences = []
	lineno = 0
	line_interval = []
	for idx, relation in enumerate(all_relations):
		_from = lineno

		lines = []
		sent = []
		if '.' in relation['Arg1']['Lemma']:
			for word in relation['Arg1']['Lemma']:
				if word == '.':
					lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', ''))
					sent = []
				else:
					sent.append(word)
			lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', ''))
		else:
			lines.append(' '.join(relation['Arg1']['Lemma']).encode('utf8').replace('\xc2\xa0', ''))
		
		_to = _from + len(lines)

		sentences += lines
		lines = []
		sent = []
		if '.' in relation['Arg2']['Lemma']:
			for word in relation['Arg2']['Lemma']:
				if word == '.':
					lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', ''))
					sent = []
				else:
					sent.append(word)
			lines.append(' '.join(sent).encode('utf8').replace('\xc2\xa0', ''))
		else:
			lines.append(' '.join(relation['Arg2']['Lemma']).encode('utf8').replace('\xc2\xa0', ''))

		_to += len(lines)
		sentences += lines
		lineno = _to
		line_interval.append( (_from, _to ) )
	pass
	for idx, pair in enumerate(line_interval):
		print( '(%d:%d)' % (pair[0],pair[1]) )
		for i in range(pair[0],pair[1]):
			print( '%d:%s' % (i,sentences[i]) )
	
	print( 'len of sentences: %d' % ( len(sentences) ) )

	line_interval_idx = 0
	count = 0
	'''
		each result corresponds to one sentence
		a line_interval is [from, to)
	'''
	relation_length = len(all_relations)
	all_part = 5
	for part in range(all_part+1):
		_from = part * (relation_length / all_part) # inclusive
		if _from >= relation_length:
			break
		_to = min( (part+1) * (relation_length / all_part) -1, relation_length - 1 ) # inclusive
		print('part %d' % part)
		print('relation %d' % (_to - _from+1))

		to_parse_sentences = sentences[ line_interval[_from][0] : line_interval[_to][1] ]
		print('line of sentences %d' % ( len(to_parse_sentences) ) )

		start = time.time()
		parse_result = dependency_parser.raw_parse_sents(to_parse_sentences)
		end = time.time()
		print( 'cost %f' % (end - start) )

		dep_rule_list = []
		dep_rule_for_one_relation = []
		acutal_result_no = 0
		for result in parse_result:
			acutal_result_no += 1
			for t in result:
				for node in range(len(t.nodes)):
					if t.nodes[node]['word'] == None or t.nodes[node]['deps'].items() == []:
						continue
					else:
						dep_rule_for_one_relation.append( '%s<-%s' % \
							(t.nodes[node]['word'],	' '.join( [ key for key, val in t.nodes[node]['deps'].items() ] )))	
			if count == line_interval[line_interval_idx][1] - 1:
				print '%d: (%d, %d) finished' % (line_interval_idx, line_interval[line_interval_idx][0], line_interval[line_interval_idx][1])
				line_interval_idx += 1
				dep_rule_list.append(dep_rule_for_one_relation)
				dep_rule_for_one_relation = []
			
			count += 1
		print 'actual parse result no : %d' % acutal_result_no
		# last relation
		#print '%d: (%d, %d) finished' % (line_interval_idx, line_interval[line_interval_idx][0], line_interval[line_interval_idx][1])
		#line_interval_idx += 1
		#dep_rule_list.append(dep_rule_for_one_relation)

		write_data = []
		for dep_rules in dep_rule_list:
			write_data.append( '||'.join([rule for rule in dep_rules] ) )

		print('length of  write_data %d' % len(write_data))
		with codecs.open('tmp/dep_rule_%s_part%d.txt'% (file_name, part), 'w', encoding = 'utf-8') as file:
			file.write( u'\n'.join(write_data) )
	pass#for part in range(all_part) end
Example #8
class MainScraper(object):
    """docstring for MainScraper"""
    def __init__(self):
        self.EOS = ['.', '?', '!']
        self.flags = {
            '21': ['-----', 'Servings:', 'Pro-Exchange'],
            '26': ['MMMMM', 'Yield:', 'Recipe'],
            '13': ['-----', 'Yield:', 'Recipe']
        }

    def build_dict(self, key_name):
        from nltk.parse.stanford import StanfordDependencyParser
        core = '/Users/fengwf/stanford/stanford-corenlp-3.7.0.jar'
        model = '/Users/fengwf/stanford/english-models.jar'
        self.parser = StanfordDependencyParser(path_to_jar=core,
                                               path_to_models_jar=model,
                                               encoding='utf8',
                                               java_options='-mx2000m')
        print('Loading data ...')
        data = pickle.load(open('RecipeDatasets/all_mm_recipes.pkl'))
        objs = {}
        adjs = {}
        vbds = {}
        all_sents = []
        print('Processing %s ...' % key_name)
        #ipdb.set_trace()
        for i in tqdm(xrange(len(data))):
            text = data[i]
            sents = [transform_digits(i.lower()) for i in text[key_name]]
            try:
                if key_name == 'Steps':
                    self.parse_steps(sents, all_sents)
                else:
                    self.parse_ingredients(sents, all_sents)
            except AssertionError:
                continue
            except KeyboardInterrupt:
                break
            except:
                continue

        if key_name == 'Steps':
            with open('RecipeDatasets/steps_dependency.pkl', 'w') as f:
                print('\n Saving file ...')
                pickle.dump(all_sents, f)
                print(' Success!\n')
        else:
            with open('RecipeDatasets/obj_dict.pkl', 'w') as f:
                print('\n Saving file ...')
                pickle.dump(
                    {
                        'objs': objs,
                        'adjs': adjs,
                        'vbds': vbds,
                        'all_sents': all_sents
                    }, f)
                print(' Success!\n')

    def parse_ingredients(self, sents, all_sents):
        dep = self.parser.raw_parse_sents(sents)
        for ind in xrange(len(sents)):
            concurrent_sent = [[], [], []]  # NN, JJ, VBD/VBN/VBG
            lines = [
                l.split()
                for l in str(dep.next().next().to_conll(10)).split('\n')
            ]
            for line in lines:
                try:
                    ind, word, pos, component = line[0], line[1], line[
                        3], line[7]
                    if len(word) <= 2:  # words of units (e.g. x, T, ds etc.)
                        continue
                    if pos in ['NN', 'NNS', 'NNP', 'NNPS']:
                        concurrent_sent[0].append(word)
                        if word in objs:
                            objs[word] += 1
                        else:
                            objs[word] = 1
                    elif pos in ['JJ', 'JJR', 'JJS']:
                        concurrent_sent[1].append(word)
                        if word in adjs:
                            adjs[word] += 1
                        else:
                            adjs[word] = 1
                    elif pos in ['VBD', 'VBN', 'VBG']:
                        concurrent_sent[2].append(word)
                        if word in vbds:
                            vbds[word] += 1
                        else:
                            vbds[word] = 1
                except KeyboardInterrupt:
                    raise KeyboardInterrupt
                except:  # end of the line or not enough components
                    continue
            all_sents.append(concurrent_sent)

    def parse_steps(self, sents, all_sents):
        # save all dependency results of text['Steps'] to file
        dep = self.parser.raw_parse_sents(sents)
        dep_list = []
        #words_list = []
        for ind in xrange(len(sents)):
            lines = [
                l.split()
                for l in str(dep.next().next().to_conll(10)).split('\n')
            ]
            lines = filter_empty(lines)
            #words = [' '] * (int(lines[-1][0]) + 1)
            dependency = []
            for line in lines:
                try:
                    dependency.append(
                        [line[0], line[1], line[3], line[6], line[7]])
                    #words[int(line[0])] = line[1]
                except KeyboardInterrupt:
                    raise KeyboardInterrupt
                except:  # end of the line or not enough components
                    continue
            dep_list.append(dependency)
            #words_list.append(words)
        #all_sents.append({'words': words_list, 'dep': dep_list})
        all_sents.append(dep_list)

    def convert_texts(self, filename, output=[], outfile='', save_file=False):
        # convert *.mmf file to structured data={Title, Categories, Yield, Ingredients, Steps}
        #ipdb.set_trace()
        print('Processing file: %s' % filename)
        data = open(filename).read().strip()
        data = re.sub(r'[\x14\+\*\~\#]+', '',
                      data)  # remove the explanation marks
        wrong_text_flag = False
        # confirm the spliter, yield_flag and start_flag for this file format
        if data.startswith('---------- Pro-Exchange'):
            spliter, yield_flag, start_flag = self.flags['21']
        elif data.startswith('---------- Recipe'):
            spliter, yield_flag, start_flag = self.flags['13']
        elif data.startswith('MMMMM----- Recipe'):
            spliter, yield_flag, start_flag = self.flags['26']
        else:
            print('\n Wrong file type!\n')
            #ipdb.set_trace()
            lines = filter_empty(
                [t.strip() for t in re.split(r'[\r\n]', data)])
            spliter = '-----'
            start_flag = filter_empty(lines[0].split(spliter))[0].strip()
            yield_flag = lines[3].split()[0]
            wrong_text_flag = True
            #return output

        texts = data.split(spliter)
        texts = filter_line(texts)
        texts = [
            filter_empty([s.strip() for s in re.split(r'[\r\n]', t)])
            for t in texts
        ]
        texts = filter_empty(texts)

        #
        text_ind = len(texts) - 1
        while text_ind > 0:
            # read from back to front, start_flag is a flag indicating the start of a recipe
            try:
                text = texts[text_ind]
                while not text[0].startswith(start_flag) and text_ind > 0:
                    text_ind -= 1
                    text = texts[text_ind] + text
                #if wrong_text_flag:
                #    text = filter_only_line(text)

                Title = filter_line(text[1].split('Title:')[-1]).strip()
                Categories = [
                    c.strip()
                    for c in text[2].split('Categories:')[1].split(',')
                ]
                Categories = filter_empty(filter_line(Categories))
                Yield = filter_line(text[3].split('%s' %
                                                  yield_flag)[-1]).strip()

                ind = 4
                Ingredients = []
                max_sent_ind = len(text) - 1
                mater = filter_line(text[ind])
                while isIngredient(
                        mater):  #mater[0].isdigit() or isIngredient(mater):
                    #if len(mater) >= 2 and mater[1] == '.': # these are sentences of steps
                    #    break
                    if mater[0].isdigit() and ind < max_sent_ind:
                        next_line = filter_line(text[ind + 1])
                        if not next_line[0].isdigit() and isIngredient(
                                next_line):
                            ind += 1
                            mater = mater + ' ' + filter_line(text[ind])
                    if len(mater) > 1 and mater[-1] != ':':
                        Ingredients.append(mater)
                    if ind < max_sent_ind:
                        ind += 1
                        mater = filter_line(text[ind])
                    else:
                        break

                sent = ''
                Steps = []
                while ind <= max_sent_ind:
                    sent = text[
                    ind]  # some sentences are split by \n because they are too long
                    while sent[-1] not in self.EOS and ind < max_sent_ind:
                        ind += 1
                        sent = sent + ' ' + text[ind]  # join them together

                    if isEndOfSent(sent) and len(Steps) > 0:
                        break
                    sent = filter_line(sent)
                    sents = filter_empty(
                        [s.strip() for s in re.split(r'[\?\!\.]', sent)])
                    Steps.extend(sents)
                    ind += 1
                if len(Steps) > 0:
                    output.append({
                        'Title': Title,
                        'Categories': Categories,
                        'Yield': Yield,
                        'Ingredients': Ingredients,
                        'Steps': Steps
                    })
                    #print('text_ind: %d \t len(output): %d' % (text_ind, len(output)))
                else:
                    ipdb.set_trace()
            except Exception as e:
                #print(e)
                pass

            text_ind -= 1

        #ipdb.set_trace()
        print('text_ind: %d \t len(output): %d' % (text_ind, len(output)))
        if save_file:  # save data from different *.mmf files to a single file
            if outfile:
                filename = outfile
            print('Saving file ...')
            with open('%s.pkl' % filename, 'w') as f:
                pickle.dump(output, f)
            with open('%s.txt' % filename, 'w') as f:
                for t in output:
                    f.write('Title: {}\nCategories: {}\nYield: {}\n'.format(
                        t['Title'], ', '.join(t['Categories']), t['Yield']))
                    f.write('Ingredients: \n\t{}\nSteps: \n\t{}\n\n'.format(
                        '\n\t'.join(t['Ingredients']),
                        '\n\t'.join(t['Steps'])))
            print('Success!\n')

        return output

    def convert_texts_main(self, convert_mode):
        output = []
        home = 'RecipeDatasets/mmf_files/'
        outfile = 'RecipeDatasets/%s_recipes' % convert_mode
        if convert_mode == 'all':
            files = [f for f in os.listdir(home) if f.endswith('.mmf')]
            max_file_ind = len(files) - 1
            for i, name in enumerate(files):
                save_file = False if i < max_file_ind else True
                output = self.convert_texts(home + name, output, outfile,
                                            save_file)
        else:
            for c in 'abcdefghijk':
                output = self.convert_texts('Mm13000%s.mmf' % c, output,
                                            outfile)
            output = self.convert_texts('mm2155re.mmf', output, outfile)
            output = self.convert_texts('misc2600.mmf',
                                        output,
                                        outfile,
                                        save_file=True)

    def load_driver(self):
        from selenium import webdriver
        self.driver = webdriver.Chrome('~/Desktop/chromedriver')

    def get_text_from_page(self, url):
        self.driver.get(url)
        elements = self.driver.find_elements_by_xpath('//tr/td')
        if len(elements) >= 2:
            text = [t.strip() for t in elements[1].text.split('\n')]
            text = filter_empty(text)
            assert text[1].startswith('MMMMM')
            Title = text[0]
            Categories = filter_empty(
                text[3].split('Categories:')[1].split(','))
            Yield = text[4].split('Yield: ')[-1]

        ind = 5
        Ingredients = []
        while isdigit(text[ind][0]):
            Ingredients.append(text[ind])
            ind += 1

        sent = ''
        num_sents = len(text)
        Steps = []
        while ind < num_sents - 1:
            sent = text[ind]
            while sent[-1] not in self.EOS and ind < num_sents:
                ind += 1
                sent += text[ind]
            sents = filter_empty(re.split(r'[\?\!\.]', sent))
            Steps.extend(sents)
        assert text[-1].endswith('MMMMM')

        return {
            'Title': Title,
            'Categories': Categories,
            'Yield': Yield,
            'Ingredients': Ingredients,
            'Steps': Steps
        }
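# A hypothetical driver for the MainScraper class above; it assumes the
# RecipeDatasets/ directory layout and the Stanford jars referenced in build_dict.
scraper = MainScraper()
scraper.convert_texts_main('all')   # convert *.mmf recipe files into structured records
scraper.build_dict('Steps')         # dependency-parse the steps and pickle the results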
Example #9
from nltk.parse.stanford import StanfordDependencyParser
import os

#set-up
java_path = r"path\to\java"
os.environ["JAVAHOME"] = java_path

#load_model
path_to_jar = r"path\to\stanford-parser.jar"
path_to_models_jar = r"path\to\stanford-parser-3.8.0-models.jar"
stanford_parser = StanfordDependencyParser(path_to_jar=path_to_jar, 
	path_to_models_jar=path_to_models_jar, encoding="utf-8")

def parser(sent_list):  # input: list of sentences
    """
    This function takes a list of sentences and detects whether each one is written in passive or active voice.
    It only flags a fix when the sentence is passive.
    """
    text = stanford_parser.raw_parse_sents(sent_list)
    #Extract feature from the Dependency Graph. Documentation: http://www.nltk.org/_modules/nltk/parse/dependencygraph.html
    for f in list(text):
        for w1, rel, w2 in next(f).triples():
            if rel == "nsubjpass":
                print(w2[0], w1[0])
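# A quick, hypothetical check of the passive-voice detector above; for the first
# sentence it should print roughly "ball thrown" (passive subject, then its verb).
parser(["The ball was thrown by John.", "John threw the ball."])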
Example #10
class Parser(object):
    def __init__(self,
                 datasetName,
                 path_to_models_jar=path_to_models_jar,
                 path_to_jar=path_to_jar,
                 path_to_save='/users/ud2017/hoavt/nli/BiMPM/models'):
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar,
            path_to_models_jar=path_to_models_jar,
            java_options='-mx20000m')
        self.path_to_save = path_to_save
        self.cache = {}
        self.datasetName = datasetName
        self.load_cache()
        #types = "acomp advcl advmod agent amod appos aux auxpass cc ccomp conj cop csubj csubjpass \
        #        dep det discourse dobj expl goeswith iobj mark mwe neg nn npadvmod nsubj nsubjpass \
        #        num number parataxis pcomp pobj poss possessive preconj predet prep prepc prt punct \
        #        quantmod rcmod ref root tmod vmod xcomp xsubj nmod"
        types = "acl acl:relcl advcl advmod amod appos aux auxpass case cc cc:preconj ccomp compoun \
                compound:prt conj cop csubj csubjpass dep det det:predet discourse dislocated dobj \
                expl foreign goeswith iobj list mark mwe name neg nmod nmod:npmod nmod:poss nmod:tmod \
                nsubj nsubjpass nummod parataxis punct remnant reparandum root vocative xcomp compound"

        self.type2idx = defaultdict(lambda: len(self.type2idx))
        for t in types.strip().split():
            self.type2idx[t.strip()]
        self.typesize = len(self.type2idx)
        print "typesize: ", self.typesize

    def isParsed(self, sentence):
        return self.cache and sentence in self.cache

    def parse_sentences(self, sentences):
        results = self.dependency_parser.raw_parse_sents(sentences)
        results = list(results)
        for idx, result in enumerate(results):
            self.parse(sentences[idx], list(result)[0])

    def parse(self, sentence, result=None):
        if sentence in self.cache:
            return self.cache[sentence]
        print 'not found in cache: ', sentence
        if not result:
            result = self.dependency_parser.raw_parse(sentence)
            dep_res = result.next()
            nodes = dep_res.nodes
        else:
            nodes = result.nodes
        parsed_sent = self.emptylistmaker(len(
            sentence.split()))  #[[0...0],[0...0], ...]
        dep_cons = self.neglistmaker(len(sentence.split()))  #[-1, -1 ... -1]
        #print nodes, len(nodes), len(parsed_sent), len(sentence.split())
        for idx in range(len(nodes)):
            try:
                node = nodes[idx]
                if idx == 0:
                    dep_idx = node['deps']['root'][0]
                    dep_type_idx = self.type2idx['root']
                    root = parsed_sent[dep_idx - 1]
                    root[dep_type_idx] = 1
                    parsed_sent[dep_idx - 1] = root
                    # for connection
                    dep_cons[dep_idx - 1] = -1
                    continue
                head = parsed_sent[idx - 1]
                for dep in node['deps']:  # nsubj: [5]
                    try:
                        dep_type_idx = self.type2idx[dep]
                        dep_idx = node['deps'][dep][0]
                        #print 'word:', node['word'], 'idx:', idx, 'type:', dep, 'dep_type_idx:', dep_type_idx, 'dep_idx:', dep_idx
                        dependent = parsed_sent[dep_idx - 1]
                        dependent[dep_type_idx] = -1
                        head[dep_type_idx] = 1
                        #print head
                        #print dependent
                        parsed_sent[idx - 1] = head
                        parsed_sent[dep_idx - 1] = dependent
                        #add dependency connection
                        dep_cons[dep_idx - 1] = idx - 1

                    except Exception as e:
                        print(list(dep_res.triples()))
                        print str(e)
                        print sentence
                        print 'word:', node[
                            'word'], 'idx:', idx, 'type:', dep, 'dep_type_idx:', dep_type_idx, 'dep_idx:', dep_idx
                        print node['deps']
                        print nodes
                        print len(nodes)
                        print len(parsed_sent)
            except Exception as e:
                print str(e)
                print sentence
        results = {'emb': parsed_sent, 'con': dep_cons}
        self.cache[sentence] = results
        return results

    def load_cache(self):
        print "loading dependency cache"
        #import glob
        #for jfile in glob.glob(self.path_to_save + '/' + self.datasetName + '_*.json'):
        #    print jfile
        #    with open(jfile) as f:
        #        cache = json.load(f)
        #        self.cache = dict(self.cache.items() + cache.items())

        if not os.path.isfile(self.path_to_save + '/' + self.datasetName +
                              '.json'):
            return
        with open(self.path_to_save + '/' + self.datasetName + '.json') as f:
            self.cache = json.load(f)

    def save_cache(self):
        with open(self.path_to_save + '/' + self.datasetName + '.json',
                  'w') as outfile:
            json.dump(self.cache, outfile)

    def zerolistmaker(self, n):
        listofzeros = [0] * n
        return listofzeros

    def neglistmaker(self, n):
        listneg = [-2] * n
        return listneg

    def emptylistmaker(self, n):
        listofzeros = self.zerolistmaker(self.typesize)
        emptylist = []
        for x in range(n):
            emptylist.append(self.zerolistmaker(self.typesize))
        return emptylist
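# Hypothetical usage of the Parser cache above; datasetName, the jar paths and
# path_to_save are assumed to be configured for the local project.
p = Parser('snli')
p.parse_sentences(['A man is eating food .', 'A dog runs in a park .'])
p.save_cache()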
Example #11
parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

parser = StanfordParser()
dep_parser = StanfordDependencyParser()

# see parse methods:
# raw_parse
# raw_parse_sents
# parse
# parse_one
# parse_all
# parse_sents
# use _sents for performance

sentence = 'This sentence is a test sentence for test in a test environment.'

for parse in parser.raw_parse(sentence):
    print parse
print

for parse in dep_parser.raw_parse(sentence):
    print parse.tree()
print

for sent in dep_parser.raw_parse_sents([sentence]):
    for parse in sent:
        for tri in parse.triples():
            print tri
    print
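# A hedged sketch contrasting the entry points listed above: raw_parse* takes
# untokenized strings, while parse_sents expects pre-tokenized sentences.
tokenized = [['This', 'is', 'a', 'test', '.'], ['Another', 'test', 'sentence', '.']]
for sent in dep_parser.parse_sents(tokenized):
    for graph in sent:
        print(graph.tree())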
import os

os.environ['STANFORD_PARSER'] = './../lib/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = './../lib/stanford-parser-3.9.1-models.jar'

# stanford_tokenizer = StanfordTokenizer(path_to_jar='./../lib/stanford-parser.jar')
#
# print(stanford_tokenizer.tokenize("My dog also likes eating sausage.\r\n My dog also likes eating sausage."))
# print("token end")

dep_parser = StanfordDependencyParser(
    model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
# parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

# print([parse.tree() for parse in dep_parser.raw_parse("My dog also likes eating sausage.")])
# print([list(parse.triples()) for parse in dep_parser.raw_parse("My P05-1067.1 also likes P05-1067.2 sausage.")])
# print('middle')
# print([list(parse.triples()) for parse in dep_parser.raw_parse("My dog also likes eating sausage.")])

# for sentence in parser.raw_parse("My dog also likes eating sausage."):
#     sentence.draw()
sentences = [
    "Traditional H01-1001.5 use a histogram of H01-1001.7 as the document representation but oral ",
    "communication may offer additional indices such as the time and place of the rejoinder and ",
    "the attendance."
]
a = dep_parser.raw_parse_sents(sentences)
b = list(a)  # raw_parse_sents returns a one-shot generator, so materialize it before iterating
for raw in b:
    for graph in raw:
        print(graph.tree())
Example #13
class EventDetector:
    def __init__(self):
        self.path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
        self.path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
        self.path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
        self.path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'

        self.ner_tagger = StanfordNERTagger(self.path_to_ner_model,
                                            self.path_to_ner_tagger)
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=self.path_to_jar,
            path_to_models_jar=self.path_to_models_jar)
        self.lemmatizer = WordNetLemmatizer()
        self.utilities = Utilities()

    def extract_events_from_stanford_dependencies(self, dependencies,
                                                  ner_tags):
        entity_categories = ['PERSON', 'LOCATION', 'ORGANIZATION']
        raw_events = {}
        for dependency in dependencies:
            if len(dependency) == 3:
                head = dependency[0]
                relation = dependency[1]
                tail = dependency[2]

                if head[1].startswith('VB'):
                    event_keywords = list(raw_events.keys())
                    event_keyword = self.lemmatizer.lemmatize(
                        head[0].lower(), 'v')
                    if event_keyword not in event_keywords:
                        raw_events[event_keyword] = {}

                    if relation.endswith('subj'):
                        subject_pronoun = [
                            'i', 'you', 'he', 'she', 'we', 'they', 'who'
                        ]
                        subj_value = self.lemmatizer.lemmatize(tail[0].lower())

                        if tail[0].lower() in subject_pronoun:
                            subj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[
                                        1] in entity_categories:
                                    subj_value = ner_tag[1]
                        raw_events[event_keyword]['subj'] = subj_value

                    if relation == 'dobj':
                        objective_pronoun = [
                            'me', 'you', 'him', 'her', 'us', 'you', 'them'
                        ]
                        dobj_value = self.lemmatizer.lemmatize(tail[0].lower())

                        if tail[0].lower() in objective_pronoun:
                            dobj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[
                                        1] in entity_categories:
                                    dobj_value = ner_tag[1]

                        raw_events[event_keyword]['dobj'] = dobj_value

                    if relation == 'compound:prt':
                        raw_events[event_keyword]['prt'] = tail[0]

        event = None
        for verb in list(raw_events.keys()):
            event_info = raw_events[verb]
            if len(verb) < 2 or 'subj' not in list(event_info.keys()) or len(event_info['subj']) < 2 \
                    or 'dobj' not in list(event_info.keys()) or len(event_info['dobj']) < 2:
                continue

            event_info['keyword'] = verb
            event = event_info
            break  # return only one event

        return event

    def extract_soft_events(self, dependency_tree, dependency_relations,
                            ner_tags):

        entity_categories = ['PERSON', 'LOCATION', 'ORGANIZATION']
        accepted_relation_keys = [
            'nsubj', 'nsubjpass', 'amod', 'dobj', 'advmod', 'nmod', 'xcomp',
            'compound:prt', 'compound', 'neg'
        ]

        keyword = self.lemmatizer.lemmatize(dependency_tree.label(), 'v')

        event = {'keyword': keyword}
        for dependency_relation in dependency_relations:
            if len(dependency_relation) == 3:
                head = dependency_relation[0]
                relation = dependency_relation[1]
                tail = dependency_relation[2]

                if head[0] == keyword and relation in accepted_relation_keys:
                    event[relation] = self.lemmatizer.lemmatize(
                        tail[0].lower())
        # print(event)
        return event

    def extract_event_from_sentence(self, sentence):
        event = None
        sentence_preprocessor = Preprocessor(['remove_non_letters'])

        processed_sentence = sentence_preprocessor.preprocess(sentence)

        sent_dependencies = self.dependency_parser.raw_parse(
            processed_sentence)
        sent_ner_tags = self.ner_tagger.tag_sents([processed_sentence.split()])[0]  # tags for the single sentence
        dependencies = [list(parse.triples()) for parse in sent_dependencies]

        if len(dependencies) > 0 and dependencies[0] is not None:
            event = self.extract_events_from_stanford_dependencies(
                dependencies[0], sent_ner_tags)
        else:
            # no dependencies could be extracted; fall back to the raw sentence
            event = {'keyword': sentence}

        return event

    def extract_event_from_sentences(self, sentences):
        events = []
        sentence_preprocessor = Preprocessor(['remove_non_letters'])

        chunks = list(
            self.utilities.chunkify_list(data_list=sentences,
                                         items_per_chunk=1000))

        for chunk in chunks:
            sentences = []
            for chunk_item in chunk:
                sentences.append(sentence_preprocessor.preprocess(chunk_item))

            chunk_sent_dependencies = self.dependency_parser.raw_parse_sents(
                sentences)
            chunk_sent_ner_tags = self.ner_tagger.tag_sents(
                [sentence.split() for sentence in sentences])

            for sent_dependencies, sent_ner_tags, sentence in zip(
                    chunk_sent_dependencies, chunk_sent_ner_tags, sentences):
                temp_sent_dependencies_1, temp_sent_dependencies_2 = itertools.tee(
                    sent_dependencies, 2)
                dependency_relations = [
                    list(parse.triples()) for parse in temp_sent_dependencies_1
                ]
                dependency_tree = [
                    parse.tree() for parse in temp_sent_dependencies_2
                ][0]

                if len(dependency_relations) > 0 and dependency_relations[
                        0] is not None and len(dependency_relations[0]) > 0:
                    # print(sentence)
                    event = self.extract_soft_events(dependency_tree,
                                                     dependency_relations[0],
                                                     sent_ner_tags)
                else:
                    event = {'keyword': sentence}

                events.append(event)

        return events
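# Hypothetical usage of the EventDetector class above; the Stanford jar/model
# paths in __init__ and the Preprocessor/Utilities helpers are assumed to exist.
detector = EventDetector()
event = detector.extract_event_from_sentence('John met Mary in Paris.')
print(event)   # roughly: {'keyword': 'meet', 'subj': 'PERSON', 'dobj': 'PERSON'}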
Example #14
lemmatizer = WordNetLemmatizer()

i = 0
num_reviews = 100
for review in all_reviews:
    #  if i<=6:
    #      i+=1
    #      continue
    sentences = sentence_break([review])
    print review
    #  print sentences
    sentiment = [0] * 5

    ############################################################
    try:
        parses = dependency_parser.raw_parse_sents(sentences)
    except Exception as e:
        print e
        sentiments.append(sentiment)
        continue
    idx = 0
    for parse in parses:
        dep = parse.next()
        dep = list(dep.triples())
        if not sentences[idx].strip():
            continue
#  neg = False
        for word_pair in dep:
            t = is_valid(word_pair[1], word_pair[0][1], word_pair[2][1])
            if t == -1:
                #  print "INVALID TAG",word_pair