Example #1
    def load(self):
    
        self.target_nodes = self.config.get('target_nodes', '//utt')    
        self.input_attribute = self.config.get('input_attribute', 'norm_text')
        
        self.merge_clitics = self.config.get('merge_clitics', 'True') ## string, not bool
    
        ## check tools exist:
        corenlp_location = os.path.join(self.voice_resources.path[c.BIN], '..', \
                                                            'corenlp-python', 'corenlp')
        assert os.path.isdir(corenlp_location)
        sys.path.append(corenlp_location)
        from corenlp import StanfordCoreNLP
        corenlp_dir = os.path.join(corenlp_location, '..', 'stanford-corenlp-full-2014-06-16')
        
        ## Each document is to be treated as one sentence, no sentence splitting at all. 
        ## Write config for this if necessary:
        corenlp_conf_name = 'no_sentence_split.properties'
        corenlp_conf_file = os.path.join(corenlp_location, corenlp_conf_name)
        if not os.path.isfile(corenlp_conf_file):
            data = ['annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref', \
                    'ssplit.isOneSentence = true']
            writelist(data, corenlp_conf_file)

        print 'Loading stanford corenlp modules from %s ...'%(corenlp_dir)
        print 'Takes a while (~20-30 seconds)...'
        self.models = StanfordCoreNLP(corenlp_dir, properties=corenlp_conf_name)     
def splitCol8toWords(): 
    ## use CoreNLP to split the text into tokens
    ## CoreNLP setup
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    
    ## load dataset
    with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTupleDec9.json') as t:
        trainTup = json.load(t)

    fres = open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_Dec9.txt','w')

    split_res = []
    for num,tup in enumerate(trainTup):
        ## after modifying col8 and saving, col8 may now be empty
        if not tup[8]:
            continue
        ## use CoreNLP to split it up
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]['words']
        temp = []
        for s in slist:
            temp.append(s[0])
        split_res.append([tup[0],tup[1],tup[6],temp])
        fres.write(tup[0]+'\t'+tup[1]+'\t'+tup[6]+'\t'+','.join(temp)+'\n')
        print 'No.', num,tup[6]
        print tup[8]
        print [tup[0],tup[1],tup[6],temp]
    ## record new dataset
    with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_Dec9.json','w') as f:
        json.dump(split_res,f)
    fres.close()
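
The parse -> json.loads -> word[0] pattern above recurs in several of the examples below; a minimal helper sketch of that step, assuming the same corenlp-python wrapper whose parse() returns a JSON string (the helper name tokens_from_parse is ours, not from the original code):

import json

def tokens_from_parse(corenlp, text):
    """Return the token strings of the first sentence CoreNLP finds in `text`."""
    par = json.loads(corenlp.parse(text))
    return [w[0] for w in par["sentences"][0]["words"]]
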
Example #3
def stanford_parse(corpus_path):
    """Parse a directory (recursively) with the Stanford parser..."""
    import os
    import ast
    try:
        from corenlp import StanfordCoreNLP
    except:
        raise ValueError("CoreNLP not installed.")
    path_part, corpus_name = os.path.split(corpus_path)
    new_corpus_folder = 'parsed_%s' % corpus_name
    new_corpus_path = os.path.join(path_part, new_corpus_folder)
    if not os.path.exists(new_corpus_path):
        os.makedirs(new_corpus_path)
    corenlp = StanfordCoreNLP()
    files = os.listdir(corpus_path)
    for root, dirs, files in os.walk(corpus_path, topdown=True):
        for name in files:
            filepath = os.path.join(root, name)
            with open(filepath) as f:
                raw = f.read()
            parsed_text = ast.literal_eval(corenlp.parse(raw))
            for index, sent in enumerate(parsed_text['sentences']):
                syntax_tree = sent['parsetree']
                plain_text = sent['text']
            subcorpus_name = os.path.basename(root)  # assumption: one sub-corpus per source folder
            subcorpus_path = os.path.join(new_corpus_path, subcorpus_name)
            if not os.path.exists(subcorpus_path):
                os.makedirs(subcorpus_path)
def afterModifyCol8_splitCol8():
    col8_splitup = []
    with open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/npovTrail2/Nov8data/train_afterdataclean_modifiedcleanedTupleNov8.json') as t:
        trainTup = json.load(t)
    corenlp_dir = "stanford-corenlp-full-2014-08-27V3.4.1"
    corenlp = StanfordCoreNLP(corenlp_dir)
    for num, tup in enumerate(trainTup):
        print 'No.',num
        print 'orig: ', tup[8]
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        print par
        slist =  par["sentences"][0]['words']
        # print slist
        temp = []
        for s in slist:
            temp.append(s[0])
        col8_splitup.append(temp)
        print temp
        ## check dependencies split
        dlist = par['sentences'][0]['dependencies']
        demp = []
        for d in dlist:
            demp.append(d)
        print demp
        if num == 4:
            break
def afterModifyCol8_splitCol8():
    col8_splitup = []
    with open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/npovTrail2/Nov8data/train_afterdataclean_modifiedcleanedTupleNov8.json') as t:
        trainTup = json.load(t)
    corenlp_dir = "stanford-corenlp-full-2014-08-27V3.4.1"
    # corenlp_dir = "stanford-corenlp-full-2015-01-29"
    # corenlp_dir = "stanford-corenlp-full-2013-06-20"
    corenlp = StanfordCoreNLP(corenlp_dir)
    # res = corenlp.parse("Bell, a company which is based in LA, makes and distributes computer products. I hate you.")
    # par = json.loads(res)
    # for i in  par["sentences"][0]['dependencies']:
    #     print i
    for num, tup in enumerate([trainTup[1853]]):
        print 'No.',num
        print 'orig: ', tup[8]
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        # print par
        slist =  par["sentences"][0]['words']
        # print slist
        temp = []
        for s in slist:
            temp.append(s[0])
        col8_splitup.append(temp)
        print temp
        ## check dependencies split
        dlist = par['sentences'][0]['dependencies']
        demp = []
        for d in dlist:
            demp.append(d)
        print demp
        if num == 4:
            break
def f1f2f3f4f5f6f7(file_,file2_):
    ## use CoreNLP to split the text into tokens
    ## CoreNLP setup
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    ## load dataset
    with open(file_) as t:
        trainTup = json.load(t)
    ## data structure to hold fea1 to fea7 a list
    feaLst = []
    for num,tup in enumerate(trainTup):
        ## after modifying col8 and saving, col8 may now be empty
        if not tup[8]:
            continue
        print "No. %d tup in processing.." % (num)
        ## use CoreNLP to split it up
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        print tup[8]
        ## use corenlp to get lemma and pos
        for p,word in enumerate(par["sentences"][0]['words']):
            print str(p)+'th w in tupl '+str(num)
            
            
            tmp = {}
            tmp['Word'] = word[0]
            tmp['Lemma'] = word[1]['Lemma']
            tmp['POS'] = word[1]['PartOfSpeech']
            feaLst.append(tmp)
        ## add pos-1,pos+1,pos-2 and pos+2    
        slen = len(feaLst)
        for ind,val in enumerate(feaLst):
            if (ind-1) >= 0 and (ind-1) <= slen-1:
                val['POS-1'] = feaLst[ind-1]['POS']
            else:
                val['POS-1'] = "NA"

            if (ind+1) >= 0 and (ind+1) <= slen -1:
                val['POS+1'] = feaLst[ind+1]['POS']
            else:
                val['POS+1'] = "NA"

            if (ind-2) >= 0 and (ind-2) <= slen -1:
                val['POS-2'] = feaLst[ind-2]['POS']
            else:
                val['POS-2'] = "NA"

            if (ind+2) >=0 and (ind+2) <= slen -1:
                val['POS+2'] = feaLst[ind+2]['POS']
            else:
                val['POS+2'] = "NA"
        
    for i in feaLst:
        print 'w:',i['Word'],' lemma:',i['Lemma'],' pos-2:',i['POS-2'],' pos-1:',i['POS-1'],' pos:',i['POS'],' pos+1:',i['POS+1'],' pos+2:',i['POS+2']
        
    with open(file2_,'w') as o:
        json.dump(feaLst,o)
    print len(feaLst)
    print len(trainTup)
def corenlpLemmaPOS_stanfordparserDependency_split_equalChecking():
    ## corenlp setting
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    ## stanfordDependencies setting
    sd = StanfordDependencies.get_instance(backend="subprocess", version="3.4.1")
    os.environ["STANFORD_PARSER"] = "stanford-parser-full-2014-08-27/"
    os.environ["STANFORD_MODELS"] = "stanford-parser-full-2014-08-27/"
    parser = stanford.StanfordParser(
        model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )

    with open("../../dataclean_Nov8_2015/train_afterdataclean_modifiedcleanedTupleNov8.json") as t:
        trainTup = json.load(t)
    for num, tup in enumerate(trainTup):
        ## after modifying col8 and saving, col8 may now be empty
        if not tup[8]:
            continue
        ## use corenlp to split sentence
        print "No.", num
        print tup[8]
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]["words"]
        print slist
        temp = []
        for s in slist:
            temp.append(s[0])
        print temp
        ## use the Stanford parser + StanfordDependencies to split the sentence
        sentences = parser.raw_parse(tup[8])
        s = ""
        for line in sentences:
            for sentence in line:
                s += str(sentence)

        sent = sd.convert_tree(s)
        print sent
        detemp = []
        for t in sent:
            detemp.append(t[1])
        print detemp
        for di, ti in zip(detemp, temp):
            if di == ti:
                pass
            else:
                if (
                    (ti == "(" and di == "-LRB-")
                    or (ti == ")" and di == "-RRB-")
                    or (ti == "[" and di == "-LSB-")
                    or (ti == "]" and di == "-RSB-")
                ):
                    print "diff in parenthesis"
                    pass
                else:
                    print "{", di, " ,", ti, " }"
def checkCoreNLPSplit_DependencySplit(file_):
    with open(file_) as f:
        tset = json.load(f)
    ## corenlp setting
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    
    ## stanfordDependencies setting
    sd = StanfordDependencies.get_instance(backend="subprocess",version='3.4.1')
    os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2014-08-27/'
    os.environ['STANFORD_MODELS'] = 'stanford-parser-full-2014-08-27/'
    parser = stanford.StanfordParser(model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    
    for num, tup in enumerate(tset):
        print num
        if not tup[8]:
            continue
        ## use CoreNLP to split it up
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]['words']
        temp = []
        for s in slist:
            temp.append(s[0])
        
        ## use the Stanford parser + StanfordDependencies to split the sentence
        sentences = parser.raw_parse(tup[8])
        s=""
        for line in sentences:
            for sentence in line:
                s+=str(sentence)

        sent = sd.convert_tree(s)
        
        detemp = []
        for t in sent:
            detemp.append(t[1])
            
        ## check if same

        for di,ti in zip(detemp,temp):
            if di == ti:
                pass
            else:
                if (ti == '(' and di == '-LRB-') or (ti == ')' and di == '-RRB-') or (ti == '[' and di == '-LSB-') or (ti == ']' and di == '-RSB-'):
                    print "diff in parenthesis"
                    pass
                else:
                    print "!!!"
                    print "{",di,' ,',ti," }"
def parse(sentence):

    from corenlp import StanfordCoreNLP

    parser = StanfordCoreNLP()

    data = parser.parse(sentence)
    #print data

    open_file = open("data.json", "wb")

    open_file.write(data)

    open_file.close()
Example #10
def request_features_from_stanford(data_dir, flag):
    all_sentences, _ = read_tsv(path.join(data_dir, flag + '.tsv'))
    sentences_str = []
    for sentence in all_sentences:
        sentence = ['·' if i == '.' else i for i in sentence]
        if sentence[-1] == '·':
            sentence[-1] = '.'
        sentences_str.append(' '.join(sentence))

    all_data = []
    with StanfordCoreNLP(FULL_MODEL, lang='en') as nlp:
        for sentence in tqdm(sentences_str):

            props = {
                'timeout': '5000000',
                'annotators': 'pos, parse, depparse',
                'pipelineLanguage': 'en',
                'outputFormat': 'json'
            }
            results = nlp.annotate(sentence, properties=props)
            # results = nlp.request(annotators='deparser', data=sentence)
            # results = nlp.request(annotators='pos', data=sentence)
            # result = results['sentences'][0]

            all_data.append(results)
    # assert len(all_data) == len(sentences_str)
    with open(path.join(data_dir, flag + '.stanford.json'),
              'w',
              encoding='utf8') as f:
        for data in all_data:
            json.dump(data, f, ensure_ascii=False)
            f.write('\n')
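
The file written above is JSON Lines (one JSON object per line), so it has to be read back line by line rather than with a single json.load; a minimal reader sketch (the function name load_stanford_features is an assumption):

import json
from os import path

def load_stanford_features(data_dir, flag):
    """Read back <flag>.stanford.json written above, one record per line."""
    records = []
    with open(path.join(data_dir, flag + '.stanford.json'), encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records
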
Example #11
def request_features_from_stanford(data_dir, flag):
    all_sentences = read_txt(path.join(data_dir, flag + '.txt'))
    sentences_str = []
    for sentence, tags in all_sentences:
        sentence = [change(i) for i in sentence]
        sentences_str.append([sentence, tags])
    all_data = []
    with StanfordCoreNLP(FULL_MODEL, lang='en') as nlp:
        for sentence, tags in tqdm(sentences_str):
            props = {
                'timeout': '5000000',
                'annotators': 'pos, parse, depparse',
                'tokenize.whitespace': 'true',
                'ssplit.eolonly': 'true',
                'pipelineLanguage': 'en',
                'outputFormat': 'json'
            }
            results = nlp.annotate(' '.join(sentence), properties=props)
            results["tags"] = tags
            results["word"] = sentence
            all_data.append(results)
    assert len(all_data) == len(sentences_str)
    with open(path.join(data_dir, flag + '.stanford.json'),
              'w',
              encoding='utf8') as f:
        for data in all_data:
            json.dump(data, f, ensure_ascii=False)
            f.write('\n')
Example #12
class CoreNLP(object):
    '''Connect CoreNLP server'''
    _NLP = StanfordCoreNLP(
        os.environ.get('CORENLP_URL') or 'http://localhost:9000')
    _LOCAL_DEMO_PROP = {
        'annotators':
        'tokenize, ssplit, pos, lemma, ner, depparse, openie, coref',
        "openie.resolve_coref": "true",
        'outputFormat': 'json'
    }
    _ONLINE_DEMO_PROP = {
        "annotators": "tokenize,ssplit,pos,ner,depparse,openie,coref",
        "coref.md.type": "dep",
        "coref.mode": "statistical",
        'outputFormat': 'json'
    }

    @staticmethod
    def annotate(text):
        '''Get result from CoreNLP via JSON'''
        try:
            return CoreNLP.nlp().annotate(text,
                                          properties=CoreNLP._ONLINE_DEMO_PROP)
        except UnicodeError:
            pprint(text)

    @staticmethod
    def nlp():
        '''Return CoreNLP Server'''
        return CoreNLP._NLP
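
A minimal usage sketch for the class above, assuming a CoreNLP server is reachable at CORENLP_URL (or localhost:9000) and that the wrapper hands back the parsed JSON document as a dict when outputFormat is 'json' (annotate() above returns None on UnicodeError, hence the guard):

doc = CoreNLP.annotate("Barack Obama was born in Hawaii.")
if doc:
    for sent in doc.get('sentences', []):
        print(' '.join(tok['word'] for tok in sent['tokens']))
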
Example #13
class NLPParser(object):
    """
    NLP parse, including Part-Of-Speech tagging and dependency parse.
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford CoreNLP parser
    """
    def __init__(self, corenlp_dir):
        self.parser = StanfordCoreNLP(corenlp_dir)

    def parse(self, sent):
        """
        Part-Of-Speech tagging and dependency parse.
        :param sent: string
        :return: a list of tuple (word, pos, dependency)
        """
        result = self.parser.raw_parse(sent)
        tuples = []
        for s in result['sentences']:
            word, pos, dependency = [], [], []
            for dep in s['dependencies']:
                dependency.append({'type': dep[0], 'dep': int(dep[2])-1, 'gov': int(dep[4])-1})
            for w in s['words']:
                word.append(w[0])
                pos.append(w[1]['PartOfSpeech'])
            tuples.append((word, pos, dependency))
        return tuples
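
A short usage sketch for NLPParser; the CoreNLP directory is an assumption (any unpacked distribution supported by the corenlp-python wrapper), and the commented output is only illustrative:

parser = NLPParser('stanford-corenlp-full-2014-08-27/')
for word, pos, dependency in parser.parse('The cat sat on the mat.'):
    print(list(zip(word, pos)))   # e.g. [('The', 'DT'), ('cat', 'NN'), ...]
    print(dependency)             # e.g. [{'type': 'det', 'dep': 0, 'gov': 1}, ...]
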
Example #14
class Parser():
    def __init__(self):
        #corenlp_dir = "/export/data/ghpaetzold/simpatico/server_simplifiers/core_nlp/stanford-corenlp-full-2016-10-31/"
        corenlp_dir = "/export/data/cscarton/simpatico/stanford-corenlp-full-2016-10-31/"
        self.corenlp = StanfordCoreNLP(corenlp_dir, memory="4g", properties='galician.myproperties.properties')
    
    def process(self, sentence):
        #sentences = open(self.doc, "r").read().strip().split("\n")
        #sentences = [l.strip().split(' ') for l in f_read]
        #dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        return self.corenlp.raw_parse(sentence)['sentences'][0]

    def transform(self, parsed):
        dict_dep = {}
        for rel, _, head, word, n in parsed['dependencies']:
            
            n = int(n)
            head = int(head)

            if head not in dict_dep.keys():
                dict_dep[head] = {}
            if rel not in dict_dep[head].keys():
                dict_dep[head][rel] = []

            dict_dep[head][rel].append(n)
                


        return dict_dep
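
A worked sketch of the mapping transform() builds (head index -> relation -> list of dependent indices). The stub below is hand-made in the five-field shape the loop unpacks -- (relation, governor word, governor index, dependent word, dependent index) -- so the values are illustrative only; real input comes from raw_parse():

def transform_deps(dependencies):
    dict_dep = {}
    for rel, _, head, word, n in dependencies:
        dict_dep.setdefault(int(head), {}).setdefault(rel, []).append(int(n))
    return dict_dep

deps_stub = [
    ('det',   'cat', '2', 'The', '1'),
    ('nsubj', 'sat', '3', 'cat', '2'),
    ('root',  'ROOT', '0', 'sat', '3'),
]
print(transform_deps(deps_stub))
# {2: {'det': [1]}, 3: {'nsubj': [2]}, 0: {'root': [3]}}
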
Example #15
def request_features_from_stanford(data_path, do_predict=False):
    data_dir = data_path[:data_path.rfind('/')]
    flag = data_path[data_path.rfind('/') + 1:data_path.rfind('.')]

    if os.path.exists(path.join(data_dir, flag + '.stanford.json')):
        print('The Stanford data file for %s already exists!' % str(data_path))
        return None

    print('Requesting Stanford results for %s' % str(data_path))

    if do_predict:
        all_sentences, _ = read_sentence(data_path)
    else:
        all_sentences, _ = read_tsv(data_path)
    sentences_str = []
    for sentence in all_sentences:
        sentences_str.append(''.join(sentence))

    all_data = []
    with StanfordCoreNLP(FULL_MODEL, lang='zh') as nlp:
        for sentence in tqdm(sentences_str):
            results = nlp.request(annotators='parse,depparse', data=sentence)
            # result = results['sentences'][0]
            result = merge_results(results['sentences'])
            all_data.append(result)
    # assert len(all_data) == len(sentences_str)
    with open(path.join(data_dir, flag + '.stanford.json'),
              'w',
              encoding='utf8') as f:
        for data in all_data:
            json.dump(data, f, ensure_ascii=False)
            f.write('\n')
def sentToParse(Res, num_sents):
    # load corenlp
    sys.path.insert(0, osp.join(ROOT_DIR, 'pyutils', 'corenlp'))
    from corenlp import StanfordCoreNLP
    parser_path = osp.join(ROOT_DIR, 'pyutils', 'corenlp',
                           'stanford-corenlp-full-2015-01-30')
    stanfordParser = StanfordCoreNLP(parser_path)
    num_sents = len(Res) if num_sents < 0 else num_sents
    print 'stanford parser loaded.'
    # start parsing
    num_sents = len(Res) if num_sents < 0 else num_sents
    for i in range(num_sents):
        ref_id, sent = Res[i]['ref_id'], Res[i]['sent']
        parse = stanfordParser.raw_parse(sent)['sentences'][0]
        Res[i]['parse'] = parse
        print '%s/%s sent is parsed.' % (i + 1, num_sents)
Example #17
class NLPParser(object):
    """
    NLP parse, including Part-Of-Speech tagging and dependency parse.
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford CoreNLP parser
    """
    def __init__(self, corenlp_dir):
        self.parser = StanfordCoreNLP(corenlp_dir)

    def parse(self, sent):
        """
        Part-Of-Speech tagging and dependency parse.
        :param sent: string
        :return: a list of tuple (word, pos, dependency)
        """
        result = self.parser.raw_parse(sent)
        tuples = []
        for s in result['sentences']:
            word, pos, dependency = [], [], []
            for dep in s['dependencies']:
                dependency.append({
                    'type': dep[0],
                    'dep': int(dep[2]) - 1,
                    'gov': int(dep[4]) - 1
                })
            for w in s['words']:
                word.append(w[0])
                pos.append(w[1]['PartOfSpeech'])
            tuples.append((word, pos, dependency))
        return tuples
Example #18
def getClient():
    socket = TSocket.TSocket('localhost', port)
    socket.setTimeout(10000)
    transport = TTransport.TBufferedTransport(socket)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = StanfordCoreNLP.Client(protocol)
    transport.open()
    return client
def remove_tuples0MoreThan1BiasedWord_fromOriginalTuple():
    ## use CoreNLP to split the text into tokens
    ## CoreNLP setup
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    
    ## load dataset
    with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTupleDec9.json') as t:
        trainTup = json.load(t)
    # with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_stripPuncNum_Dec9.json') as a:
        # verify = json.load(a)
    b =  open('../../devDataclean_Dec8_2015/dev_biasword0ormorethan1_modifiedFile_dec11.txt','w')

    res2 = []
    for num,tup in enumerate(trainTup):
        print num
        ## after modifying col8 and saving, col8 may now be empty
        if not tup[8]:
            continue
        ## use CoreNLP to split it up
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]['words']
        temp = []
        for s in slist:
            temp.append(s[0])

        ## count of biased word
        cnum = temp.count(tup[6])
        
        if cnum == 1:
            ## verify if the qualified sent is the same as the split col8 file: dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_Dec9.json
            # if (verify[num][2] == tup[6]) and (verify[num][0] == tup[0]):
            res2.append(tup)
            # else:
                # print "two file are diff"
                # print verify[num]
                # print tup
                # sys.exit()
        else:
            b.write(str(tup)+'\n') 
    with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_elimBiasWord0orMoreThanOne_fullTup_Dec11.json','w') as f:
        json.dump(res2,f)
    b.close()
Example #20
class NLPParser(object):
    """
    NLP parse, including Part-Of-Speech tagging.
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford CoreNLP parser
    """
    def __init__(self, corenlp_dir):
        self.parser = StanfordCoreNLP(corenlp_dir)

        #self.parser = POSTagger(corenlp_dir+'/models/english-bidirectional-distsim.tagger', corenlp_dir+'/stanford-postagger.jar')
    def parse(self, sent):
        """
        Part-Of-Speech tagging
        :param sent: string
        :return: a list of tuple (tokens, pos)
        """
        """
        tokens = []
        pos = []
        result = self.parser.tag(sent.split())
        for entry in result:
            tokens.append(entry[0])
            pos.append(entry[1])
        tuples = [tokens, pos]
        return tuples
        """
        result = self.parser.raw_parse(sent)
        tuples = []
        word, pos = [], []
        for s in result['sentences']:
            for w in s['words']:
                word.append(w[0])
                pos.append(w[1]['PartOfSpeech'])

            pattern = re.compile(r'\[Text=')
            tokenpattern = re.compile(r'\[Text=[^\s]+\s')
            pospattern = re.compile(r'PartOfSpeech=[^\s]+\s')
            startIdxed = []
            for t in re.finditer(pattern, s['parsetree']):
                startIdxed.append(t.start())
            for i in range(len(startIdxed)):
                start = startIdxed[i]
                if i < len(startIdxed) - 1:
                    end = startIdxed[i+1]
                else:
                    end = -1
                token = s['parsetree'][start:end]
                text = re.findall(tokenpattern, token)
                partOfSpeech = re.findall(pospattern, token)
                word.append(text[0][6:-1])
                pos.append(partOfSpeech[0][13:-1])
        tuples.append((word, pos))
        #print tuples
        return tuples
    def __init__(self, fallback=False):
        self.FILE = "nlp_infos.p"
        self.data = None
        self.data_length = None
        self.corenlp_dir = "helper/stanfordnlp/corenlp-python/stanford-corenlp-full-2013-11-12/"
        if fallback:
            try:
                self.corenlp = StanfordCoreNLP(self.corenlp_dir)
            except TIMEOUT:
                print "Stanford CoreNLP Timeout"
    def __init__(self, project):
        """Instantiate and ready the parser. Note that readying the parser takes
        some time.
        """
        self.parser = StanfordCoreNLP(app.config["CORE_NLP_DIR"])
        self.project = project

        logger = logging.getLogger(__name__)
        global project_logger
        project_logger = ProjectLogger(logger, project)
Example #23
def stanford_parse(data, corpus_name = 'corpus'):
    from time import localtime, strftime
    thetime = strftime("%H:%M:%S", localtime())
    print "\n%s: Initialising CoreNLP... \n" % thetime
    import os
    import ast
    try:
        from corenlp import StanfordCoreNLP
    except:
        raise ValueError("CoreNLP not installed.")
    from corpkit.progressbar import ProgressBar
    corenlp = StanfordCoreNLP()
    if not os.path.exists(corpus_name):
        os.makedirs(corpus_name)
    p = ProgressBar(len(data))
    for index, datum in enumerate(data):
        p.animate(index)
        text = datum[0]
        metadata = datum[1]
        number_of_zeroes = len(str(len(data))) - 1
        filename = str(index).zfill(number_of_zeroes) + '.txt' 
        file_data = []
        parsed_text = ast.literal_eval(corenlp.parse(text))
        trees = []
        raw_texts = []
        for index, sent in enumerate(parsed_text['sentences']):
            syntax_tree = sent['parsetree']
            plain_text = sent['text']
            trees.append(syntax_tree)
            raw_texts.append(plain_text)
                    #subcorpus_path = os.path.join(new_corpus_path, subcorpus_name)
        file_data = ['<raw>' + '\n'.join(raw_texts) + '\n</raw>', '<parse>' + '\n'.join(trees) + '\n</parse>', ]
        if not os.path.exists(os.path.join(corpus_name, metadata)):
            os.makedirs(os.path.join(corpus_name, metadata))
        try:
            fo=open(os.path.join(corpus_name, metadata, filename),"w")
        except IOError:
            print "Error writing file."
        fo.write('\n'.join(file_data))
        fo.close()
    p.animate(len(data))
    print 'Done!'
Example #24
def parse_file():
    sentence_file = open('sentences.txt', 'w')
    dep_file = open('deps.txt', 'w')
    tree_file = open('trees.txt', 'w')
    abstracts = [line.strip() for line in open('relabs.txt', 'r')]
    corenlp = StanfordCoreNLP()
    for abstract in abstracts:
        parse = corenlp.parse(abstract)
        xml = json.loads(parse)
        sentences = xml['sentences']
        for sentence in sentences:
            # Write sentence
            sentence_file.write(sentence['text'] + "\n")
            # Write parse tree
            tree_file.write(sentence['parsetree'] + "\n")
            # Write dependencies
            for dep in sentence['dependencies']:
                dep_file.write('@'.join(dep) + "\t")
            dep_file.write("\n")
    dep_file.close()
    tree_file.close()
    sentence_file.close()
Example #25
def split_one(params):
    from corenlp import StanfordCoreNLP
    index, fn, start, end = params
    # skip 'start' line
    lno = 0
    fin = open(fn)
    while lno < start:
        fin.readline()
        lno += 1

    ret = []
    parser = StanfordCoreNLP("stanford-corenlp-full-2015-01-29/", properties="default.properties", serving=False)
    for i in xrange(start, end):
        line = fin.readline()

        ll = line.decode('utf8').strip().split('\t')
        """pay attention to here !!!"""
        # if len(ll) != 3:
        #     continue
        # if not ll[2].endswith('@en'):
        #     continue
        # text = ll[2][1:-4]
        if len(ll) != 2:
            continue
        text = ll[1]
        text = text.replace('\\n', ' ').replace('\\r', ' ')
        try:
            rsp = json.loads(parser.parse(text))

            sentences = []
            for s in rsp['sentences']:
                sentences.append(s['text'])
            ret.append(('%s\t%s' % (ll[0], '\t'.join(sentences))).encode('utf8'))
        except Exception as e:
            print e

    fin.close()
    return ret
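
split_one() takes a single (index, filename, start, end) tuple so a large file can be cut into line ranges and parsed in parallel, each worker starting its own CoreNLP instance; a minimal driver sketch (the chunking, worker count and the name split_all are assumptions, not part of the original code):

from multiprocessing import Pool

def split_all(fn, total_lines, n_workers=4):
    """Run split_one over `fn` in roughly equal line ranges, in parallel."""
    step = (total_lines + n_workers - 1) // n_workers
    params = [(i, fn, i * step, min((i + 1) * step, total_lines))
              for i in range(n_workers)]
    pool = Pool(n_workers)
    try:
        results = pool.map(split_one, params)
    finally:
        pool.close()
        pool.join()
    # flatten the per-chunk lists of tab-separated, utf-8 encoded lines
    return [line for chunk in results for line in chunk]
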
Example #26
def local_split_description(fn_in, fn_out):
    from corenlp import StanfordCoreNLP
    parser = StanfordCoreNLP("stanford-corenlp-full-2015-01-29/", properties="default.properties", serving=False)
    with open(fn_out, 'w') as fout:
        with open(fn_in) as fin:
            for line in fin:
                ll = line.decode('utf8').strip().split('\t')
                if len(ll) != 3:
                    continue
                if not ll[2].endswith('@en'):
                    continue
                text = ll[2][1:-4]

                text = text.replace('\\n', ' ').replace('\\r', ' ')
                try:
                    rsp = json.loads(parser.parse(text))

                    sentences = []
                    for s in rsp['sentences']:
                        sentences.append(s['text'])

                    print >> fout, ('%s\t%s' % (ll[0], '\t'.join(sentences))).encode('utf8')
                except Exception as e:
                    print e.message
    def run(self):
        import sys
        from corenlp import StanfordCoreNLP
        import jsonrpc

        sys.__stdin__ = sys.__stdout__

        server = jsonrpc.Server(
            jsonrpc.JsonRpc20(),
            jsonrpc.TransportTcpIp(addr=("0.0.0.0", int(self.port))))

        nlp = StanfordCoreNLP()
        server.register_function(nlp.parse)
        server.register_function(nlp.parse_file)
        print "registering parse_file"
        server.register_function(lambda *a, **k: 'pong', 'ping')

        try:
            server.serve()
        except KeyboardInterrupt:
            print("%d exiting" % self.port)
def request_features_from_stanford(data_dir, flag):
    all_sentences, _ = read_txt(path.join(data_dir, flag + '.txt'))
    sentences_str = []
    for sentence in all_sentences:
        sentence = [change(i) for i in sentence]
        # if sentence[-1] == '·':
        #     sentence[-1] = '.'
        sentences_str.append(' '.join(sentence))

    all_data = []
    with StanfordCoreNLP(FULL_MODEL, lang='zh', port=randint(38400,
                                                             38596)) as nlp:
        for sentence in tqdm(sentences_str):

            props = {
                'timeout': '5000000',
                'annotators': 'pos, parse, depparse',
                'tokenize.whitespace': 'true',
                'ssplit.eolonly': 'true',
                'pipelineLanguage': 'en',
                'outputFormat': 'json'
            }
            results = nlp.annotate(sentence, properties=props)
            # results = nlp.request(annotators='deparser', data=sentence)
            # results = nlp.request(annotators='pos', data=sentence)
            # result = results['sentences'][0]

            all_data.append(results)
    # assert len(all_data) == len(sentences_str)
    print(all_data)
    with open(path.join(data_dir, flag + '.stanford.json'),
              'w',
              encoding='utf8') as f:
        for data in all_data:
            json.dump(data, f, ensure_ascii=False)
            f.write('\n')
Example #29
import os
from nltk.tokenize import sent_tokenize
from corenlp import StanfordCoreNLP

# The directory in which the stanford core NLP .jar is located -- you have to
# download this from their website.
CORE_NLP_DIR = "stanford-corenlp-dir/"
PARSER = StanfordCoreNLP(CORE_NLP_DIR)

in_file = "sentences.txt"
text = open(in_file, 'r').read()
sentences = sent_tokenize(text)  # Break the text into sentences.
for i, sentence in enumerate(sentences):
    try:
        parse = PARSER.raw_parse(sentence)
        if i % 50 == 0:
            print " Entered sentence " + str(i) + " of " + str(len(sentences))
        write_parse_products(parse['sentences'][0])
    except Exception:
        print "Error on sentence:\n\t " + sentence + " \n "
        pass


def write_parse_products(parse):
    words = parse['words']

    word_objects = []
    text = ""
    for i, word_info in enumerate(words):
        properties = word_info[1]
        token = word_info[0].lower().strip()
#!/usr/bin/env python

import sys, bz2
sys.path.insert(0, '/Users/timpalpant/Documents/Workspace/corenlp-python')
import nltk
from nltk.tree import Tree
from corenlp import StanfordCoreNLP
from remove_random_word import remove_random_word

print("Booting StanfordCoreNLP")
nlp = StanfordCoreNLP()

print("Initializing train file")
train = bz2.BZ2File('../data/train_v2.txt.bz2')
for line in train:
    rline = remove_random_word(line)
    lparse = nlp.raw_parse(line)
    ltree = Tree.fromstring(lparse['sentences'][0]['parsetree'])
    rparse = nlp.raw_parse(rline)
    rtree = Tree.fromstring(rparse['sentences'][0]['parsetree'])
    print(ltree)
    print(rtree)
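
Tree.fromstring() above turns the 'parsetree' string into an nltk Tree object; a self-contained sketch of what that gives you on a literal bracketing (the sentence is made up for illustration):

from nltk.tree import Tree

t = Tree.fromstring("(ROOT (S (NP (PRP I)) (VP (VBP love) (NP (NN pizza)))))")
print(t.leaves())   # ['I', 'love', 'pizza']
print(t.pos())      # [('I', 'PRP'), ('love', 'VBP'), ('pizza', 'NN')]
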
Example #31
    def __init__(self):
        #corenlp_dir = "/export/data/ghpaetzold/simpatico/server_simplifiers/core_nlp/stanford-corenlp-full-2016-10-31/"
        corenlp_dir = "/export/data/cscarton/simpatico/stanford-corenlp-full-2016-10-31/"
        self.corenlp = StanfordCoreNLP(corenlp_dir, memory="4g", properties='galician.myproperties.properties')
Example #32
def stanfordParse(text, corenlpDir='corenlp/stanford-corenlp-full-2014-01-04'):
    global stanford
    if stanford is None:
        stanford = StanfordCoreNLP(corenlpDir)
    return stanford.raw_parse(text)
Example #33
class MyExtract(object):
    '''
    classdocs
    '''
    def __init__(self):
        '''
        constructor
        '''

        self.rawcorpus = None
        self.corpus = []
        self.pars = []
        self.wordspace = None
        self.docspace = None
        self.stop = set(stopwords.words('english'))
        self.parser = None
        self.prelations = []
        self.nrelations = []

    def buildRawCorpus(self, myfile):
        '''
        extract text from xml files
        '''

        corpus = ""
        for txtfile in glob.glob(devdata + myfile):

            print "reading " + txtfile

            xmldoc = minidom.parse(txtfile)
            itemlist = xmldoc.getElementsByTagName('text')
            for s in itemlist:
                text = s.firstChild.data
                if "." in text:
                    corpus = corpus + " " + text
        self.rawcorpus = corpus.encode("utf-8")

    def buildCorpus(self):
        '''
        preprocess raw text (tokenize, remove stopwords)
        '''

        sents = self.rawcorpus.split(".")
        for sent in sents:
            toks = [
                w.lower() for w in nltk.word_tokenize(sent.decode('utf-8'))
                if w.lower() not in self.stop
            ]
            self.corpus.append(toks)

    def tokenizeAbs(self, parag):
        '''
        preprocess raw text (tokenize, remove stopwords)
        '''

        toks = [
            w.lower() for w in nltk.word_tokenize(parag)
            if w.lower() not in self.stop
        ]
        return toks

    def buildRawSents(self, myfile):

        for txtfile in glob.glob(devdata + myfile):
            xmldoc = minidom.parse(txtfile)
            itemlist0 = xmldoc.getElementsByTagName('document')
            count = 0
            for it0 in itemlist0:
                parag = ""
                itemlist = it0.getElementsByTagName('text')
                for item in itemlist:
                    if '.' in item.firstChild.data:
                        parag = parag + " " + item.firstChild.data
                toks = self.tokenizeAbs(parag.encode("utf-8").decode('utf-8'))
                lab = [txtfile + '_' + ` count `]
                self.pars.append(doc2vec.LabeledSentence(words=toks, tags=lab))
                count = count + 1

    def exploreCDRCorpus(self, myfile, maxsize):
        '''
        extract entities + relations from xml
        '''

        diseases = {}
        chemicals = {}
        relations = []
        xmldoc = minidom.parse(myfile)
        itemlist0 = xmldoc.getElementsByTagName('document')
        count = 0
        for it0 in itemlist0:
            print "\t- processing abstract " + ` count `

            parsed = self.docspace.docvecs[myfile + "_" + ` count `]

            itemlist1 = it0.getElementsByTagName('annotation')
            print "\t\t+ " + ` len(itemlist1) ` + " entities"

            for it1 in itemlist1:

                itemlist2 = it1.getElementsByTagName('infon')
                typ = itemlist2[0].firstChild.data
                mesh = itemlist2[len(itemlist2) - 1].firstChild.data
                text = it1.getElementsByTagName(
                    'text')[0].firstChild.data.lower()
                codes = mesh.split('|')

                for code in codes:
                    ent = MyEntity(text, code, typ)
                    if (typ == 'Chemical'):
                        chemicals[code] = ent
                    if (typ == 'Disease'):
                        diseases[code] = ent

            itemlist3 = it0.getElementsByTagName('relation')

            print "\t\t+ " + ` 2 * len(
                itemlist3) ` + " positive and negative relations"
            print "\t\t\t* extracting features for positive relations"
            print "\t\t\t* extracting features for negative relations"

            for it3 in itemlist3:

                itemlist4 = it3.getElementsByTagName('infon')
                key1 = itemlist4[1].firstChild.data
                key2 = itemlist4[2].firstChild.data
                e1 = chemicals[key1]
                e2 = diseases[key2]
                e1.bow = self.avgBOW(e1.text)
                e2.bow = self.avgBOW(e2.text)
                rel = MyRelation(e1, e2, '1')
                rel.abs = parsed
                self.prelations.append(rel)
                relations.append(key1 + "_" + key2)
                num = 0

            for key1 in chemicals.keys():
                for key2 in diseases.keys():
                    if key1 + "_" + key2 not in relations:
                        if num < len(itemlist3):
                            e1 = chemicals[key1]
                            e2 = diseases[key2]
                            e1.bow = self.avgBOW(e1.text)
                            e2.bow = self.avgBOW(e2.text)
                            rel = MyRelation(e1, e2, '-1')
                            rel.abs = parsed
                            self.nrelations.append(rel)
                            num = num + 1

            count = count + 1
            if (count == maxsize):
                break

    def exploreDDICorpus(self, myfile, maxsize, ftyp):
        '''
        extract entities + relations from xml
        '''

        #print(myfile)

        xmldoc = minidom.parse(myfile)
        itemlist0 = xmldoc.getElementsByTagName('document')
        count = 0

        for it0 in itemlist0:

            # abstract with annotations
            print "\t- processing abstract " + ` count `
            drugs = {}

            # entities
            itemlist1 = it0.getElementsByTagName('annotation')
            print "\t\t+ " + ` len(itemlist1) ` + " entities"
            for it1 in itemlist1:

                itemlist2a = it1.getElementsByTagName('infon')
                typ = itemlist2a[0].firstChild.data
                print typ

                itemlist2b = it1.getElementsByTagName('text')
                text = itemlist2b[0].firstChild.data.lower()
                print text

                ent = MyEntity(text, "", typ)
                ent.bow = self.avgBOW(ent.text)
                drugs[text] = ent

            # abstract
            itemlist3 = it0.getElementsByTagName('text')
            abstract = ""
            for it3 in itemlist3:
                if (len(it3.firstChild.data.split()) > 3):
                    abstract = abstract + it3.firstChild.data

            # parse abstract
            parsed = self.parseSentence(abstract)  #stanford
            docvec = self.docspace.docvecs[myfile + "_" + ` count `]  #doc2vec

            #print len(drugs.keys())

            if (len(drugs.keys()) > 1):

                e1 = drugs[drugs.keys()[0]]
                e2 = drugs[drugs.keys()[1]]
                e1.bow = self.avgBOW(e1.text)
                e2.bow = self.avgBOW(e2.text)

                #print(ftyp)

                if (ftyp == "positive"):

                    #print(parsed)

                    rel = MyRelation(e1, e2, '1')
                    rel.abs = docvec
                    rel.parse = parsed.encode("utf-8")
                    self.prelations.append(rel)

                if (ftyp == "negative"):

                    #print(docvec)

                    rel = MyRelation(e1, e2, '-1')
                    rel.abs = docvec
                    rel.parse = parsed.encode("utf-8")
                    self.nrelations.append(rel)

            # increment counter
            count = count + 1
            if (count == maxsize):
                break

    def avgBOW(self, entity):
        bow = []
        ents = entity.split(" ")
        i = 0
        while i < self.wordspace.layer1_size:
            v = 0
            for ent in ents:
                if ent in self.wordspace.vocab:
                    v = v + self.wordspace[ent][i]
            bow.append(v / len(ents))
            i = i + 1
        return np.array(bow)

    def buildWordSpace(self, modelfile):
        '''
        compute distributional model
        '''

        model = Word2Vec(self.corpus,
                         min_count=1,
                         size=20,
                         iter=100,
                         workers=4)
        model.save(modelfile)
        self.wordspace = model

    def buildDocSpace(self, modelfile):
        '''
        compute distributional model
        '''

        model = doc2vec.Doc2Vec(self.pars,
                                min_count=5,
                                size=20,
                                iter=100,
                                workers=4)
        model.save(modelfile)
        self.docspace = model

    def loadWordSpace(self, modelfile):
        '''
        compute distributional model
        '''

        self.wordspace = Word2Vec.load(devdata + modelfile)

    def loadDocSpace(self, modelfile):
        '''
        compute distributional model
        '''

        self.docspace = doc2vec.Doc2Vec.load(devdata + modelfile)

    def loadParser(self):

        corenlp_dir = os.environ['STANFORD']
        self.parser = StanfordCoreNLP(corenlp_dir +
                                      "/")  # wait a few minutes...

    def parseSentence(self, sentence):

        parsed = self.parser.raw_parse(sentence)['sentences'][0]['parsetree']
        return parsed
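
avgBOW() above averages the word vectors of an entity string one dimension at a time, with out-of-vocabulary tokens contributing zeros; a vectorized sketch of the same computation with numpy, keeping the layer1_size/vocab attributes the original relies on (the function name avg_bow is an assumption):

import numpy as np

def avg_bow(wordspace, entity):
    """Mean of the word vectors of the tokens in `entity`; OOV tokens count as zero vectors."""
    ents = entity.split(" ")
    dim = wordspace.layer1_size
    vecs = [np.asarray(wordspace[e]) if e in wordspace.vocab else np.zeros(dim)
            for e in ents]
    return np.sum(vecs, axis=0) / len(ents)
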
Example #34
def run(debug, host, port, close, memory, input, output, arango, user, project,
        limit, pictures, summary, relations, corefs, newgraph, documentedges):
    uwsgi.cache_update('busy', b'1')

    if debug:
        logging.basicConfig(level=logging.DEBUG)
        logging.debug("Debug on.")
    else:
        logging.basicConfig(level=logging.INFO)

    nlp_bytes = None
    nlp_bytes = uwsgi.cache_get('nlp')

    # Set progress bar start parameters
    if nlp_bytes:
        init_time = 2
    else:
        init_time = 10

    if pictures or summary:
        nlp_time = 60
    else:
        nlp_time = 80

    yield "data:1\n\n"

    # If a Stanford CoreNLP server host and port are given, use them; otherwise start a new instance through the Python wrapper
    if host and port:
        if nlp_bytes:
            temp_nlp = pickle.loads(nlp_bytes)
            temp_nlp.close()

        nlp = StanfordCoreNLP(host, port)
        uwsgi.cache_update('nlp', pickle.dumps(nlp))
        logging.debug("nlp to cache: host {}".format(uwsgi.cache_get('nlp')))
    elif nlp_bytes:
        nlp = pickle.loads(nlp_bytes)
        logging.debug("nlp from cache: {}".format(uwsgi.cache_get('nlp')))
    else:
        nlp = StanfordCoreNLP(r'../deps/stanford-corenlp/',
                              memory=memory,
                              timeout=200000,
                              quiet=not debug)
        uwsgi.cache_update('nlp', pickle.dumps(nlp))
        logging.debug("nlp to cache: file {}".format(uwsgi.cache_get('nlp')))

    DOC_CHUNK_SIZE = 10000

    # Initialise corenlp properties, s3 bucket connection, and doc count for progress bar
    data, n_items, properties, s3 = init(input,
                                         output,
                                         nlp,
                                         relations=relations,
                                         corefs=corefs,
                                         chunk_size=DOC_CHUNK_SIZE,
                                         limit=limit)
    logging.debug("items to process: {}".format(n_items))

    logging.debug("Loading CoreNLP models...")

    # Load CoreNLP models in a separate thread so we can keep sending regular pings to the frontend
    server_init_thread = Thread(target=nlp.annotate, args=("", properties))
    server_init_thread.start()

    while server_init_thread.is_alive():
        time.sleep(30)
        yield "data:1\n\n"
    else:
        server_init_thread.join()
        yield "data:" + str(init_time) + "\n\n"

    # Create or load existing networkx graph object for this project
    graph_path = os.path.join(output, user, project, "nlp_outputs",
                              'graph_temp.pkl')
    if not newgraph:
        if output[:5] == 's3://' and s3.exists(graph_path):
            with s3.open(graph_path, 'rb') as f:
                logging.debug("Reading existing graph...")
                G = nx.read_gpickle(f)
        elif os.path.isfile(graph_path):
            G = nx.read_gpickle(graph_path)
        else:
            G = nx.MultiGraph()
    else:
        if arango:
            r = requests.delete("http://" + arango + "/ingest/" + user + "/" +
                                project + "/")
        G = nx.MultiGraph()

    # Main NLP parsing loop. Run corenlp annotator pipeline, resolve coreferences and extract relations. Then load into networkx graph
    i = 0
    for document in parse_docs(data,
                               input,
                               output,
                               user,
                               project,
                               nlp,
                               properties,
                               chunk_size=DOC_CHUNK_SIZE,
                               limit=limit,
                               s3=s3):
        yield "data:" + str(int(i / n_items * nlp_time) + init_time) + "\n\n"

        if corefs:
            resolve_coreferences(document[1])
            yield "data:" + str(int(i / n_items * nlp_time) +
                                init_time) + "\n\n"

        for r in make_entity_relationships(document[0],
                                           document[1],
                                           document[2],
                                           document[3],
                                           relations=relations,
                                           documentedges=documentedges):
            key_suffix = r.semantic_type or ""
            G.add_edge(r.entity1._key,
                       r.entity2._key,
                       key=r.type + key_suffix,
                       source_file=r.source_file,
                       word_dist=r.word_dist,
                       document_id=r.document_id,
                       document_date=r.document_date,
                       from_char_offset=(r.e1_char_start, r.e1_char_end),
                       to_char_offset=(r.e2_char_start, r.e2_char_end),
                       semantic_type=r.semantic_type,
                       label_first=r.entity1.label_orig,
                       label_second=r.entity2.label_orig)

            nodes = []
            elements1 = r.entity1.__dict__
            nodes.append((r.entity1._key, elements1))
            elements2 = r.entity2.__dict__
            nodes.append((r.entity2._key, elements2))

            G.add_nodes_from(nodes)
        yield "data:" + str(int(i / n_items * nlp_time) + init_time) + "\n\n"
        i += 1

    # Close the NLP server if required. Keep open to avoid model loading next time
    if close:
        nlp.close()
        uwsgi.cache_del('nlp')

    logging.debug("Calculating same sentence centrality...")
    set_type_centrality(G, "same_sentence")

    if documentedges:
        yield "data:" + str(init_time + nlp_time + 2) + "\n\n"
        set_type_centrality(G, "same_document")
        yield "data:" + str(init_time + nlp_time + 5) + "\n\n"
    else:
        yield "data:" + str(init_time + nlp_time + 5) + "\n\n"

    # Write graph object to JSON representation
    out_data = json_graph.node_link_data(G)

    # Serialise and write the graph object for use in next upload
    if output[:5] == 's3://':
        with s3.open(graph_path, 'wb') as f:
            nx.write_gpickle(G, f)
    else:
        nx.write_gpickle(G, graph_path)

    del G

    # remove and rename output variables to fit data api requirements
    out_data.pop('directed')
    out_data.pop('multigraph')

    out_data['vertices'] = out_data.pop('nodes')
    out_data['edges'] = out_data.pop('links')

    # Run wikipedia lookups of thumbnail urls and article summaries
    if pictures or summary:
        processes = []
        with ThreadPoolExecutor(max_workers=None) as executor:
            for idx, v in enumerate(out_data['vertices']):
                v.pop('id')

                if v['_key'].split("_")[-1] not in ('LOCATION', 'MISC',
                                                    'ORGANIZATION', 'PERSON',
                                                    'COREF'):
                    url = 'https://en.wikipedia.org/wiki/' + v['_key']
                    processes.append(
                        executor.submit(getWikiImageSummary, url, pictures,
                                        summary, idx))

            i = 0
            for task in as_completed(processes):
                logging.debug(
                    "Finished processing vertex: {} out of {}".format(
                        i + 1, len(processes)))
                imageurl, summarytext, idx = task.result()
                out_data['vertices'][idx]['image_url'], out_data['vertices'][
                    idx]['summary'] = imageurl, summarytext
                if i % 10 == 0:
                    yield "data:" + str(
                        int(i / len(processes) * (80 - nlp_time)) + nlp_time +
                        init_time + 5) + "\n\n"
                i += 1

    # More renaming to fit data api requirements
    for e in out_data['edges']:
        e['_from'] = "vertices/" + clean_label(e.pop('source'))
        e['_to'] = "vertices/" + clean_label(e.pop('target'))
        e['type'] = e.pop('key')[:13]
        e['_key'] = str(uuid.uuid4())

    yield "data:96\n\n"

    # Either load data into arango db, or save json representation to file system or s3
    LINE_LIMIT = 100000

    if arango:
        logging.debug("sending: {}, {}, {}".format(arango, user, project))

        send_to_arango(out_data,
                       arango,
                       user,
                       project,
                       LINE_LIMIT,
                       doc_type="vertices")
        yield "data:97\n\n"

        send_to_arango(out_data,
                       arango,
                       user,
                       project,
                       LINE_LIMIT,
                       doc_type="same_sentence")

        yield "data:98\n\n"

        if documentedges:
            logging.debug("adding document edges")
            send_to_arango(out_data,
                           arango,
                           user,
                           project,
                           LINE_LIMIT,
                           doc_type="same_document")

    else:
        edges_ss = [
            e for e in out_data['edges'] if e['type'] == "same_sentence"
        ]

        if documentedges:
            edges_sd = [
                e for e in out_data['edges'] if e['type'] == "same_document"
            ]

        write_list_in_chunks(out_data['vertices'], LINE_LIMIT // 10, output,
                             user, project, 'vertices')
        yield "data:97\n\n"
        write_list_in_chunks(edges_ss, LINE_LIMIT, output, user, project,
                             'edges_ss')
        yield "data:98\n\n"
        if documentedges:
            write_list_in_chunks(edges_sd, LINE_LIMIT, output, user, project,
                                 'edges_sd')

    uwsgi.cache_del('busy')
    yield "data:100\n\n"
Example #35
class BasicStanfordCoreNLP(UtteranceProcessor):
    '''
    Basic version doesn't do anything with coref, const. and depend. parses produced by analysis.
    
    For now, words from all sentences found in the utterance are put at the top level
    of the utterance -- sentences are thrown away, but could be used later for e.g.
    paragraph-level utterances. 
    
    If merge_clitics, merge e.g. I 'll -> single word I'll
    
    Add spaces back in where there is no punctuation as points at which silence can
    be inserted during alignment
    
    Add reduced POS as well as Stanford POS
    '''
    def load(self):
    
        self.target_nodes = self.config.get('target_nodes', '//utt')    
        self.input_attribute = self.config.get('input_attribute', 'norm_text')
        
        self.merge_clitics = self.config.get('merge_clitics', 'True') ## string, not bool
    
        ## check tools exist:
        corenlp_location = os.path.join(self.voice_resources.path[c.BIN], '..', \
                                                            'corenlp-python', 'corenlp')
        assert os.path.isdir(corenlp_location)
        sys.path.append(corenlp_location)
        from corenlp import StanfordCoreNLP
        corenlp_dir = os.path.join(corenlp_location, '..', 'stanford-corenlp-full-2014-06-16')
        
        ## Each document is to be treated as one sentence, no sentence splitting at all. 
        ## Write config for this if necessary:
        corenlp_conf_name = 'no_sentence_split.properties'
        corenlp_conf_file = os.path.join(corenlp_location, corenlp_conf_name)
        if not os.path.isfile(corenlp_conf_file):
            data = ['annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref', \
                    'ssplit.isOneSentence = true']
            writelist(data, corenlp_conf_file)

        print 'Loading stanford corenlp modules from %s ...'%(corenlp_dir)
        print 'Takes a while (~20-30 seconds)...'
        self.models = StanfordCoreNLP(corenlp_dir, properties=corenlp_conf_name)     
                                           

                                                            
    def process_utterance(self, utt):

        ## _END_ node
        end_node = Element('token')
        end_node.set(self.input_attribute, '_END_')
        utt.append(end_node)

        for node in utt.xpath(self.target_nodes):
            
            assert node.has_attribute(self.input_attribute)
            input = node.get(self.input_attribute)
            analysis = self.models.raw_parse(input)
            
            ## analysis looks like this:
            
            #     {'coref': ...
            #      'sentences': [{'parsetree':  ... } 
            #                     'text': 
            #                     'dependencies': 
            #                     'indexeddependencies': 
            #                     'words': [('and', {'NamedEntityTag': 'O', \
            #                         'CharacterOffsetEnd': '3', 'Lemma': 'and', \
            #                         'PartOfSpeech': 'CC', 'CharacterOffsetBegin': '0'}), ... ]
            #                       }
            #                   ]
            #     }
            
            ## preprocess the analysis: add spaces back between words where there is no
            ## punc (to use as potential silence insertion points for alignment), and
            ## possibly merge clitics (he 's -> he's, I 'll -> I'll)
            

            ## MERGE SUCCESSIVE PUNCTUATION TOKENS 
            new_analysis = {}
            new_analysis['sentences'] = []
            for sentence in analysis['sentences']:
                #new_sentence = copy.deepcopy(sentence)
                #new_sentence['words'] = []
                new_words = []
                for word in sentence['words']:
                    # is there a previous word?
                    if len(new_words) > 0:
                        # if both space / punct:
                        if self.all_space_or_punc(new_words[-1][0]) and self.all_space_or_punc(word[0]):
                            prev_word = new_words.pop(-1)
                            combined = self.merge_words(prev_word, word)
                            new_words.append(combined)
                        else:
                            new_words.append(word)
                    else:
                        new_words.append(word)
                sentence['words'] = new_words
                new_analysis['sentences'].append(sentence)
            analysis = new_analysis     


            ## MERGE CLITICS 
            ## This also merges e.g. . ''  -->  .''  (produced by the norm scripts from  ." ) at sentence ends.
            if self.merge_clitics == 'True': ## string not bool
                new_analysis = {}
                new_analysis['sentences'] = []
                for sentence in analysis['sentences']:
                    #print sentence
                    new_sentence = copy.deepcopy(sentence)
                    new_sentence['words'] = []
                    i = 0
                    while i < (len(sentence['words'])-1):
                        this_word = sentence['words'][i]
                        next_word = sentence['words'][i+1]
                        if next_word[0].startswith("'") or next_word[0] == "n't":
                            merged = self.merge_words(this_word, next_word)
                            new_sentence['words'].append(merged)
                            i += 2
                        else:
                            new_sentence['words'].append(this_word)
                            i += 1
                    last_word = sentence['words'][-1]
                    if not(last_word[0].startswith("'") or last_word[0] == "n't"):
                        new_sentence['words'].append(last_word)
                    new_analysis['sentences'].append(new_sentence)
                analysis = new_analysis                    
                 
            
            ## ADD SPACES:
            new_analysis = {}
            new_analysis['sentences'] = []
            for sentence in analysis['sentences']:
                new_sentence = copy.deepcopy(sentence)
                new_sentence['words'] = []
                ## For now, ignore parsetree, dependencies, indexeddependencies (sentence level)
                previous_lemma = '_NONE_'
                for word in sentence['words']:
                
                    (text, word_attributes) = word
                    this_lemma = word_attributes['Lemma']
                    
                    ## Add whitespace back in to tokens to use for silence insertion in alignment later.
                    ## Don't add it where either neighbour is punctuation, or at start of 
                    ## utt (where previous_lemma is '_NONE_':
                    if not (self.all_space_or_punc(previous_lemma) or \
                                                self.all_space_or_punc(this_lemma)):   
                        if previous_lemma != '_NONE_':                     
                            new_sentence['words'].append((' ', {'NamedEntityTag': ' ', \
                                                        'PartOfSpeech': ' ', 'Lemma': ' '}))
                    previous_lemma = this_lemma
                    new_sentence['words'].append(word)
                new_analysis['sentences'].append(new_sentence)
            analysis = new_analysis
            
            
            ## combine all sentences to one for now:
            all_words = []
            for sentence in analysis['sentences']:
                all_words.extend(sentence['words'])
                
            
            ## Add stuff into the target node (probably utt):
            for word in all_words:
            
                (text, word_attributes) = word
                word_node = Element('token') ## also includes punctuation etc.
                word_node.set(self.input_attribute, text) ## see above at sentence level about 'text'
                
                ## For now, ignore CharacterOffsetBegin, CharacterOffsetEnd (word level)
                word_node.set('ne', word_attributes['NamedEntityTag']) 
                word_node.set('pos', word_attributes['PartOfSpeech']) 
                word_node = self.add_reduced_POS(word_node)
                
                word_node.set('lemma', word_attributes['Lemma']) 
                
                utt.append(word_node)
                
        ## _END_ node
        end_node = Element('token')
        end_node.set(self.input_attribute, '_END_')
        utt.append(end_node)    

    def add_reduced_POS(self, node):
        full_POS = node.attrib['pos']
        if '|' in full_POS:
            full_POS = full_POS.split('|')[0]
    
        ## add coarse POS (content/function) and reduced (adj,noun,adv,etc.)
        pos_map = dict([('IN', 'function'), ('TO', 'function'), ('DT', 'function'), \
                ('PDT', 'function'), ('MD', 'function'), ('CC', 'function'), \
                ('WP', 'function'), ('PP$', 'function'), ('EX', 'function'), \
                ('POS', 'function'), ('PP', 'function'), ('WDT', 'function'), \
                ('PRP', 'function'), ('PRP$', 'function'), ('RP', 'function'), \
                ('WP$', 'function'), ('WRB', 'function'), ('LS', 'function'),\
                ('NN', 'noun'), ('NNS', 'noun'), \
                ('NP', 'noun'), ('NNP', 'noun'), ('NPS', 'noun'), ('NNPS', 'noun'), ('FW', 'noun'), \
                ('VBG', 'verb'), ('VBN', 'verb'), \
                ('VB', 'verb'), ('VBD', 'verb'), ('VBP', 'verb'), ('VBZ', 'verb'), \
                ('JJ', 'adj'), ('JJR', 'adj'), ('JJS', 'adj'), ('CD', 'adj'), \
                ('RB', 'adv'), ('RBR', 'adv'), ('RBS', 'adv'), ('UH', 'interj')])

        ## NOTE:
        # FW -- foreign word -> noun
        # LS -- list item -> function

        if full_POS not in pos_map:
            if full_POS == ' ':
                red_pos = 'space'
            elif self.all_space_or_punc(full_POS):
                red_pos = 'punc'
            else:
                print 'MISSING POS: %s'%(full_POS)
                red_pos = 'other'
        else:
            red_pos = pos_map[full_POS]
        node.set('coarse_pos', red_pos)
        return node


    

    def all_space_or_punc(self, token):
        '''Use a regex over Unicode properties to see if the token is all punctuation or space.
            This duplicates later work by e.g. the token classifier.'''
        space_or_punc = '[\p{Z}||\p{C}||\p{P}||\p{S}]'
        return regex.match('\A' + space_or_punc + '+\Z', token)
        
        
    def merge_words(self, word1, word2):
        merged_form = word1[0] + word2[0]
        merged_POS = word1[1]['PartOfSpeech'] + '|' + word2[1]['PartOfSpeech']
        merged_lemma = word1[1]['Lemma']   ## first word's lemma
        merged_NER = word1[1]['NamedEntityTag']  ## first word's NE tag
        merged = (merged_form, \
                    {'PartOfSpeech': merged_POS, \
                    'Lemma': merged_lemma, \
                    'NamedEntityTag': merged_NER})
        return merged
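
# A minimal standalone sketch of the clitic/punctuation merge described in the
# class docstring and implemented by merge_words() above. It reuses the same
# (form, attribute-dict) word tuples; the token values are invented for
# illustration, not taken from a real parse.
def merge_words_sketch(word1, word2):
    merged_form = word1[0] + word2[0]
    merged_pos = word1[1]['PartOfSpeech'] + '|' + word2[1]['PartOfSpeech']
    return (merged_form, {'PartOfSpeech': merged_pos,
                          'Lemma': word1[1]['Lemma'],                     # keep the first word's lemma
                          'NamedEntityTag': word1[1]['NamedEntityTag']})  # and its NE tag

he = ('he', {'PartOfSpeech': 'PRP', 'Lemma': 'he', 'NamedEntityTag': 'O'})
clitic = ("'s", {'PartOfSpeech': 'VBZ', 'Lemma': 'be', 'NamedEntityTag': 'O'})
print merge_words_sketch(he, clitic)  # ("he's", ...) with POS 'PRP|VBZ' and lemma 'he'
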
import os
from nltk.tokenize import sent_tokenize
from corenlp import StanfordCoreNLP

# The directory in which the stanford core NLP .jar is located -- you have to
# download this from their website.
CORE_NLP_DIR = "stanford-corenlp-dir/"
PARSER = StanfordCoreNLP(CORE_NLP_DIR)

in_file = "sentences.txt"
text = open(in_file, 'r').read()
sentences = sent_tokenize(text)  # Break the text into sentences.
for i, sentence in enumerate(sentences):
	try:
		parse = PARSER.raw_parse(sentence)
		if i%50 == 0:
			print " Entered sentence " + str(i) + " of " + str(len(sentences))
		write_parse_products(parse['sentences'][0])
	except Exception:
		print "Error on sentence:\n\t " + sentence + " \n "
		pass

def write_parse_products(parse):
	words = parse['words']

	word_objects = []
	text = ""
	for i, word_info in enumerate(words):
		properties = word_info[1]
		token = word_info[0].lower().strip()
		surface = word_info[0].strip()
class StringProcessor(object):
    """Tokenize or parse a string.
    """

    def __init__(self, project):
        """Instantiate and ready the parser. Note that readying the parser takes
        some time.
        """
        self.parser = StanfordCoreNLP(app.config["CORE_NLP_DIR"])
        self.project = project

        logger = logging.getLogger(__name__)
        global project_logger
        project_logger = ProjectLogger(logger, project)

    def tokenize(self, txt):
        """Turn a string of one or more ``Sentence``\s into a list of
        ``Sentence`` objects. This method will also tokenize each word in txt,
        find its PoS, lemma, and space_before.

        :param str txt: One or more sentences, in a string format.
        :return list: A list of document.Sentence objects.
        """
        sentences = []

        for sentence_text in split_sentences(txt):
            sentence = self.parse_with_error_handling(sentence_text)
            sentences.extend(tokenize_from_raw(sentence, sentence_text,
                self.project))

        return sentences

    def parse(self, sentence, relationships=None, dependencies=None,
            max_length=30):
        """Parse a ``Sentence`` and extract dependencies, parse trees, etc.

        Note that for max_length, a "word" is defined as something with a space
        on at least one side. This is not the typical definition of "word".
        This is done so that length can be checked before resources are
        committed to processing a very long sentence.

        :param Sentence sentence: The ``Sentence`` object.
        :param int max_length: The maximum number of words to process.
        """

        parsed = self.parse_with_error_handling(sentence.text)

        # If the parse was unsuccessful, exit
        if parsed is None:
            return

        parsed_sentence = parsed["sentences"][0]

        if len(parsed["sentences"]) > 1:
            project_logger.warning("More than one sentence passed in to"
                " StringProcessor.parse().")
            parsed_sentence["text"] += parsed["sentences"][1]["text"]

        for dependency in parsed_sentence["dependencies"]:
            # We don't want to make a dependency involving ROOT
            if int(dependency[2]) > 0 and int(dependency[4]) > 0:
                governor = dependency[1]
                dependent = dependency[3]
                governor_index = int(dependency[2]) - 1
                dependent_index = int(dependency[4]) - 1
                governor_pos = parsed_sentence["words"][governor_index][1]\
                    ["PartOfSpeech"]
                governor_lemma = parsed_sentence["words"][governor_index][1]\
                    ["Lemma"]
                dependent_pos = parsed_sentence["words"][dependent_index][1]\
                    ["PartOfSpeech"]
                dependent_lemma = parsed_sentence["words"][dependent_index][1]\
                    ["Lemma"]
                grammatical_relationship = dependency[0]

                # If dictionaries are present, run with duplication handling
                if relationships is not None and dependencies is not None:
                    key = grammatical_relationship

                    if key in relationships.keys():
                        relationship = relationships[key]
                    else:

                        try:
                            relationship = GrammaticalRelationship.query.\
                                filter_by(name = grammatical_relationship).\
                                one()
                        except(MultipleResultsFound):
                            project_logger.error("duplicate records found "
                                "for: %s", str(key))
                        except(NoResultFound):
                            relationship = GrammaticalRelationship(
                                name = grammatical_relationship)

                        relationships[key] = relationship

                    # Read the data for the governor, and find the
                    # corresponding word
                    governor = Word.query.filter_by(
                        word = governor,
                        lemma = governor_lemma,
                        part_of_speech = governor_pos
                    ).first()

                    # Same as above for the dependent in the relationship
                    dependent = Word.query.filter_by(
                        word = dependent,
                        lemma = dependent_lemma,
                        part_of_speech = dependent_pos
                    ).first()

                    try:
                        governor.id
                        dependent.id
                    except:
                        project_logger.error("Governor or dependent not "
                            "found; giving up on parse. This likely indicates"
                            " an error in the preprocessing; rerunning the "
                            "preprocessor is recommended.")
                        project_logger.info(sentence)
                        return sentence

                    key = (relationship.name, governor.id, dependent.id)

                    if key in dependencies.keys():
                        dependency = dependencies[key]
                    else:

                        try:
                            dependency = Dependency.query.filter_by(
                                grammatical_relationship = relationship,
                                governor = governor,
                                dependent = dependent
                            ).one()
                        except(MultipleResultsFound):
                            project_logger.error("duplicate records found "
                                "for: %s", str(key))
                        except(NoResultFound):
                            dependency = Dependency(
                                grammatical_relationship = relationship,
                                governor = governor,
                                dependent = dependent
                            )

                        dependencies[key] = dependency

                    # Add the dependency to the sentence
                    sentence.add_dependency(
                        dependency = dependency,
                        governor_index = governor_index,
                        dependent_index = dependent_index,
                        project = self.project,
                        force = False
                    )

                    dependency.save(False)

                else:
                    # TODO: fill
                    pass

        return sentence

    def parse_with_error_handling(self, text):
        """Run the parser and handle errors properly.

        Also checks the sentence text for irregularities that may break the
        parser and handles it before proceeding.

        Any failure will cause this method to return None

        :param str text: The text of the sentence to check
        """

        # Check for non-string
        if not isinstance(text, str) and not isinstance(text, unicode):
            project_logger.warning("Parser got a non-string argument: %s",
                text)
            return None

        # Check for non-unicode
        if not isinstance(text, unicode):

            # Try to convert the string to unicode if possible
            # Unit test: should fail with this example:
            # http://stackoverflow.com/questions/6257647/convert-string-to-unicode

            try:
                text = unicode(text)
            except(UnicodeDecodeError):
                project_logger.warning("The following sentence text is "
                    "not unicode; convertion failed.")
                project_logger.info(text)

                # Skip sentence if flag is True
                if app.config["SKIP_SENTENCE_ON_ERROR"]:
                    return None
                else:
                    # Try to parse the sentence anyway
                    project_logger.warning("Attempting to parse "
                        "non-unicode sentence.")

        # Check for empty or nonexistent text
        if text == "" or text == None:
            return None

        # Check for irregular characters
        # TODO: what are considered irregular characters?

        # Try to parse, catch errors
        parsed_text = None
        try:
            parsed_text = self.parser.raw_parse(text)
        # TODO: handle all errors properly
        # ProcessError, TimeoutError, OutOfMemoryError
        except TimeoutError as e:
            project_logger.error("Got a TimeoutError: %s", str(e))
            return None
        except ProcessError as e:
            project_logger.error("Got a ProcessError: %s", str(e))
            return None
        except:
            project_logger.error("Unknown error")
            return None

        # Parse successful, return parsed text
        return parsed_text
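
# A hedged, self-contained sketch of the dependency bookkeeping done in
# StringProcessor.parse() above: each dependency tuple carries 1-based word
# indices, index 0 means ROOT, and word attributes are looked up with index - 1.
# The parsed dict is hand-built to mirror the format parse() indexes into; the
# sentence itself is invented.
parsed_example = {
    "sentences": [{
        "text": "Dogs bark",
        "words": [("Dogs", {"PartOfSpeech": "NNS", "Lemma": "dog"}),
                  ("bark", {"PartOfSpeech": "VBP", "Lemma": "bark"})],
        # (relation, governor, governor_index, dependent, dependent_index)
        "dependencies": [("root", "ROOT", "0", "bark", "2"),
                         ("nsubj", "bark", "2", "Dogs", "1")],
    }]
}
sent = parsed_example["sentences"][0]
for rel, gov, gov_i, dep, dep_i in sent["dependencies"]:
    if int(gov_i) > 0 and int(dep_i) > 0:  # skip anything involving ROOT
        gov_lemma = sent["words"][int(gov_i) - 1][1]["Lemma"]
        dep_lemma = sent["words"][int(dep_i) - 1][1]["Lemma"]
        print rel, gov, gov_lemma, dep, dep_lemma  # nsubj bark bark Dogs dog
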
class Nlp_persistence(object):
    """Persistence layer for having fast access to information produced by the StanfordCoreNLP tool."""
    def __init__(self, fallback=False):
        self.FILE = "nlp_infos.p"
        self.data = None
        self.data_length = None
        self.corenlp_dir = "helper/stanfordnlp/corenlp-python/stanford-corenlp-full-2013-11-12/"
        if fallback:
            try:
                self.corenlp = StanfordCoreNLP(self.corenlp_dir)
            except TIMEOUT:
                print "Stanford CoreNLP Timeout"

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def close(self):
        # When exiting, update the pickle file with new sentences; delete the StanfordCoreNLP instance first so we definitely have enough memory for that
        try:
            del(self.corenlp)
        except AttributeError:
            # There was a timeout
            pass

        # Write only if we added something to self.data
        if self.data_length < len(self.data):
            self._write()

    def create_persistence(self, relations):
        try:
            # Trying to load data
            data = pickle.load(open(self.FILE, "rb"))
        except (IOError, EOFError):
            # No data so far
            print "Could not open cache. Create new."
            logging.info("Could not find %s. Create new data.", self.FILE)
            data = {}

        # Create nlp information for all relevant sentences
        for relation in relations:
            if not relation.source.sentence in data:
                self._update_data(relation.source, data)
            else:
                print "Sentence is already in data"

            if not relation.target.sentence in data:
                self._update_data(relation.target, data)
            else:
                print "Sentence is already in data"
        print "Done!"
        logging.info("Successfully loaded all nlp information to persistence file.")

        # Save data to a file
        pickle.dump(data, open(self.FILE, "wb"), protocol=-1)

    def _update_data(self, entity, data):
        sentence_obj = entity.sentence
        try:
            tree = self._get_tree(sentence_obj)
        except RPCInternalError:
            logging.error("Could not process the following sentence from text %s: %s", sentence_obj.filename, sentence_obj.text)
            # Return without updating data
            return

        print "--- " + sentence_obj.filename
        print sentence_obj.text

        data.update({sentence_obj: tree})

    def load(self):
        data = {}

        if self.data is None:
            try:
                data = pickle.load(open(self.FILE, "rb"))
            except (IOError, EOFError):
                logging.warning("No cached nlp data.")
            finally:
                self.data = data
                self.data_length = len(data)
        else:
            # Data is already there - there is nothing to do
            pass

    def get_info_for_sentence(self, sentence):
        if type(self.data) is dict:
            try:
                return self.data[sentence]
            except KeyError:
                logging.error("Nlp_persistence: This sentence is not a key/Is not available in the Nlp persistence layer.")
                logging.info("Nlp_persistence fallback to CoreNLP server")
                # Fallback: Try to get tree from CoreNLP server
                tree = self._get_tree(sentence)

                # Drive-by caching
                self.data.update({sentence: tree})

                return tree
        else:
            logging.error("You have to use Nlp_persistence.load() before you can get the information of a sentence")
            return None

    def get_collapsed_dependencies(self, sentence):
        info = self.get_info_for_sentence(sentence)

        return info['sentences'][0]['dependencies']

    def get_parse_tree(self, sentence):
        info = self.get_info_for_sentence(sentence)

        return info['sentences'][0]['parsetree']

    def _write(self):
        # Save data to a file
        pickle.dump(self.data, open(self.FILE, "wb"))

    def _get_tree(self, sentence):
        tree = self.corenlp.raw_parse(sentence.text)
        return tree

    def get_pos_tag_for_word(self, sentence, word):
        """Returns the POS tag for a word in a sentence. If the word is not in the sentence raise WordNotInSentence error."""
        info_sentence = self.get_info_for_sentence(sentence)
        words = info_sentence['sentences'][0]['words']

        for w in words:
            if w[0] in word:
                return w[1]["PartOfSpeech"]
        else:
            raise PosTagNotFound(sentence, word)

    def get_lemma_for_word(self, sentence, word):
        """Returns the lemma for a word in sentence."""
        info_sentence = self.get_info_for_sentence(sentence)
        words = info_sentence['sentences'][0]['words']

        for w in words:
            if w[0] in word:
                return w[1]["Lemma"]
        else:
            raise LemmaNotFound(sentence, word)

    def is_main_verb(self, sentence, word):
        """Returns true if word is a main verb of sentence and not an aux."""
        info_sentence = self.get_info_for_sentence(sentence)
        dependencies = info_sentence['sentences'][0]['dependencies']

        for dependency in dependencies:
            if dependency[0] == "aux" and dependency[2] == word:
                return False
        else:
            return True

    def get_all_aux_for_verb(self, sentence, verb):
        """Returns all distinct aux for verb as strings in order of the sentence."""
        info_sentence = self.get_info_for_sentence(sentence)
        dependencies = info_sentence['sentences'][0]['dependencies']

        aux = []
        for dependency in dependencies:
            if (dependency[0] == "aux" or dependency[0] == "auxpass") and dependency[1] == verb:
                aux.append(dependency[2])

        return aux

    def get_verb_for_aux(self, sentence, aux):
        """Returns the governing verb for the aux as string."""
        info_sentence = self.get_info_for_sentence(sentence)
        dependencies = info_sentence['sentences'][0]['dependencies']

        for dependency in dependencies:
            if dependency[0] == "aux" and dependency[2] == aux:
                return dependency[1]
        else:
            raise AuxNotFound(aux)

    def find_all_verb_pos_tags(self, sentence, verb):
        """Returns all pos tags for all verbs based on the dependencies relation of the sentence."""

        if self.is_main_verb(sentence, verb):
            # verb is not an aux
            main_verb = verb
        else:
            # verb is aux (this should normally not happen due to the data)
            main_verb = self.get_verb_for_aux(sentence, verb)

        auxes = self.get_all_aux_for_verb(sentence, main_verb)

        verb_pos = self.get_pos_tag_for_word(sentence, main_verb)

        aux_pos = map(lambda aux: self.get_pos_tag_for_word(sentence, aux), auxes)

        return aux_pos + [verb_pos]

    def get_governing_verb(self, event):
        sentence = event.sentence

        # info = [verb, aux, pos verb, pos aux, index_of_verb]
        info = self.get_info_on_governing_verb(event.text, event.index, sentence)

        if info is None:
            raise CouldNotFindGoverningVerb
        else:
            if info[0] is None:
                raise CouldNotFindGoverningVerb
            else:
                return (info[0], info[4])

    def is_root(self, event):
        sentence = event.sentence
        info_sentence = self.get_info_for_sentence(sentence)

        collapsed_dependencies = info_sentence['sentences'][0]['dependencies']

        for dependency in collapsed_dependencies:
            dependency_type = dependency[0]
            dependent = dependency[2]

            if dependency_type == "root" and dependent == event.text:
                return True
        else:
            return False

    def get_info_on_governing_verb(self, non_verb, index, sentence):
        """This method returns information about the governing verb of a non-verb.

        It returns an array with the following format: [verb, aux, POS of verb, POS of aux, index_of_verb]
        """
        info = self.get_info_for_sentence(sentence)

        if info:
            # Search for non_verb
            governing_verb, index = self._get_governing_verb(non_verb, index, info)

            info_on_governing_verb = [governing_verb, None, None, None, index]

            # Set POS of main verb
            pos_verb = self._get_pos_of_verb(governing_verb, info)
            info_on_governing_verb[2] = pos_verb

            # Searching for an Aux for the governing verb
            aux = self._get_aux_of_verb(governing_verb, info)
            info_on_governing_verb[1] = aux

            # If there is an aux, get its POS
            if aux:
                pos_aux = self._get_pos_of_verb(aux, info)
                info_on_governing_verb[3] = pos_aux

            return info_on_governing_verb

        else:
            return None

    def _get_aux_of_verb(self, verb, info):
        dependencies = info['sentences'][0]['dependencies']

        sources = [x[1] for x in dependencies]

        # Find index of verb in targets
        index = None
        for i, source in enumerate(sources):
            if source == verb and dependencies[i][0] == "aux":
                index = i

        # Get aux
        if index is None:
            # Not every verb has an aux
            return None
        else:
            aux = dependencies[index][2]

            return aux

    def _get_pos_of_verb(self, verb, info):
        info_on_words = info['sentences'][0]['words']

        for word in info_on_words:
            if word[0] == verb:
                return word[1]['PartOfSpeech']

    def _find_governing_word(self, word, dependencies):
        for dependency in dependencies:
            if dependency[2] == word:
                return dependency[1]
        else:
            return None

    def _find_governing_word_index(self, word, index, index_dependencies):
        word = word + "-" + str(index)

        for dependency in index_dependencies:
            if dependency[2] == word:
                # Remove governor with index appended
                return dependency[1]
        else:
            return None

    def _remove_index_from_token(self, token):
        if token:
            token = token.split("-")[:-1]
            return "-".join(token)
        else:
            return None

    def _get_index_from_token(self, token):
        if token:
            index = token.split("-")[-1]
            return index
        else:
            return None

    def _get_governing_verb(self, non_verb, index, info):
        index_dependencies = info['sentences'][0]['indexeddependencies']

        # Try to find a governor for non_verb
        governor = self._find_governing_word_index(non_verb, index, index_dependencies)

        # Search through tree as long we find a verb and until we can go further up
        while not self._is_verb(self._remove_index_from_token(governor), info) and governor is not None:
            old_governor = governor
            governor = self._find_governing_word_index(self._remove_index_from_token(governor), self._get_index_from_token(governor), index_dependencies)

            if governor == old_governor:
                # Detected a cycle (does not happen often, but it happens; cause unclear)
                governor = None
                break

        if governor:
            # Remove index from governor string
            return (self._remove_index_from_token(governor), int(self._get_index_from_token(governor)))
        else:
            # Examples where this is allowed to happen:
            # "And in Hong Kong, a three percent drop." <- no verb
            # "One exception was the swine flu pandemic of 2009-2010, when 348 children died." for "pandemic" -- "pandemic" is the root of the sentence and is not governed by anything
            # Corner case: "And the dominant flu strain early in the season was one that tends to cause more severe illness." for "season"
            raise CouldNotFindGoverningVerb(non_verb, index)

    def _is_verb(self, text, info):
        """Checks if text has the POS tag of a verb."""
        if not text: return False

        words = info['sentences'][0]['words']

        for word in words:
            if word[0] == text:
                if word[1]['PartOfSpeech'] in ['VBG', 'VBD', 'VB', 'VBN', 'VBP', 'VBZ']:
                    return True

        return False
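
# A minimal sketch of the pickle-backed caching idea behind Nlp_persistence
# above, with the parser stubbed out so the snippet runs standalone; the cache
# file name and fake_parse() are illustrative assumptions, not part of the class.
import pickle

CACHE_FILE = "nlp_cache_sketch.p"

def fake_parse(text):
    # stand-in for StanfordCoreNLP.raw_parse()
    return {"sentences": [{"words": [(w, {}) for w in text.split()]}]}

try:
    cache = pickle.load(open(CACHE_FILE, "rb"))
except (IOError, EOFError):
    cache = {}

sentence = "A short example sentence ."
if sentence not in cache:                  # only parse on a cache miss
    cache[sentence] = fake_parse(sentence)
pickle.dump(cache, open(CACHE_FILE, "wb"))
print cache[sentence]["sentences"][0]["words"]
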
Exemple #39
0
#!/usr/bin/env python

import os
import sys
import csv
import json

sys.path.append(os.path.expanduser('~/github/stanford-corenlp-python'))
from corenlp import StanfordCoreNLP
corenlp = StanfordCoreNLP()

# I propose this csv dialect for all our bolt csv files
# as a first step toward standardizing our data files
csv.register_dialect('bolt',
                     quoting=csv.QUOTE_ALL,
                     doublequote=False,
                     escapechar='\\',
                     lineterminator='\n')


def parse_sentence(s):
    """Returns a dictionary with the parse results returned by
    the Stanford parser for the provided sentence."""
    return json.loads(corenlp.parse(s, verbose=False))['sentences'][0]


if __name__ == '__main__':
    with open('sentences.csv', 'rb') as fi, open('sentences2.csv', 'wb') as fo:
        reader = csv.reader(fi)
        writer = csv.writer(fo, 'bolt')
from corenlp import StanfordCoreNLP
import pprint

if __name__ == '__main__':
    nlp = StanfordCoreNLP('http://localhost:9000')
    # text = "Non tolerance was gandhijis weapon."
    # text = ("We went to pitapit,it can be expensive but not hygienic.")
    # text = ("The dishes at Alkareem are highly recommended.")
    text = ("The sitting which is mostly outdoor is the prettiest you can come across in CP")
    # text = ('I loved The Crispy Vegetables but found the Wontons to be devoid of any flavor')
    # text = ("delicious veg manchurian.")
    # text = ('London is good at studies but bad at sports.')
    # text = ("The tiger prawns here,it doesn't get better.")
    # text = ('Check out the pics to find out who greeted me on my first visit to Bercos CP branch, it can be expensive but not hygienic.')
    
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse,ner',
        'outputFormat': 'json'
    })
    # pprint.pprint(output)
    tree = output['sentences'][0]['parse']
    print tree
    x = output['sentences'][0]['collapsed-dependencies']
    # pprint.pprint(x)
    print '-------------------------------------------------'
    for i in range(len(x)):
        print x[i]['dep'] + '-->' + x[i]['governorGloss'] + '-' + str(x[i]['governor']) + ' ' + x[i]['dependentGloss'] + '-' + str(x[i]['dependent'])
    # print(output['sentences'][0]['parse'])
    # output = nlp.tokensregex(text, pattern='/Pusheen|Smitha/', filter=False)
    # print(output)
    # output = nlp.semgrex(text, pattern='{tag: VBD}', filter=False)
Exemple #41
0
 def get_data_raw(self):
     snlp = StanfordCoreNLP(
         corenlp_path=os.path.dirname(self.files()[0].path))
     return snlp
Exemple #42
0
 def __init__(self, corenlp_dir):
     self.parser = StanfordCoreNLP(corenlp_dir)
# Get the corpus-file open
corpusjson = 'protest.json'
jsonobject = json.load(codecs.open(corpusjson))


# Get and clean the text: 
texts = (clean_html(parse(StringIO(obj[4].replace("\n", " ")))).getroot().text_content() for obj in jsonobject)
print "Story text generator object created."


# Turn it into a string object, then an html object, then back into string...
#texts = clean_html(parse(StringIO(text))).getroot().text_content()

print "Setting up parser: "
# Set up the parser
stanford_parser = StanfordCoreNLP()

print "Creating parser generator object: "
# Parse dat. 
parsed_texts = (stanford_parser.parse(unicode(text)) for text in texts)

# Save the result to a file
# enumerate() wraps the generator lazily, so it can number the parsed stories
# without a manual counter.
for i, story in enumerate(parsed_texts, 1):
    with codecs.open(str(i)+".json", 'w') as fh: 
        json.dump(json.loads(story), fh, indent=2)

def typedependencies(sent_list, neg_words, compound_word_list):

    pos_dict = {}
    depend_dict = {}
    depend_list = []
    proper_names = []
    # neg_words = []
    compound_dic = {}

    nlp = StanfordCoreNLP('http://localhost:9000')
    for i in range(len(sent_list)):
        compound_list = []
        print sent_list[i]
        output = nlp.annotate(sent_list[i],
                              properties={
                                  'annotators':
                                  'tokenize,ssplit,pos,depparse,parse,ner',
                                  'outputFormat': 'json'
                              })
        # pprint.pprint(output)
        x = output['sentences'][0]['basic-dependencies']
        # pprint.pprint(output['sentences'][0]['parse'])
        # pprint.pprint(x)
        # print '-------------------------------------------------'
        for j in range(len(x)):

            if 'compound' in x[j]['dep']:
                # compound_word(x[j])
                ll = [
                    x[j]['governorGloss'], x[j]['governor'],
                    x[j]['dependentGloss'], x[j]['dependent']
                ]
                compound_dic[x[j]['governor']] = x[j]['governorGloss']
                compound_dic[x[j]['dependent']] = x[j]['dependentGloss']
                # compound_list.append(ll)

            d = [
                x[j]['dep'], x[j]['governorGloss'],
                str(x[j]['governor']), x[j]['dependentGloss'],
                str(x[j]['dependent'])
            ]
            depend_list.append(d)

            # getting the negative words..
            if 'neg' in x[j]['dep']:
                x1 = x[j]['governorGloss'].lower()
                x2 = x[j]['dependentGloss'].lower()
                if x1 not in stopwords:
                    neg_words.append([x1, x[j]['governor']])
                else:
                    neg_words.append([x2, x[j]['dependent']])

            if 'conj' in x[j]['dep']:
                x1 = x[j]['governorGloss'].lower()
                x2 = x[j]['dependentGloss'].lower()
                if x1 in neg_prefix:
                    neg_words.append([x2, x[j]['dependent']])
                # elif (x2 == 'not' or x2 == 'nor' or x2 == 'non'):
                #   neg_words.append(x1)
                elif x2 in neg_prefix:
                    neg_words.append([x1, x[j]['governor']])

            print(x[j]['dep'] + '-->' + x[j]['governorGloss'] + '-' +
                  str(x[j]['governor']) + ' ' + x[j]['dependentGloss'] + '-' +
                  str(x[j]['dependent']))
        print '==================================='

        for key, value in sorted(compound_dic.items()):
            compound_list.append([key, value])
        # print compound_word(compound_list)
        compound_dic.clear()

        y = output['sentences'][0]['tokens']
        for k in range(len(y)):
            pos_dict[y[k]['word']] = y[k]['pos']
            if 'NNP' in y[k]['pos']:
                proper_names.append(y[k]['word'])

        depend_dict[i] = depend_list
        depend_list = []

        if len(compound_list) > 0:
            w = compound_word(compound_list)
        else:
            w = []
        for jj in range(len(w)):
            if w[jj] != '':
                print w[jj]
                compound_word_list.append(w[jj])

    print '--------NAMES------' + str(proper_names)
    print '--------NEGATIVE----' + str(neg_words)
    return depend_dict, pos_dict, proper_names
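
# A hedged sketch of the 'neg' handling inside typedependencies() above, run on
# a single hand-built 'basic-dependencies' entry; the dict keys mirror CoreNLP's
# JSON output, while the sentence fragment and stopword list are invented.
stopwords_sketch = set(['is', 'the', 'it'])
entry = {'dep': 'neg', 'governorGloss': 'hygienic', 'governor': 7,
         'dependentGloss': 'not', 'dependent': 6}
neg_words_sketch = []
if 'neg' in entry['dep']:
    gov = entry['governorGloss'].lower()
    dep = entry['dependentGloss'].lower()
    if gov not in stopwords_sketch:
        neg_words_sketch.append([gov, entry['governor']])  # keep the negated content word
    else:
        neg_words_sketch.append([dep, entry['dependent']])
print neg_words_sketch  # [['hygienic', 7]]
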
Exemple #45
0
from nltk.tokenize import sent_tokenize
from nltk.tag.stanford import NERTagger
from nltk.parse.stanford import StanfordParser
from corenlp import StanfordCoreNLP

wsj = open('wsj_0063.txt')

#extract named entities
nerTagger=NERTagger('stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz', 'stanford-ner-2014-08-27/stanford-ner.jar')
ner = []
for line in wsj:
	ner.append(nerTagger.tag(unicode(line,errors='ignore').split()))

#parse sentences
wsj.seek(0)  # rewind the file: the NER loop above read it to the end
paragraph = ""
for line in wsj:
	paragraph += line.replace('\n',' ')
sentences = sent_tokenize(paragraph)
parser = StanfordParser('stanford-parser-full-2014-10-31/stanford-parser.jar','stanford-parser-full-2014-10-31/stanford-parser-3.5.0-models.jar')
parsed = parser.raw_parse_sents(sentences)

#coreference
corenlp_dir = "stanford-corenlp-full-2014-08-27"
corenlp = StanfordCoreNLP(corenlp_dir)
corenlp.batch_parse(paragraph)

wsj.close()
Exemple #46
0
# -*- coding: utf-8 -*-
import jsonrpc
from simplejson import loads
from socket import error as SocketError
import errno
from corenlp import StanfordCoreNLP
import sys  

reload(sys)  
sys.setdefaultencoding('utf8')
corenlp_dir = "stanford-corenlp-full-2014-08-27/"
#server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080),timeout=200.0))
server = StanfordCoreNLP(corenlp_dir)

orig_file = open('pubmed_1000.csv', 'r')
new_file = open('coref-1000.csv', 'w')
count = 0
gotdata = 1
result = []
for line in orig_file.readlines():
    cols = line.split('\t')
    message =  cols[2]

    simplemessage = "Stanford University is located in California. It is a great university."
    print "Sending line: " + str(count)
    data = server.parse(message)
    '''
    while not gotdata:
        try:
            print "Sending line: " + str(count)
            data = server.parse(message)
def scrape_func(address, website):
    """
    Function to scrape various RSS feeds. Uses the 'keep' and 'ignore'
    iterables to define which words should be used in the text search.

    Inputs
    ------
    address : address for the RSS feed to scrape. String.

    website : name of the website to scrape to be used in the filepath for the
    output. String.

    database : name of the MongoDB database that contains the collections.
    String? pymongo connection object?
    """
    connection = MongoClient()
    db = connection.atrocities_data
    collection = db[website]

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    corenlp_dir = 'stanford-corenlp/'
    corenlp_parse = StanfordCoreNLP(corenlp_dir)

    log = open('log_file.txt', 'a')
    results = pattern.web.Newsfeed().search(address, count=100, cached=False)
    log1 = 'There are %d results from %s \n' % (len(results), website)
    log.write(log1)
    for result in results:
        if website in ('nyt', 'bbc', 'reuters', 'ap', 'upi', 'xinhua', 'google'):
            page_url = result.url
            if website == 'xinhua':
                page_url = page_url.encode('ascii')
                page_url = page_url.replace('"', '')
            text = pages_scrape.scrape(page_url, result.title)
            head_sentences = sent_detector.tokenize(text.strip())[:4]
            joined_sentences = ' '.join(head_sentences)
            parsed = corenlp_parse.raw_parse(joined_sentences)
            entry_id = mongo_connection.add_entry(collection, text, parsed,
                                                    result.title, result.url,
                                                    result.date, website)
            if entry_id:
                log2 = 'Added entry from %s with id %s \n' % (result.url,
                                                                str(entry_id)
                                                                )
                log.write(log2)
            else:
                log2 = 'Result from %s already in database \n' % (result.url)
                log.write(log2)
    interrupt = '+' * 70
    log3 = '%s\nScrape %s once at %s!\n%s\n' % (interrupt, website,
                                                datetime.datetime.now(),
                                                interrupt)
    log.write(log3)
    log.close()
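
# A self-contained sketch of the per-article step inside scrape_func() above
# (keep the first four sentences, join them, parse the result). The scraper,
# sentence splitter and parser are stubbed out so the snippet runs on its own;
# all names and values here are illustrative.
def fake_scrape(url, title):
    return "One. Two. Three. Four. Five is dropped."

def fake_sent_tokenize(text):
    return [s.strip() + '.' for s in text.split('.') if s.strip()]

def fake_raw_parse(text):
    return {"sentences": [{"text": text}]}

text = fake_scrape("http://example.com/story", "A title")
head_sentences = fake_sent_tokenize(text.strip())[:4]
joined_sentences = ' '.join(head_sentences)
parsed = fake_raw_parse(joined_sentences)
print joined_sentences                # One. Two. Three. Four.
print parsed["sentences"][0]["text"]
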
Exemple #49
0
from corenlp import StanfordCoreNLP
import simplejson as json

corenlp_dir = "/home/clai/lubbock/repos-3rd/stanford-corenlp-python/stanford-corenlp-full-2015-04-20/"

print "loading..."
corenlp = StanfordCoreNLP(corenlp_dir)

results = corenlp.raw_parse("Hello world. It's a wonderful day.")
print results

print json.dumps(results, indent=4)
Exemple #50
0
 def __init__(self, corenlp_dir):
     self.parser = StanfordCoreNLP(corenlp_dir)
	# 	# paragraph += re.sub(r'\([^)]*\)', '',d['line'])+' '
	# episodeLines.append(paragraph.replace('\n','').replace('                           ',' '))
	outfile = open('scripts/'+e+'.txt','w')
	paragraph = ""
	for d in searcher.documents(episode=e):
		outfile.writelines(d['line'].encode('utf-8')+' ')
		# outfile.writelines((d['speaker']+': '+d['line']).encode('utf-8')+' ')
	# 	paragraph += d['speaker']+': '+d['line']+' '
	# 	# paragraph += re.sub(r'\([^)]*\)', '',d['line'])+' '
	# paragraph = paragraph.replace('\n','').replace('                           ',' ')
	# outfile.writelines(paragraph.encode('utf-8'))
	outfile.close()

parsed = []
corenlp_dir = "stanford-corenlp-full-2014-08-27"
corenlp = StanfordCoreNLP(corenlp_dir)
for e in episodeNum:
	for d in searcher.documents(episode=e):
		parsed.append(corenlp.raw_parse(d))

# sentClient = StanfordNLPSentimentClient('http://localhost:8080')
# sentiment = []
# for t in text:
# 	sentiment.append(sentClient.classify(t))

# mask = imread("friends.gif")
wc = WordCloud(max_words=30,stopwords=STOPWORDS|{'s','t','m','re','oh','right','don','know','well','hey','gonna','okay','yeah','go','really','think','hi','uh','look','god','mean','one','ye','guy','y','got','come','now'},font_path='/Users/elaine/Library/Fonts/Berlin.ttf')
for c in mainChars:
	wc.generate(lines[uniqueSpeakers.index(c)])
	wc.to_file(c+".png")
 def __init__(self):
     corenlp_dir = "corenlp-python/stanford-corenlp-full-2014-08-27/"
     self.corenlp = StanfordCoreNLP(corenlp_dir)  # wait a few minutes...
     print("corenlp object initiated")
def compress(sentence):
    global parser
    if not parser:
        parser = StanfordCoreNLP(corenlp_dir)
    text = sentence.simple
    words = word_tokenize(text)
    w_features = [dict() for w in words]
    stemmed = [None for w in words]

    labels = list()


    # add basic features

    # first/last words
    for i in range(1,6):
        if i < len(words):
            for x in range(i):
                w_features[x]["infirst"+str(i)] = True
                w_features[-1-x]["inlast"+str(i)] = True

    #pos = [ x[1] for x in nltk.pos_tag(a.o_words) ]

    for i in range(len(words)):
        w = words[i]
        features = w_features[i]


        #capitalization
        if w.isupper():
            features["isupper"] = True
        elif w[0].isupper():
            features["firstupper"] = True

        w = w.lower()

        #word class
        if w in negation:
            features["negation"] = True
        elif w in punct:
            features["punct"] = True
        elif w in stopWords:
            features["stopWords"] = True

        #pos
        #a.posfeatures[i]["pos_"+pos[i]] = True

        # compute the basic term frequencies of all words in paragraphs
        # for use in building corpus-wide quarry term frequency
        if w not in model.idf.stopWords:
            termFreq[w] += 1

        stem = stemmer.stem(w)
        suffix = ""
        if len(stem) < len(w) and w.startswith(stem):
            suffix = w[len(stem):]
        stemmed[i] = (stem, suffix)

        features["stem_"+stemmed[i][0]] = True
        features["affix_"+stemmed[i][1]] = True


    #Stanford tree features
    text = text.encode('ascii', 'ignore')

    
    tree = None
    dependencies = None

    try:
        results = parser.raw_parse(text)
        tree = []
        dependencies = []

        for s in results['sentences']:
            tree.append(tree_re.search(s['parsetree']).group(0))
            dependencies += s['dependencies']


    except:
        print(text)
        print( "Unexpected error:", sys.exc_info()[0])


    #print(a.tree)
    if tree:
        tree = Tree.fromstring(tree[0].encode('ascii', 'ignore'))
        #print(str(tree))
        paths = list(getPathsToLeaves(tree))
        #print(paths)
        for i in range(min(len(paths), len(words))):
            #print(paths[i][1])
            w_features[i]["tree_depth_"+str(len(paths[i][1]))] = True
            for x in range(0,2):
                w_features[i][str(x)+"_up_"+paths[i][1][-1-x]] = True
            for n in paths[i][1]:
                w_features[i]["tree_"+n] = True
            w_features[i][str(paths[i][2])+"_from_left"] = True
        #print(a.treefeatures[0])
    if dependencies:
        #make a tree out of it
        d_tree = defaultdict(list)
        mother_relations = defaultdict(list)
        daughter_relations = defaultdict(list)
        for dep in dependencies:
            d_tree[dep[1]].append((dep[0], dep[2]))
            mother_relations[dep[1]].append(dep[0])
            daughter_relations[dep[2]].append(dep[0])

        #now we can check depth and such
        #print(d_tree)
        depths = getDepths(d_tree, u'ROOT', dict(), 0)
        #print(depths)

        for i in range(len(words)):
            w = words[i]
            treefeatures = w_features[i]
            if w in depths:
                w_depth = depths[w]
                treefeatures["dep_depth_"+str(w_depth)] = True
                if w_depth > 3:
                    treefeatures["dep_depth_over_3"] = True
                if w_depth > 5:
                    treefeatures["dep_depth_over_5"] = True
            if w in mother_relations:
                for rel in mother_relations[w]:
                    treefeatures["dep_mother_"+rel] = True
            if w in daughter_relations:
                for rel in daughter_relations[w]:
                    treefeatures["dep_daughter_"+rel] = True

    # get max tfidf for scaling
    maxtfidf = max( tf*idf.idf[w] for w, tf in termFreq.items() )

    partitions = 5

    # now add tfidf threshold features
    for i in range(len(words)):
        w = words[i].lower()
        if w not in stopWords and w not in punct:
            features = w_features[i]

            tfidf = termFreq[w] * idf.idf[w]
            scaled = tfidf / maxtfidf * partitions
            for x in range(1,partitions):
                if tfidf > x:
                    features[str(x*100/partitions)+"percenttfidf"] = True

    #for f in w_features:
    #    print(f)


    # add previous features and classify
    for i in range(len(words)):

        f = w_features[i].copy()

        for prev in range(2):
            if i > prev:
                prevstring = "prev"+str(prev)+"_"
                f[prevstring+labels[-1-prev]] = True

                prevfeatures = w_features[i-1-prev]
                for k,v in prevfeatures.items():
                    if not k.startswith("in"):
                        f[prevstring+k] = v

        #print("with prev:")
        #print(f)

        # classify
        vector = vec.transform(f)
        vector = selector.transform(vector)
        result = classifier.predict(vector)
        l = result[0]
        #print(l)

        labels.append(l)

    # use labels to clear out
    print(labels)

    retained_words = list()
    for i in range(len(labels)):
        if labels[i] != 'O':
            retained_words.append(words[i])

    newsentence = ""
    for i in range(len(retained_words)):
        if i != 0 and retained_words[i] not in punct and retained_words[i-1] not in ["``"]:
            newsentence += " "
        newsentence += retained_words[i]
        
    sentence.simple = newsentence

    return sentence
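
# A quick standalone check of the "first/last words" position features built at
# the top of compress() above; the word list is invented for illustration.
words = ['This', 'is', 'a', 'short', 'example', 'sentence', '.']
w_features = [dict() for w in words]
for i in range(1, 6):
    if i < len(words):
        for x in range(i):
            w_features[x]["infirst" + str(i)] = True      # word x is among the first i words
            w_features[-1 - x]["inlast" + str(i)] = True  # mirror for the last i words
print sorted(w_features[0].keys())   # ['infirst1', 'infirst2', 'infirst3', 'infirst4', 'infirst5']
print sorted(w_features[-1].keys())  # ['inlast1', 'inlast2', 'inlast3', 'inlast4', 'inlast5']
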
class StanforExtractor(object):
    def __init__(self):
        # Assumes the corenlp-python wrapper is importable (from corenlp import StanfordCoreNLP)
        # and that the 2014-08-27 CoreNLP distribution is unpacked at the path below.
        corenlp_dir = "corenlp-python/stanford-corenlp-full-2014-08-27/"
        self.corenlp = StanfordCoreNLP(corenlp_dir)  # loading the models takes a few minutes
        print("corenlp object initiated")

    def tag_text(self, text):
        """
        :param text:
        :return:
        """
        assert type(text) == str
        sents = self.corenlp.raw_parse(text)
        return sents

    def expand_rels_double(self, rel_words, sent):
        """
        :param rel_words: [wrd1,wrd2]
        :param sent: in tagged_text['sentences'], ['dependencies'] for each sent
        :return:
        """
        assert type(rel_words) == list
        assert type(sent) == list
        assert len(rel_words) == 2
        rel_tmp = [rel_words[0], rel_words[1]]
        for rel_1 in sent:
            if rel_1[1] == rel_words[0] and rel_1[2] == rel_words[1]:
                continue
            rel_1 = list(rel_1)
            # print(rel_1)
            # if prep_ or prepc_ is the tag
            # appos_tag = 1
            neg_tag = 0
            if rel_1[0].startswith(u"prep_") or rel_1[0].startswith(u"prepc_"):
                middle_word = rel_1[0][rel_1[0].find("_") + 1 :]
                rel_1 = [rel_1[1], middle_word, rel_1[2]]
            elif rel_1[0] == u"appos":
                rel_1 = [rel_1[1], rel_1[2]]
                # appos_tag = -1
            elif rel_1[0] == u"neg":
                # neg_tag = 1
                rel_1 = [rel_1[1], rel_1[2]]
            else:
                continue
                # rel_1 = [rel_1[1],rel_1[2]]
            if rel_words[0] in rel_1:
                append_start = 1
                rel_1.remove(rel_words[0])
            elif rel_words[1] in rel_1:
                append_start = -1
                rel_1.remove(rel_words[1])
            else:
                continue
            # append_start = append_start*appos_tag
            # if neg_tag == 1:
            #
            if append_start == 1:
                rel_tmp = [" ".join(rel_1)] + rel_tmp
            else:
                rel_tmp = rel_tmp + [" ".join(rel_1)]
        return rel_tmp

    def expand_rels_wordlist(self, rel_words, sent):
        """
        :param rel_words: [wrd1,wrd2,..]
        :param sent: in tagged_text['sentences'], ['dependencies'] for each sent
        :return:
        """
        assert type(rel_words) == list
        assert type(sent) == list
        rel_tmp = []
        for rel_1 in sent:  # each rel_1 is one [relation, governor, dependent] triple from the dependency parse
            # if rel_1[1] in rel_words and rel_1[2] in rel_words:
            #     continue
            rel_1 = list(rel_1)
            # print(rel_1)
            # if prep_ or prepc_ is the tag
            # appos_tag = 1
            neg_tag = 0
            if rel_1[0].startswith(u"prep_") or rel_1[0].startswith(u"prepc_"):
                middle_word = rel_1[0][rel_1[0].find("_") + 1 :]
                rel_1 = [rel_1[1], middle_word, rel_1[2]]
            elif rel_1[0] == u"appos":
                rel_1 = [rel_1[1], rel_1[2]]
                # appos_tag = -1
            elif rel_1[0] == u"neg":  # what to do here?
                # neg_tag = 1
                rel_1 = [rel_1[1], rel_1[2]]
            else:
                continue
            # note which query words appear, then drop them; building a new list avoids
            # the remove-while-iterating bug that can skip adjacent matches
            wrd_present = any(wrd in rel_words for wrd in rel_1)
            rel_1 = [wrd for wrd in rel_1 if wrd not in rel_words]
            if wrd_present:
                # pdb.set_trace()
                if len(rel_1) > 0:
                    rel_tmp.append(" ".join(rel_1))
        return " ".join(rel_tmp)

    def expand_rels(self, tmp_rels, sent):
        """
        add relevant sents to start or end of tmp_rels
        :param tmp_rels:
        :param sent:
        :return:
        """
        # pdb.set_trace()
        print("sent", sent)
        final_rels = []
        for rel_full in tmp_rels:
            rel_words = [rel_full[1], rel_full[2]]
            rel_tmp = self.expand_rels_double(rel_words, sent)
            final_rels.append(rel_tmp)
        # print('final_res:',final_rels)
        return final_rels

    def identify_rels(self, tagged_text):
        """
        :param tagged_text:
        :return:
        """
        assert "sentences" in tagged_text.keys()
        assert "dependencies" in tagged_text["sentences"][0].keys()
        all_rels = []
        for sent in tagged_text["sentences"]:
            tmp_rels = []
            for rel in sent["dependencies"]:
                if rel[0] in [u"nn", u"dobj"]:
                    tmp_rels.append(rel)
            if len(tmp_rels) > 0:
                final_rels = self.expand_rels(tmp_rels, sent["dependencies"])
                all_rels.append(final_rels)
        return all_rels

    def identify_word_rels(self, all_words, tagged_text):
        """
        :param all_words: list of words/phrases
        :param tagged_text:
        :return:
        """
        assert "sentences" in tagged_text.keys()
        assert "dependencies" in tagged_text["sentences"][0].keys()
        words_rels = {}
        # pdb.set_trace()
        for wrd in all_words:
            wrd_rels = []
            for sent in tagged_text["sentences"]:
                rel_frm_sent = self.expand_rels_wordlist(wrd.split(), sent["dependencies"])
                if len(rel_frm_sent) > 0:
                    wrd_rels.append(rel_frm_sent)
            words_rels[wrd] = ",".join(wrd_rels)
        return words_rels

    def identify_time(self, text):
        """
        :param text:
        :return:
        """
        time_strs = []
        text_tag = self.tag_text(text)
        for sent in text_tag["sentences"]:
            words = sent["words"]
            prev_wrd_tag = False
            for wrd in words:
                wrd_tag = wrd[1]
                assert type(wrd_tag) == dict
                # if u'Timex' in wrd_tag:
                #     timex_string = wrd_tag['Timex']
                #     new_end = timex_string.rfind('</TIMEX3>')
                #     timex_string = timex_string[:new_end]
                #     new_start = timex_string.rfind('>')
                #     time_word = timex_string[new_start+1:]
                #     time_strs.append(time_word)
                if u"NamedEntityTag" in wrd_tag:
                    if wrd_tag[u"NamedEntityTag"] in [u"DATE", u"TIME"]:
                        if not prev_wrd_tag:
                            time_strs.append(wrd[0])
                        else:
                            prev_wrd = time_strs.pop()
                            new_wrd = prev_wrd + " " + wrd[0]
                            time_strs.append(new_wrd)
                        prev_wrd_tag = True
                    else:
                        prev_wrd_tag = False
                else:
                    prev_wrd_tag = False
        time_final = []
        for wrd in time_strs:
            if wrd not in time_final:
                time_final.append(wrd)
        return time_final

    def ret_time_rels(self, text):
        """
        :param text:
        :return:
        """
        tagged_text = self.tag_text(text)
        all_times = self.identify_time(text)
        time_rels = self.identify_word_rels(all_times, tagged_text)
        return time_rels

    def return_rels(self, text):
        """
        :param text:
        :return:
        """
        text_tag = self.tag_text(text)
        rels_all = self.identify_rels(text_tag)
        return rels_all

    def identify_name(self, text):
        """
        :param text:
        :return:
        """
        name_strs = []
        text_tag = self.tag_text(text)
        for sent in text_tag["sentences"]:
            words = sent["words"]
            prev_wrd_tag = False
            for wrd in words:
                wrd_tag = wrd[1]
                assert type(wrd_tag) == dict
                # if u'Timex' in wrd_tag:
                #     timex_string = wrd_tag['Timex']
                #     new_end = timex_string.rfind('</TIMEX3>')
                #     timex_string = timex_string[:new_end]
                #     new_start = timex_string.rfind('>')
                #     time_word = timex_string[new_start+1:]
                #     time_strs.append(time_word)
                if u"NamedEntityTag" in wrd_tag:
                    if wrd_tag[u"NamedEntityTag"] in [u"PERSON"]:
                        if not prev_wrd_tag:
                            name_strs.append(wrd[0])
                        else:
                            prev_wrd = name_strs.pop()
                            new_wrd = prev_wrd + " " + wrd[0]
                            name_strs.append(new_wrd)
                        prev_wrd_tag = True
                    else:
                        prev_wrd_tag = False
                else:
                    prev_wrd_tag = False
        names_final = []
        for wrd in name_strs:
            if wrd not in names_final:
                names_final.append(wrd)
        return names_final
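A minimal usage sketch for the extractor class above; the sample sentence is invented, and running it assumes the corenlp-python wrapper and the 2014-08-27 models referenced in __init__ are installed locally.

# Hypothetical usage of StanforExtractor; the input text is an assumption.
extractor = StanforExtractor()
text = "Barack Obama met Angela Merkel in Berlin on 9 November 2016."
print(extractor.identify_name(text))   # merged PERSON entities
print(extractor.identify_time(text))   # merged DATE/TIME expressions
print(extractor.ret_time_rels(text))   # context words related to each time expression
print(extractor.return_rels(text))     # expanded nn/dobj relations per sentence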
Exemple #55
0
stanford = None  # module-level cache so the parser is only loaded once (assumed to be defined in the original module)

def stanfordParse(text, corenlpDir='stanford-corenlp-full-2013-11-12/'):
    global stanford
    if stanford is None:
        stanford = StanfordCoreNLP(corenlpDir)
    return stanford.raw_parse(text)
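A hedged usage sketch for the lazy-loading helper above; the sample sentence is invented, and the 'sentences'/'dependencies' keys mirror how the raw_parse result is indexed elsewhere in these examples.

# Assumes the stanford-corenlp-full-2013-11-12/ distribution is unpacked locally.
parsed = stanfordParse("Stanford CoreNLP also produces typed dependencies.")
print(parsed['sentences'][0]['dependencies'])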
Exemple #56
0
# -*- coding: utf-8 -*-
from corenlp import StanfordCoreNLP
import time

local_corenlp_path = './tmp'

# Simple usage
nlp = StanfordCoreNLP(local_corenlp_path)

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))

del nlp
time.sleep(10)
# Other human languages support, e.g. Chinese
nlp = StanfordCoreNLP(local_corenlp_path, lang='zh', quiet=False)

sentence = '清华大学位于北京。'
print(nlp.word_tokenize(sentence))
print(nlp.pos_tag(sentence))
print(nlp.ner(sentence))
print(nlp.parse(sentence))
print(nlp.dependency_parse(sentence))

del nlp
time.sleep(10)
# General Stanford CoreNLP API
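The snippet breaks off at the comment above. Assuming this wrapper follows the common stanfordcorenlp-style interface, the general API is an annotate() call driven by a properties dict; the annotator list and output format below are illustrative assumptions rather than part of the original example.

# Hedged sketch of the general API; annotate() and close() are assumed to exist
# on this wrapper, as they do in the widely used stanfordcorenlp package.
nlp = StanfordCoreNLP(local_corenlp_path)
props = {'annotators': 'tokenize,ssplit,pos', 'outputFormat': 'json'}
print(nlp.annotate('Guangdong University of Foreign Studies is located in Guangzhou.',
                   properties=props))
nlp.close()  # shut down the backing server when finished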
Exemple #57
0
    def loadParser(self):

        corenlp_dir = os.environ['STANFORD']
        self.parser = StanfordCoreNLP(corenlp_dir +
                                      "/")  # wait a few minutes...
Exemple #58
0
    u'.'
]

ahs_test = "And be it further enacted, That the seat of government of said Territory is hereby located temporarily at Fort Leavenworth; and that such portions of the public buildings as may not be actually used and needed for military purposes, may be occupied and used, under the direction of the Governor and Legislative Assembly, for such public purposes as may be required under the provisions of this act."

# Thrift transport/protocol imports; the StanfordCoreNLP client class itself is
# generated by Thrift from the CoreNLP service definition, and `server`/`port`
# are assumed to be defined earlier in the original script.
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol

# Make socket
transport = TSocket.TSocket(server, port)

# Buffering is critical. Raw sockets are very slow
transport = TTransport.TBufferedTransport(transport)

# Wrap in a protocol
protocol = TBinaryProtocol.TBinaryProtocol(transport)

# Create a client to use the protocol encoder
client = StanfordCoreNLP.Client(protocol)

# Connect!
transport.open()

# This list is for options for how we'd like the output formatted.  See README.md for the full list of possible options.
# Note that the DEFAULT is what you would get if you specified "oneline" on the command line, or "None" here.
# You have to pass in something, and unfortunately it doesn't seem like that something can be None or an empty list.
# See http://diwakergupta.github.io/thrift-missing-guide/#_defining_structs for a possible explanation as to why...
# So, the following examples are VALID values for the second argument to these parse_* methods.
# (There are, of course, many more valid combinations depending on what the Stanford Parser supports.)
#outputOptions = ["-outputFormat", "typedDependencies,penn", "-outputFormatOptions", "basicDependencies"]
outputOptions = ["-outputFormat", "oneline"]
#outputOptions = ["-outputFormat", "typedDependencies"]
'''
try:
Exemple #59
0
from corenlp import StanfordCoreNLP

corenlp_dir = "../../Scripts/stanford-corenlp-full-2014-08-27/"
corenlp = StanfordCoreNLP(corenlp_dir)  # wait a few minutes...

result = corenlp.raw_parse(
    "What is birth date of the wife of the first black president of the United States?"
)

print(result['sentences'][0]['dependencies'])
def typedependencies(sent_list,neg_words,compound_word_list):
    """Run each sentence through a CoreNLP server and collect basic dependencies,
    POS tags, proper names, negated words and compound-word pieces.
    neg_words and compound_word_list are also extended in place for the caller."""
    pos_dict = {}
    depend_dict = {}
    depend_list = []
    proper_names = []
    # neg_words = []
    compound_dic = {}

    # assumes a CoreNLP server is already listening on localhost:9000
    nlp = StanfordCoreNLP('http://localhost:9000')
    for i in range(len(sent_list)):
        compound_list = []
        print sent_list[i]
        output = nlp.annotate(sent_list[i], properties={
                    'annotators': 'tokenize,ssplit,pos,depparse,parse,ner',
                    'outputFormat': 'json'
                    })
        # pprint.pprint(output)
        x = output['sentences'][0]['basic-dependencies']
        # pprint.pprint(output['sentences'][0]['parse'])
        # pprint.pprint(x)
        # print '-------------------------------------------------'
        for j in range(len(x)):
         
            if 'compound' in x[j]['dep']:
                # compound_word(x[j])
                ll = [x[j]['governorGloss'],x[j]['governor'],
                        x[j]['dependentGloss'],x[j]['dependent']]
                compound_dic[x[j]['governor']] = x[j]['governorGloss']
                compound_dic[x[j]['dependent']] = x[j]['dependentGloss']
                # compound_list.append(ll)

            d = [x[j]['dep'],x[j]['governorGloss'],str(x[j]['governor'])
                ,x[j]['dependentGloss'],str(x[j]['dependent'])]
            depend_list.append(d)


            # getting the negative words..
            if 'neg' in x[j]['dep']:
                x1 = x[j]['governorGloss'].lower()
                x2 = x[j]['dependentGloss'].lower()
                if x1 not in stopwords:
                    neg_words.append([x1,x[j]['governor']])
                else:
                    neg_words.append([x2,x[j]['dependent']])

            if 'conj' in x[j]['dep']:
                x1 = x[j]['governorGloss'].lower()
                x2 = x[j]['dependentGloss'].lower()
                if x1 in neg_prefix:
                    neg_words.append([x2,x[j]['dependent']])
                # elif (x2 == 'not' or x2 == 'nor' or x2 == 'non'):
                #   neg_words.append(x1)
                elif x2 in neg_prefix:
                    neg_words.append([x1,x[j]['governor']])

            print (x[j]['dep'] + '-->' + x[j]['governorGloss'] + '-' 
                + str(x[j]['governor']) + ' ' + x[j]['dependentGloss'] +
                 '-' + str(x[j]['dependent']))
        print '==================================='
        

        for key,value in sorted(compound_dic.items()):
            compound_list.append([key,value])
        # print compound_word(compound_list)  
        compound_dic.clear()
        

        y = output['sentences'][0]['tokens']
        for k in range(len(y)):
            pos_dict[y[k]['word']] = y[k]['pos']
            if 'NNP' in y[k]['pos']:
                proper_names.append(y[k]['word'])

        depend_dict[i] = depend_list
        depend_list = []

        if len(compound_list) > 0:
            w = compound_word(compound_list)
        else:
            w = []
        for jj in range(len(w)):
            if w[jj] != '':
                print w[jj]
                compound_word_list.append(w[jj])

    print '--------NAMES------' + str(proper_names)
    print '--------NEGATIVE----' + str(neg_words)
    return depend_dict,pos_dict,proper_names
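typedependencies calls a compound_word helper that is not included in this excerpt; below is a minimal reconstruction under the assumption that it merges tokens with consecutive indices into multi-word expressions (the body is a guess, not the original code).

def compound_word(compound_list):
    # Hypothetical reconstruction: compound_list holds [token_index, word] pairs
    # sorted by index; join runs of consecutive indices into a single phrase.
    phrases = []
    current = []
    prev_idx = None
    for idx, word in compound_list:
        if prev_idx is not None and idx != prev_idx + 1:
            phrases.append(' '.join(current))
            current = []
        current.append(word)
        prev_idx = idx
    if current:
        phrases.append(' '.join(current))
    return phrases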