def splitCol8toWords(): 
    ## using corenlp to do split up job
    ## corenlp setting
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    
    ## load dataset
    with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTupleDec9.json') as t:
        trainTup = json.load(t)

    fres = open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_Dec9.txt','w')

    split_res = []
    for num,tup in enumerate(trainTup):
        ## after modify col8 and save, col8 now may be empty..
        if not tup[8]:
            continue
        ## use corenlp to splitup
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]['words']
        temp = []
        for s in slist:
            temp.append(s[0])
        split_res.append([tup[0],tup[1],tup[6],temp])
        fres.write(tup[0]+'\t'+tup[1]+'\t'+tup[6]+'\t'+','.join(temp)+'\n')
        print 'No.', num,tup[6]
        print tup[8]
        print [tup[0],tup[1],tup[6],temp]
    ## record new dataset
    with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_Dec9.json','w') as f:
        json.dump(split_res,f)
    fres.close()
Example #2
def stanford_parse(corpus_path):
    """Parse a directory (recursively) with the Stanford parser..."""
    import os
    import ast
    try:
        from corenlp import StanfordCoreNLP
    except ImportError:
        raise ValueError("CoreNLP not installed.")
    path_part, corpus_name = os.path.split(corpus_path)
    new_corpus_folder = 'parsed_%s' % corpus_name
    new_corpus_path = os.path.join(path_part, new_corpus_folder)
    if not os.path.exists(new_corpus_path):
        os.makedirs(new_corpus_path)
    corenlp = StanfordCoreNLP()
    for root, dirs, files in os.walk(corpus_path, topdown=True):
        for name in files:
            filepath = os.path.join(root, name)
            with open(filepath) as f:
                raw = f.read()
            parsed_text = ast.literal_eval(corenlp.parse(raw))
            for index, sent in enumerate(parsed_text['sentences']):
                syntax_tree = sent['parsetree']
                plain_text = sent['text']
            # assume the subcorpus is named after the directory currently being walked
            subcorpus_name = os.path.basename(root)
            subcorpus_path = os.path.join(new_corpus_path, subcorpus_name)
            if not os.path.exists(subcorpus_path):
                os.makedirs(subcorpus_path)
def afterModifyCol8_splitCol8():
    col8_splitup = []
    with open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/npovTrail2/Nov8data/train_afterdataclean_modifiedcleanedTupleNov8.json') as t:
        trainTup = json.load(t)
    corenlp_dir = "stanford-corenlp-full-2014-08-27V3.4.1"
    corenlp = StanfordCoreNLP(corenlp_dir)
    for num, tup in enumerate(trainTup):
        print 'No.',num
        print 'orin: ',tup[8]
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        print par
        slist =  par["sentences"][0]['words']
        # print slist
        temp = []
        for s in slist:
            temp.append(s[0])
        col8_splitup.append(temp)
        print temp
        ## check dependencies split
        dlist = par['sentences'][0]['dependencies']
        demp = []
        for d in dlist:
            demp.append(d)
        print demp
        if num == 4:
            break
def afterModifyCol8_splitCol8():
    col8_splitup = []
    with open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/npovTrail2/Nov8data/train_afterdataclean_modifiedcleanedTupleNov8.json') as t:
        trainTup = json.load(t)
    corenlp_dir = "stanford-corenlp-full-2014-08-27V3.4.1"
    # corenlp_dir = "stanford-corenlp-full-2015-01-29"
    # corenlp_dir = "stanford-corenlp-full-2013-06-20"
    corenlp = StanfordCoreNLP(corenlp_dir)
    # res = corenlp.parse("Bell, a company which is based in LA, makes and distributes computer products. I hate you.")
    # par = json.loads(res)
    # for i in  par["sentences"][0]['dependencies']:
    #     print i
    for num, tup in enumerate([trainTup[1853]]):
        print 'No.',num
        print 'orin: ',tup[8]
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        # print par
        slist =  par["sentences"][0]['words']
        # print slist
        temp = []
        for s in slist:
            temp.append(s[0])
        col8_splitup.append(temp)
        print temp
        ## check dependencies split
        dlist = par['sentences'][0]['dependencies']
        demp = []
        for d in dlist:
            demp.append(d)
        print demp
        if num == 4:
            break
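For reference, every example in this collection reads the result of corenlp.parse() the same way. A minimal sketch of that access pattern follows; the document shape shown is inferred from how these functions index into it rather than from the wrapper's documentation, and the sample sentence is the one used in Example #15 below.

import json
from corenlp import StanfordCoreNLP

corenlp = StanfordCoreNLP("stanford-corenlp-full-2014-08-27/")
par = json.loads(corenlp.parse("Stanford University is located in California."))
first = par["sentences"][0]
for word in first['words']:
    print word[0]                        # surface token
    print word[1]['Lemma']               # per-token attribute dict
    print word[1]['PartOfSpeech']
for dep in first['dependencies']:
    print dep                            # one dependency entry (relation plus its arguments)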
def f1f2f3f4f5f6f7(file_,file2_):
    ## using corenlp to do split up job
    ## corenlp setting
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    ## load dataset
    with open(file_) as t:
        trainTup = json.load(t)
    ## data structure to hold fea1 to fea7 a list
    feaLst = []
    for num,tup in enumerate(trainTup):
        ## after modify col8 and save, col8 now may be empty..
        if not tup[8]:
            continue
        print "No. %d tup in processing.." % (num)
        ## use corenlp to splitup
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        print tup[8]
        ## use corenlp to get lemma and pos
        for p,word in enumerate(par["sentences"][0]['words']):
            print str(p)+'th w in tuple '+str(num)
            
            
            tmp = {}
            tmp['Word'] = word[0]
            tmp['Lemma'] = word[1]['Lemma']
            tmp['POS'] = word[1]['PartOfSpeech']
            feaLst.append(tmp)
        ## add pos-1, pos+1, pos-2 and pos+2 context features
        ## note: feaLst accumulates across tuples, so the window below is taken
        ## over the whole list built so far and can cross sentence boundaries
        slen = len(feaLst)
        for ind,val in enumerate(feaLst):
            if ind-1 >= 0:
                val['POS-1'] = feaLst[ind-1]['POS']
            else:
                val['POS-1'] = "NA"

            if ind+1 <= slen-1:
                val['POS+1'] = feaLst[ind+1]['POS']
            else:
                val['POS+1'] = "NA"

            if ind-2 >= 0:
                val['POS-2'] = feaLst[ind-2]['POS']
            else:
                val['POS-2'] = "NA"

            if ind+2 <= slen-1:
                val['POS+2'] = feaLst[ind+2]['POS']
            else:
                val['POS+2'] = "NA"
        
    for i in feaLst:
        print 'w:',i['Word'],' lemma:',i['Lemma'],' pos-2:',i['POS-2'],' pos-1:',i['POS-1'],' pos:',i['POS'],' pos+1:',i['POS+1'],' pos+2:',i['POS+2']
        
    with open(file2_,'w') as o:
        json.dump(feaLst,o)
    print len(feaLst)
    print len(trainTup)
def corenlpLemmaPOS_stanfordparserDependency_split_equalChecking():
    ## corenlp setting
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    ## stanfordDependencies setting
    sd = StanfordDependencies.get_instance(backend="subprocess", version="3.4.1")
    os.environ["STANFORD_PARSER"] = "stanford-parser-full-2014-08-27/"
    os.environ["STANFORD_MODELS"] = "stanford-parser-full-2014-08-27/"
    parser = stanford.StanfordParser(
        model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )

    with open("../../dataclean_Nov8_2015/train_afterdataclean_modifiedcleanedTupleNov8.json") as t:
        trainTup = json.load(t)
    for num, tup in enumerate(trainTup):
        ## after modify col8 and save, col8 now may be empty..
        if not tup[8]:
            continue
        ## use corenlp to split sentence
        print "No.", num
        print tup[8]
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]["words"]
        print slist
        temp = []
        for s in slist:
            temp.append(s[0])
        print temp
        ## use stanfordDependencies to do split sentence
        sentences = parser.raw_parse(tup[8])
        s = ""
        for line in sentences:
            for sentence in line:
                s += str(sentence)

        sent = sd.convert_tree(s)
        print sent
        detemp = []
        for t in sent:
            detemp.append(t[1])
        print detemp
        for di, ti in zip(detemp, temp):
            if di == ti:
                pass
            else:
                if (
                    (ti == "(" and di == "-LRB-")
                    or (ti == ")" and di == "-RRB-")
                    or (ti == "[" and di == "-LSB-")
                    or (ti == "]" and di == "-RSB-")
                ):
                    print "diff in parenthesis"
                    pass
                else:
                    print "{", di, " ,", ti, " }"
def checkCoreNLPSplit_DependencySplit(file_):
    with open(file_) as f:
        tset = json.load(f)
    ## corenlp setting
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    
    ## stanfordDependencies setting
    sd = StanfordDependencies.get_instance(backend="subprocess",version='3.4.1')
    os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2014-08-27/'
    os.environ['STANFORD_MODELS'] = 'stanford-parser-full-2014-08-27/'
    parser = stanford.StanfordParser(model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    
    for num, tup in enumerate(tset):
        print num
        if not tup[8]:
            continue
        ## use corenlp to splitup
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]['words']
        temp = []
        for s in slist:
            temp.append(s[0])
        
        ## use stanfordDependencies to do split sentence
        sentences = parser.raw_parse(tup[8])
        s=""
        for line in sentences:
            for sentence in line:
                s+=str(sentence)

        sent = sd.convert_tree(s)
        
        detemp = []
        for t in sent:
            detemp.append(t[1])
            
        ## check if same

        for di,ti in zip(detemp,temp):
            if di == ti:
                pass
            else:
                if (ti == '(' and di == '-LRB-') or (ti == ')' and di == '-RRB-') or (ti == '[' and di == '-LSB-') or (ti == ']' and di == '-RSB-'):
                    print "diff in parenthesis"
                    pass
                else:
                    print "!!!"
                    print "{",di,' ,',ti," }"
def parse(sentence):

    from corenlp import StanfordCoreNLP

    parser = StanfordCoreNLP()

    data = parser.parse(sentence)
    #print data

    with open("data.json", "wb") as open_file:
        open_file.write(data)
def remove_tuples0MoreThan1BiasedWord_fromOriginalTuple():
    ## using corenlp to do split up job
    ## corenlp setting
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    
    ## load dataset
    with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTupleDec9.json') as t:
        trainTup = json.load(t)
    # with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_stripPuncNum_Dec9.json') as a:
        # verify = json.load(a)
    b =  open('../../devDataclean_Dec8_2015/dev_biasword0ormorethan1_modifiedFile_dec11.txt','w')

    res2 = []
    for num,tup in enumerate(trainTup):
        print num
        ## after modify col8 and save, col8 now may be empty..
        if not tup[8]:
            continue
        ## use corenlp to splitup
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]['words']
        temp = []
        for s in slist:
            temp.append(s[0])

        ## count of biased word
        cnum = temp.count(tup[6])
        
        if cnum == 1:
            ## verify if the qualified sent is the same as the split col8 file: dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_Dec9.json
            # if (verify[num][2] == tup[6]) and (verify[num][0] == tup[0]):
            res2.append(tup)
            # else:
                # print "two file are diff"
                # print verify[num]
                # print tup
                # sys.exit()
        else:
            b.write(str(tup)+'\n') 
    with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_elimBiasWord0orMoreThanOne_fullTup_Dec11.json','w') as f:
        json.dump(res2,f)
    b.close()
Example #10
def stanford_parse(data, corpus_name = 'corpus'):
    from time import localtime, strftime
    thetime = strftime("%H:%M:%S", localtime())
    print "\n%s: Initialising CoreNLP... \n" % thetime
    import os
    import ast
    try:
        from corenlp import StanfordCoreNLP
    except:
        raise ValueError("CoreNLP not installed.")
    from corpkit.progressbar import ProgressBar
    corenlp = StanfordCoreNLP()
    if not os.path.exists(corpus_name):
        os.makedirs(corpus_name)
    p = ProgressBar(len(data))
    for index, datum in enumerate(data):
        p.animate(index)
        text = datum[0]
        metadata = datum[1]
        number_of_zeroes = len(str(len(data))) - 1
        filename = str(index).zfill(number_of_zeroes) + '.txt' 
        file_data = []
        parsed_text = ast.literal_eval(corenlp.parse(text))
        trees = []
        raw_texts = []
        for sent in parsed_text['sentences']:
            syntax_tree = sent['parsetree']
            plain_text = sent['text']
            trees.append(syntax_tree)
            raw_texts.append(plain_text)
        file_data = ['<raw>' + '\n'.join(raw_texts) + '\n</raw>', '<parse>' + '\n'.join(trees) + '\n</parse>', ]
        if not os.path.exists(os.path.join(corpus_name, metadata)):
            os.makedirs(os.path.join(corpus_name, metadata))
        try:
            with open(os.path.join(corpus_name, metadata, filename), "w") as fo:
                fo.write('\n'.join(file_data))
        except IOError:
            print "Error writing file."
    p.animate(len(data))
    print 'Done!'
Example #11
def parse_file():
    sentence_file = open('sentences.txt', 'w')
    dep_file = open('deps.txt', 'w')
    tree_file = open('trees.txt', 'w')
    abstracts = [line.strip() for line in open('relabs.txt', 'r')]
    corenlp = StanfordCoreNLP()
    for abstract in abstracts:
        parse = corenlp.parse(abstract)
        xml = json.loads(parse)
        sentences = xml['sentences']
        for sentence in sentences:
            # Write sentence
            sentence_file.write(sentence['text'] + "\n")
            # Write parse tree
            tree_file.write(sentence['parsetree'] + "\n")
            # Write dependencies
            for dep in sentence['dependencies']:
                dep_file.write('@'.join(dep) + "\t")
            dep_file.write("\n")
    dep_file.close()
    tree_file.close()
    sentence_file.close()
Example #12
def split_one(params):
    from corenlp import StanfordCoreNLP
    index, fn, start, end = params
    # skip 'start' line
    lno = 0
    fin = open(fn)
    while lno < start:
        fin.readline()
        lno += 1

    ret = []
    parser = StanfordCoreNLP("stanford-corenlp-full-2015-01-29/", properties="default.properties", serving=False)
    for i in xrange(start, end):
        line = fin.readline()

        ll = line.decode('utf8').strip().split('\t')
        """pay attention to here !!!"""
        # if len(ll) != 3:
        #     continue
        # if not ll[2].endswith('@en'):
        #     continue
        # text = ll[2][1:-4]
        if len(ll) != 2:
            continue
        text = ll[1]
        text = text.replace('\\n', ' ').replace('\\r', ' ')
        try:
            rsp = json.loads(parser.parse(text))

            sentences = []
            for s in rsp['sentences']:
                sentences.append(s['text'])
            ret.append(('%s\t%s' % (ll[0], '\t'.join(sentences))).encode('utf8'))
        except Exception as e:
            print e

    fin.close()
    return ret
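split_one() deliberately takes a single (index, filename, start, end) tuple, so line ranges of a large file can be processed in parallel. One way to drive it is sketched below; the chunk size, pool size, and the assumption that the caller already knows the total line count are illustrative, not part of the original code.

from multiprocessing import Pool

def split_all(fn, total_lines, n_workers=4, chunk=10000):
    # build one (index, filename, start, end) task per chunk of lines
    tasks = []
    for i, start in enumerate(range(0, total_lines, chunk)):
        tasks.append((i, fn, start, min(start + chunk, total_lines)))
    pool = Pool(n_workers)
    parts = pool.map(split_one, tasks)   # each worker starts its own CoreNLP instance
    pool.close()
    pool.join()
    # flatten the per-chunk result lists back into one list of output lines
    return [line for part in parts for line in part]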
Example #13
def local_split_description(fn_in, fn_out):
    from corenlp import StanfordCoreNLP
    parser = StanfordCoreNLP("stanford-corenlp-full-2015-01-29/", properties="default.properties", serving=False)
    with open(fn_out, 'w') as fout:
        with open(fn_in) as fin:
            for line in fin:
                ll = line.decode('utf8').strip().split('\t')
                if len(ll) != 3:
                    continue
                if not ll[2].endswith('@en'):
                    continue
                text = ll[2][1:-4]

                text = text.replace('\\n', ' ').replace('\\r', ' ')
                try:
                    rsp = json.loads(parser.parse(text))

                    sentences = []
                    for s in rsp['sentences']:
                        sentences.append(s['text'])

                    print >> fout, ('%s\t%s' % (ll[0], '\t'.join(sentences))).encode('utf8')
                except Exception as e:
                    print e.message
Example #14
class StoreParser():
    
    def __init__(self):
        self.corenlp = StanfordCoreNLP()
        self.treemanipulator = TreeManipulator()
        self.stemmer = Stemmer()
        self.blacklist = [line.strip() for line in open("blacklist.txt", 'r')]

    def parse_abstract(self, id):
        count_dict = dict()
        if id in self.blacklist:
            print 'Id {0} is blacklisted.'.format(id)
            return count_dict
        path = "NPs/" + id + ".nps"
        try:
            with open(path, 'r') as file:
                for line in file:
                    line = line.strip()
                    split = line.split("\t")
                    stemmed_np = split[0]
                    count = split[1]
                    unstemmed = dict()
                    for i in xrange(2,len(split),2):
                        unstemmed[split[i]] = int(split[i+1])
                    count_dict[stemmed_np] = [int(count), unstemmed]
        except IOError:
            try:
                print "Parsing " + str(id)
                try:
                    abstract = medeley_fetch.get_abstract_for_id(id)
                except Exception as e:
                    if e.args[0] == "TooManyRequestsException":
                        print "Skipping due to server overload, consider halting program..."
                    elif e.args[0] == "PaperHasNoAbstractException":
                        print "Object has no abstract, probably not a paper..."
                    else: 
                        print "Unknown exception occured when fetching paper..."
                    return count_dict
                # Can happen due to server overload, but apparently for other reasons as well
                parse = self.corenlp.parse(abstract)
                document = json.loads(parse)
                with open("Parses/" + id + ".json", 'w') as file:
                    file.write(parse)
                # Extract all the nps from the parse trees
                # TODO: Not that important, I guess
                for sentence in document['sentences']:
                    parse_tree = sentence['parsetree']
                    nltk_tree = Tree(parse_tree)
                        
                    nps = self.treemanipulator.get_all_np_variations(nltk_tree)
                    for original_np in nps:      
                        if original_np != "":
                            stemmed_np = self.stemmer.stem_string(original_np)
                            if stemmed_np in count_dict.keys():
                                count_dict[stemmed_np][0] += 1
                                count_dict[stemmed_np][1][original_np] += 1
                            else:
                                count_dict[stemmed_np] = [1, defaultdict(int)]
                                count_dict[stemmed_np][1][original_np] = 1
                with open(path, 'w') as file:
                    for key in count_dict.iterkeys():
                        file.write(str(key) + "\t" + str(count_dict[key][0]) + "\t")
                        for original_np in count_dict[key][1].iterkeys():
                            file.write(str(original_np) + "\t" + str(count_dict[key][1][original_np]) + "\t")
                        file.write("\n")
            except pexpect.ExceptionPexpect:
                print "Timeout during parsing. Verify that the content is rubbish, and add to the blacklist..."
                exit()
        return count_dict
Example #15
corenlp_dir = "stanford-corenlp-full-2014-08-27/"
#server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080),timeout=200.0))
server = StanfordCoreNLP(corenlp_dir)

orig_file = open('pubmed_1000.csv', 'r')
new_file = open('coref-1000.csv', 'w')
count = 0
gotdata = 1
result = []
for line in orig_file.readlines():
    cols = line.split('\t')
    message =  cols[2]

    simplemessage = "Stanford University is located in California. It is a great university."
    print "Sending line: " + str(count)
    data = server.parse(message)
    '''
    while not gotdata:
        try:
            print "Sending line: " + str(count)
            data = server.parse(message)
            gotdata = 1
        except:
            print "The connection got reseted."       
            break
    '''
    result = loads(data)
    #gotdata = 0
    if 'coref' in result.keys():
        for coref in result['coref']:
            for one in coref:
Example #16
# -*- coding: utf-8 -*-
from corenlp import StanfordCoreNLP
import time

local_corenlp_path = './tmp'

# Simple usage
nlp = StanfordCoreNLP(local_corenlp_path)

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))

del nlp
time.sleep(10)
# Other human languages support, e.g. Chinese
nlp = StanfordCoreNLP(local_corenlp_path, lang='zh', quiet=False)

sentence = '清华大学位于北京。'
print(nlp.word_tokenize(sentence))
print(nlp.pos_tag(sentence))
print(nlp.ner(sentence))
print(nlp.parse(sentence))
print(nlp.dependency_parse(sentence))

del nlp
time.sleep(10)
# General Stanford CoreNLP API
import pdb

# Get the corpus-file open
corpusjson = 'protest.json'
jsonobject = json.load(codecs.open(corpusjson))

# Get and clean the text:
texts = (clean_html(parse(StringIO(obj[4].replace(
    "\n", " ")))).getroot().text_content() for obj in jsonobject)
print "Story text generator object created."

# Turn it into a string object, then an html object, then back into string...
#texts = clean_html(parse(StringIO(text))).getroot().text_content()

print "Setting up parser: "
# Set up the parser
stanford_parser = StanfordCoreNLP()

print "Creating parser generator object: "
# Parse dat.
parsed_texts = (stanford_parser.parse(unicode(text)) for text in texts)

# Save the result to a file
# Not sure how enumerate() works with generators; ostensibly a wrapper which
# retains laziness, but I don't wanna risk it and introduce more variables.
i = 0  # So, it's gross. Whatever.
for story in parsed_texts:
    i += 1
    with codecs.open(str(i) + ".json", 'w') as fh:
        json.dump(json.loads(story), fh, indent=2)