def features(docList):
    import time
    import operator
    import pickle
    from collections import defaultdict

    from bllipparser import RerankingParser
    from bllipparser.ModelFetcher import download_and_install_model

    # download model (only needs to be done once)
    model_dir = download_and_install_model('WSJ', '/tmp/models')
    # Loading the model is slow, but only needs to be done once
    rrp = RerankingParser.from_unified_model_dir(model_dir)
    rrp.set_parser_options(nbest=5)
    features = []
    scores = []
    with open("output_log.txt",
              "w") as logF, open("syn_feats.pkl",
                                 "w") as synFile, open("syn_scores.pkl",
                                                       "w") as scoresFile:

        for i, doc in enumerate(docList):
            start_time = time.time()

            features.append(defaultdict(float))
            scores.append(defaultdict(list))

            for sentence in doc:

                parses = rrp.parse(sentence, rerank=False)
                #print(len(parses))
                #print(sentence, file = logF)
                try:
                    parse_score = parses[0].parser_score
                    rerank_score = parses[0].reranker_score
                    scores[i]['parse'].append(parse_score)
                    scores[i]['rerank'].append(rerank_score)
                    scores[i]['sent_length'].append(
                        len(parses[0].ptb_parse.tokens()))

                    best_parse = parses[0].ptb_parse
                    # print(best_parse, file = logF)

                    for t in best_parse.all_subtrees():
                        levels = buildSubtrees(t)
                        for l in levels:
                            features[i][l] += 1.0
                except Exception:
                    print("No parse available - skipping")
            # freeze the defaultdict into a plain dict before pickling
            features[i] = {x: v for x, v in features[i].items()}
            print("{0}".format(
                sorted(features[i].items(),
                       key=operator.itemgetter(1),
                       reverse=True)),
                  file=logF)
            print("--- {0} seconds for {1} sentences ---".format(
                time.time() - start_time, len(doc)))

        pickle.dump(features, synFile)
        pickle.dump(scores, scoresFile)


#     t_bllip = Timer(lambda: rrp.parse(sentence))
#     print ("bllip", t_bllip.timeit(number=5))

    pass
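
For reference, a minimal hedged sketch of the bllipparser calls the function above depends on; the model name, path, and sentence are illustrative only:

# Minimal sketch (illustrative values) of the N-best parsing API used above.
from bllipparser import RerankingParser
from bllipparser.ModelFetcher import download_and_install_model

model_dir = download_and_install_model('WSJ', '/tmp/models')
rrp = RerankingParser.from_unified_model_dir(model_dir)
rrp.set_parser_options(nbest=5)

parses = rrp.parse("The quick brown fox jumps over the lazy dog .", rerank=False)
best = parses[0]
print(best.parser_score)              # log probability from the Charniak parser
print(len(best.ptb_parse.tokens()))   # sentence length in tokens
print(best.ptb_parse)                 # Penn Treebank-style tree
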
Example #2
File: depparser.py Project: chao-su/camr
    def parse(self, sent_filename):
        """
        use Charniak parser to parse sentences then convert results to Stanford Dependency
        """
        from bllipparser.ModelFetcher import download_and_install_model
        from bllipparser import RerankingParser
        #path_to_model = './bllip-parser/models/WSJ+Gigaword'
        #if not os.path.exists(path_to_model):
        model_type = 'WSJ+Gigaword'
        path_to_model = download_and_install_model(model_type,
                                                   './bllip-parser/models')
        print "Loading Charniak parser model: %s ..." % (model_type)
        rrp = RerankingParser.from_unified_model_dir(path_to_model)
        print "Begin Charniak parsing ..."
        parsed_filename = sent_filename + '.charniak.parse'
        parsed_trees = ''
        lineno = 0
        with open(sent_filename, 'r') as f, open(parsed_filename, 'w') as of:
            for l in f:
                lineno += 1
                print >> logs, 'lineno %s, %s' % (lineno, l)
                parsed_trees = rrp.simple_parse(l.strip().split())
                parsed_trees += '\n'
                of.write(parsed_trees)

        # convert parse tree to dependency tree
        print "Convert Charniak parse tree to Stanford Dependency tree ..."
        subprocess.call('./scripts/stdconvert.sh ' + parsed_filename,
                        shell=True)
Example #3
    def parse(self,sent_filename):
        """
        use Charniak parser to parse sentences then convert results to Stanford Dependency
        """
        from bllipparser.ModelFetcher import download_and_install_model
        from bllipparser import RerankingParser
        #path_to_model = './bllip-parser/models/WSJ+Gigaword'
        #if not os.path.exists(path_to_model):
        model_type = 'WSJ+Gigaword'
        path_to_model = download_and_install_model(model_type,'./bllip-parser/models')
        print "Loading Charniak parser model: %s ..." % (model_type)
        rrp = RerankingParser.from_unified_model_dir(path_to_model)
        print "Begin Charniak parsing ..."
        parsed_filename = sent_filename+'.charniak.parse'
        parsed_trees = ''
        with open(sent_filename,'r') as f:
            for l in f:
                parsed_trees += rrp.simple_parse(l.strip().split())
                parsed_trees += '\n'

        with open(parsed_filename,'w') as of:
            of.write(parsed_trees)
                

        # convert parse tree to dependency tree
        print "Convert Charniak parse tree to Stanford Dependency tree ..."
        subprocess.call('./scripts/stdconvert.sh '+parsed_filename,shell=True)
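
A hedged variant of the method above: example #3 buffers every parse in memory before writing, while streaming each parse to disk as it is produced keeps memory flat for large input files. The helper name below is hypothetical:

# Hypothetical streaming variant: write each Charniak parse as it is produced
# instead of accumulating the whole string first. Assumes `rrp` is loaded as above.
def parse_streaming(rrp, sent_filename):
    parsed_filename = sent_filename + '.charniak.parse'
    with open(sent_filename, 'r') as f, open(parsed_filename, 'w') as of:
        for line in f:
            tokens = line.strip().split()
            if not tokens:
                continue
            of.write(rrp.simple_parse(tokens) + '\n')
    return parsed_filename
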
Example #4
 def test_parse(self):
     path = bllip_wrapper.init_model()
     rrp = RerankingParser.from_unified_model_dir(path)
     tree = bllip_wrapper.parse(rrp, 'hello world!')
     self.assertIsNotNone(tree)
     print tree.ptb_parse
     self.assertEqual(str(tree.ptb_parse), '(S1 (S (NP (NN hello) (NN world) (NN !))))')
Example #5
 def __init__(self, papers, presentations):
     self.papers = papers
     self.presentations = presentations
     self.train_features, self.vectorizer = self.createVectorizer(
         papers, presentations)
     model_dir = find('models/bllip_wsj_no_aux').path
     self.parser = RerankingParser.from_unified_model_dir(model_dir)
Example #6
File: depparser.py Project: didzis/CAMR
 def __init__(self):
     if CharniakParser.parser is None:
         from bllipparser.ModelFetcher import download_and_install_model
         from bllipparser import RerankingParser
         model_type = 'WSJ+Gigaword'
         path_to_model = download_and_install_model(model_type,'./bllip-parser/models')
         print "Loading Charniak parser model: %s ..." % (model_type)
         CharniakParser.parser = RerankingParser.from_unified_model_dir(path_to_model)
Example #7
    def __init__(self, model_dir=None):
        if model_dir is None:
            logging.debug("downloading GENIA+PubMed model if necessary ...")
            model_dir = ModelFetcher.download_and_install_model(
                'GENIA+PubMed', os.path.join(tempfile.gettempdir(), 'models'))
        self.model_dir = os.path.expanduser(model_dir)

        logging.debug('loading model %s ...', self.model_dir)
        self.rrp = RerankingParser.from_unified_model_dir(self.model_dir)
Example #8
File: parse.py Project: alistairewj/NegBio
    def __init__(self, model_dir=None):
        if model_dir is None:
            logging.debug("downloading GENIA+PubMed model if necessary ...")
            model_dir = ModelFetcher.download_and_install_model(
                'GENIA+PubMed', os.path.join(tempfile.gettempdir(), 'models'))
        elif 'pathlib' in str(type(model_dir)):
            # avoid python 2/3 compatibility issues with os/pathlib2
            model_dir = str(model_dir)
        self.model_dir = os.path.expanduser(model_dir)

        logging.debug('loading model %s ...' % self.model_dir)
        self.rrp = RerankingParser.from_unified_model_dir(self.model_dir)
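
Once a wrapper like the one above is constructed, the loaded model is used through `self.rrp`; a hedged sketch of one possible call pattern follows (the helper is hypothetical, not part of NegBio):

# Hypothetical helper (not part of NegBio): return the best Penn Treebank tree
# for one sentence, or None if the parser produced no parse.
def parse_sentence(rrp, sentence):
    nbest = rrp.parse(sentence)
    if len(nbest) == 0:
        return None
    return nbest[0].ptb_parse

# Illustrative usage:
# tree = parse_sentence(self.rrp, 'No pleural effusion is seen .')
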
Example #10
def main():
    #   model_dir = download_and_install_model('WSJ', '/tmp/models')
    model_dir = download_and_install_model('WSJ+Gigaword-v2', '/tmp/models')
    parser = RerankingParser.from_unified_model_dir(model_dir)
    goodArticles = []
    badArticles = []
    articles = importArticles('trainingSet.dat')
    labels = getFakeGood('trainingSetLabels.dat')
    fg = open('goodArticlesBllip.txt', 'w')
    fb = open('badArticlesBllip.txt', 'w')
    i = 0
    for label in labels:
        if label == 1:
            goodArticles.append(articles[i])
            articleScores = []
            for sentence in articles[i]:
                logging.debug("Looking into good sentence: %s" % sentence)
                sentenceParses = parser.parse(sentence, 1)
                sentenceBestScore = sentenceParses[0].parser_score
                logging.debug("Score for good sentence: %s" %
                              sentenceBestScore)
                articleScores.append(sentenceBestScore)
            sum = 0
            for a in articleScores:
                a = float(a)
                sum = sum + a
            averageScore = sum / len(articleScores)
            fg.write("%s, %s, %f\n" %
                     (articles[i], articleScores, averageScore))
        if label == 0:
            badArticles.append(articles[i])
            articleScores = []
            for sentence in articles[i]:
                logging.debug("Looking into bad sentence: %s" % sentence)
                sentenceParses = parser.parse(sentence, 1)
                sentenceBestScore = sentenceParses[0].parser_score
                logging.debug("Score for bad sentence: %s" % sentenceBestScore)
                articleScores.append(sentenceBestScore)
            sum = 0
            for a in articleScores:
                a = float(a)
                sum = sum + a
            averageScore = sum / len(articleScores)
            fb.write("%s, %s, %f\n" %
                     (articles[i], articleScores, averageScore))
        i = i + 1
    fg.close()
    fb.close()
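
The two label branches above repeat the same scoring loop; a hedged refactoring sketch follows (the helper name is hypothetical, and the parse call is kept identical to the one in main()):

# Hypothetical refactoring of the duplicated loop in main(): score one article
# and return its per-sentence first-best parser scores plus their average.
def score_article(parser, article):
    scores = []
    for sentence in article:
        sentenceParses = parser.parse(sentence, 1)   # same call as in main()
        scores.append(float(sentenceParses[0].parser_score))
    average = sum(scores) / len(scores) if scores else 0.0
    return scores, average
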
Example #11
def tokenize(sentence):
    rrp = RerankingParser.from_unified_model_dir(
        '/Users/pranavipotharaju/.local/share/bllipparser')

    # strip a trailing sentence-end marker, if present
    # (note: rstrip("</s>") would also strip any trailing '<', '/', 's', '>' chars)
    if sentence.endswith("</s>"):
        sentence = sentence[:-len("</s>")]

    # if len(sentence) >= 399:
    #     words = nltk.word_tokenize(sentence)
    #     pos_tags = nltk.pos_tag(words)
    # else:
    #     pos_tags = rrp.tag(sentence)
    try:
        pos_tags = rrp.tag(sentence)
    except Exception as e:
        print 'bllipparser tagging failed, falling back to NLTK tagger'
        words = nltk.word_tokenize(sentence)
        pos_tags = nltk.pos_tag(words)
        print str(e)
    words, tags = zip(*pos_tags)
    tags = ' '.join(tags)
    return tags + '\n'
Example #12
def calculate_score(article):
    sc = 0
    lc = 0
    rrp = RerankingParser.from_unified_model_dir(
        '/Users/anushreekumar/.local/share/bllipparser')
    #    print "~~~~~~~ Article ~~~~~~~~~~~"
    for sent in article.allSentences:
        #    print sent.string
        try:
            best_list = rrp.parse(sent.string)
            score = best_list[0].parser_score
        except Exception as e:
            score = -1000
            print str(e)
        sc += score * sent.length
        lc += sent.length
    #    sent_score.append(sc)
    #    print sent.length,score/float(sent.length),article.label

#    print "Grammaticality score of ",article.label," article : ",sc/float(lc)
    return sc / float(lc)
Example #13
File: parser.py Project: leebird/legonlp
 def __init__(self, biomodel):
     self.parser = RerankingParser.from_unified_model_dir(biomodel.encode('utf-8'))
Example #14
from nltk.data import find
from bllipparser import RerankingParser
import freq
import sys
import re
################## Forming a parse tree ##################
def textToWords(text):
	return re.findall(r'\w+', text.lower())
sentence=open(sys.argv[1]).read()
model_dir = find('models/bllip_wsj_no_aux').path
parser = RerankingParser.from_unified_model_dir(model_dir)
#{'language': 'En', 'case_insensitive': False, 'nbest': 5, 'small_corpus': True, 'overparsing': 21, 'debug': 0, 'smooth_pos': 0}
parser.set_parser_options(case_insensitive=True)
l = parser.parse(sentence)
Trees=[1,2]
Trees[0]=l.get_reranker_best().ptb_parse
Trees[1]=l.get_parser_best().ptb_parse
S1=sentence
S1=S1.split()
sentence=textToWords(sentence)
################# finding all proper nouns in a sentence #################
def sortSecond(val): 
    return val[1]
def get_phrase(index,word):
	ans=word
	if(index!=0):
		ans=sentence[index-1]+" "+ans
	if(index!=len(sentence)-1):
		ans=ans+" "+sentence[index+1]
	return ans
def get_phrase1(index,word):
Example #15
def main(sentence):

    model_dir = find('models/bllip_wsj_no_aux').path
    parser = RerankingParser.from_unified_model_dir(model_dir)
    #{'language': 'En', 'case_insensitive': False, 'nbest': 5, 'small_corpus': True, 'overparsing': 21, 'debug': 0, 'smooth_pos': 0}
    parser.set_parser_options(case_insensitive=True)
    l = parser.parse(sentence)
    Trees = [1, 2]
    Trees[0] = l.get_reranker_best().ptb_parse
    Trees[1] = l.get_parser_best().ptb_parse

    synlist = []
    try:
        for x in range(2):
            synlist += find_syn(Trees[x], 1)[1]
    except Exception:
        pass
    synlist = rem_dupl(synlist)
    synlist = list(map(lambda x: x[1], synlist))

    #print(synlist)

    #done to split puncts separately
    for i in puncts:
        sentence = sentence.replace(i, ' ' + i + ' ')
    # sentence = sentence.replace('\n', ' ')

    WORDS = sentence.split()
    # WORDS = list(map(lambda x: x.lower(), WORDS))

    #now WORDS = list of puncts and lower-cased words in arg text

    #print(WORDS)

    Dict = {}

    it = 1
    for w in synlist:

        while (WORDS[it - 1] != w):
            Dict[WORDS[it - 1]] = []
            it = it + 1

        if w in puncts:
            continue

        if w.lower() in iitb_lingo:
            print([it, w, [iitb_lingo[w.lower()]]])

        else:

            synonyms = []

            q = "https://api.datamuse.com/words?ml=" + w

            #building trigram such that adjacent words shouldn't be iitb lingo or punctuation
            if it > 1 and not (WORDS[it - 2] in iitb_lingo) and not (
                    WORDS[it - 2] in puncts):
                q = q + '&lc=' + WORDS[it - 2]
            if it < len(WORDS) and not (WORDS[it] in iitb_lingo) and not (
                    WORDS[it] in puncts):
                q = q + '&rc=' + WORDS[it]

            response = requests.get(q)
            l = response.json()

            for i in l:
                synonyms.append(i["word"])

            #phrase finder
            # freq = []

            # for i in synonyms:

            # 	phrase = i
            # 	if w > 0 and not(WORDS[w-1] in iitb_lingo) and not(WORDS[w-1] in puncts):
            # 		phrase = WORDS[w-1] + ' ' + phrase
            # 	if w < len(WORDS)-1 and not(WORDS[w+1] in iitb_lingo) and not(WORDS[w+1] in puncts):
            # 		phrase = phrase + ' ' + WORDS[w+1]

            # 	encoded_query = urllib.parse.quote(phrase)
            # 	params = {'corpus': 'eng-gb', 'query': encoded_query}
            # 	params = '&'.join('{}={}'.format(name, value) for name, value in params.items())

            # 	response = requests.get('https://api.phrasefinder.io/search?' + params)
            # 	assert response.status_code == 200

            # 	if len(response.json()["phrases"]) > 0:
            # 		freq.append(response.json()["phrases"][0]["mc"])
            # 	else:
            # 		freq.append(0)

            # zipped = list(zip(synonyms, freq))
            # zipped = sorted(zipped, key = lambda x: x[1], reverse = True)
            # res = []
            # for i in range(min(3,len(zipped))):
            # 	res.append(zipped[i][0])
            # print(res)

            #top 3 synonyms (only the first three Datamuse results are kept)
            Dict[WORDS[it - 1]] = synonyms[:3]
            print([it, w, synonyms[:3]])
            it = it + 1

    return Dict
Example #16
def train_svm(kernel_type):
    trainDF = fex.read_data('/home/baseline_AC/train_AC_combined_models.csv')
    testDF = fex.read_data(
        '/home/baseline_AC/test_AC_combined_models_duplicate_included.csv')
    model_dir = find('models/bllip_wsj_no_aux').path
    parser = RerankingParser.from_unified_model_dir(model_dir)

    #ctx = mx.gpu(0)
    #bert = BertEmbedding(ctx=ctx)

    #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    #model = BertModel.from_pretrained('bert-base-uncased')

    #tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    #model = RobertaModel.from_pretrained('roberta-base')

    #tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    #model = XLNetModel.from_pretrained('xlnet-base-cased')

    model_path = '/home/AC_models_Argument_corpus/roberta/'
    model = ClassificationModel(
        'roberta',
        model_path,
        num_labels=4,
        args={"config": {
            "output_hidden_states": True
        }})

    #trainDF_x = fex.extract_features(trainDF,parser)
    #trainDF_x = fex.extract_features(trainDF, parser, tokenizer, model)
    trainDF_x = fex.extract_features(trainDF, parser, model)
    feature_train_x = fex.make_feature_vector(trainDF_x)

    #testDF_x = fex.extract_features(testDF,parser)
    #testDF_x = fex.extract_features(testDF, parser, tokenizer, model)
    testDF_x = fex.extract_features(testDF, parser, model)
    feature_test_x = fex.make_feature_vector(testDF_x)

    # label encode the target variable
    train_y = []
    test_y = []

    for index, row in trainDF.iterrows():
        if row['label'] == 'Claim':
            train_y.append(1)
        elif row['label'] == 'Premise':
            train_y.append(0)
        elif row['label'] == 'MajorClaim':
            train_y.append(3)
        else:
            train_y.append(2)

    for index, row in testDF.iterrows():
        if row['label'] == 'Claim':
            test_y.append(1)
        elif row['label'] == 'Premise':
            test_y.append(0)
        elif row['label'] == 'MajorClaim':
            test_y.append(3)
        else:
            test_y.append(2)

    #train_y = encoder.fit_transform(train_y)

    svmclassifier = svm.SVC(kernel=kernel_type)
    svmclassifier.fit(feature_train_x, train_y)
    filename = 'finalized_model_linear.sav'
    #joblib.dump(svmclassifier, filename)

    y_pred = svmclassifier.predict(feature_test_x)
    print("argument corpus results for test:")
    print(confusion_matrix(test_y, y_pred))
    print(classification_report(test_y, y_pred))

    print("two law set results for test:")
    testlawDF = fex.read_data(
        '/home/baseline_AC/test_judgement_AC_combined_models_duplicate_included.csv'
    )
    #testlawDF_x = fex.extract_features(testlawDF,parser)
    #testlawDF_x = fex.extract_features(testlawDF, parser, tokenizer, model)
    testlawDF_x = fex.extract_features(testlawDF, parser, model)
    feature_test_law = fex.make_feature_vector(testlawDF_x)

    test_y_law = []
    for index, row in testlawDF.iterrows():
        if row['label'] == 'Claim':
            test_y_law.append(1)
        elif row['label'] == 'Premise':
            test_y_law.append(0)
        elif row['label'] == 'MajorClaim':
            test_y_law.append(3)
        else:
            test_y_law.append(2)

    y_pred_2 = svmclassifier.predict(feature_test_law)
    print(confusion_matrix(test_y_law, y_pred_2))
    print(classification_report(test_y_law, y_pred_2))
    filename = 'finalized_model_svm_roberta_finetuned_embedding.sav'
    joblib.dump(svmclassifier, filename)
Example #17
    parser.add_argument("-v", "--verbose", action="store_true", help="print debug information")
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    if not os.path.isfile(args.input):
        sys.stderr.write('Cannot find input file: %s\n' % args.input)
        sys.exit(2)

    logging.info('Input file: %s' % args.input)
    logging.info('Output file: %s' % args.output)
    return args.input, args.output


if __name__ == "__main__":
    inputfilename, outputfilename = parse_argv()

    model_dir = init_model()
    logging.info('loading model %s ...' % model_dir)
    rrp = RerankingParser.from_unified_model_dir(model_dir)

    collection = parse(inputfilename)
    collection.clear_infons()
    collection.infons['tool'] = 'Bllip'
    collection.infons['process'] = 'parse'

    parse_bioc(rrp, collection)

    collection.tobiocfile(outputfilename)
Example #18
from bllipparser import RerankingParser

rrp = RerankingParser.from_unified_model_dir(
    '/home/kashefi/.local/share/bllipparser/WSJ-PTB3')
sentence = "In the 3rd level I would place my little brother in. because my little brother is a very greedy little bot he always wants something."

pcfg = rrp.simple_parse(sentence.split(' '))
# drop the leading "(S1 " and the matching closing ")" around the parse string
pcfg = pcfg[4:len(pcfg) - 1]
print pcfg
'''
pcfg = rrp.simple_parse(sentence)
pcfg = pcfg[4:len(pcfg)-1]
print(pcfg)
'''
Example #19
from bllipparser import RerankingParser as rrp
from nltk.parse.api import ParserI
from nltk.tree import Tree
from nltk.data import find

model_dir = find('models/bllip_wsj_no_aux').path
bllip = rrp.from_unified_model_dir(model_dir)

f = open("../Fragments_for_testing/text2", "r")
sentence = f.read()
all_parses = bllip.parse(sentence)

ptb = all_parses[0].ptb_parse
tree = Tree.fromstring(str(ptb))
tree.draw()
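
If the end goal is an nltk Tree, NLTK also ships its own wrapper around this parser, which avoids the string round-trip through Tree.fromstring; a hedged sketch using the same model path:

# Alternative sketch: NLTK's BllipParser wrapper yields nltk Tree objects directly.
from nltk.data import find
from nltk.parse.bllip import BllipParser

model_dir = find('models/bllip_wsj_no_aux').path
bllip_parser = BllipParser.from_unified_model_dir(model_dir)
tree = bllip_parser.parse_one('The quick brown fox jumps over the lazy dog .'.split())
if tree is not None:
    tree.draw()
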
Example #20
"""
Create the Semantic Representation
"""

sentenceList = [] # a list for the SemanticRepresentation objects

workPath = os.getcwd()
dependencyInputFile = workPath+'/senna/input.txt'
with open(dependencyInputFile) as f:
    dlines = f.readlines()
#Load model to parse PENN TreeBank
print 'Loading parsing model...'
# only for the first run (uncomment the following line):
# rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
# when it is not the first run:
rrp = RerankingParser.from_unified_model_dir('/Users/evania/.local/share/bllipparser/WSJ-PTB3')
# Load model to parse PENN TreeBank - is finished
#Now try to parse the text:
print 'Parsing the dependency for the sentence(s)...'
len_dlines = len(dlines)
count_dlines = 1

for l in dlines:
    if l != '\n': # if not an empty line
        theDependencyResult = getDependency(l)
        theID = 0
        semList = []
        for token in theDependencyResult:
            #print token
            stringToken = str(token)
            sem = SemanticRepresentation()
Example #21
import copy
import re
from bllipparser import RerankingParser
import itertools
import urllib
import pydot
import os
from bllipparser.ModelFetcher import download_and_install_model

if not os.path.exists(  os.path.join( os.getcwd(), "bllip", "models", "WSJ")  ):
	print "Downloading the BLLIP model ... "
	download_and_install_model('WSJ', os.path.join( os.getcwd(), "bllip", "models") )
	print "Done Downloading."

rrp = RerankingParser.from_unified_model_dir('bllip/models/WSJ')


def get_svg(data):
	graphs = pydot.graph_from_dot_data( data )
	svg_string = graphs[0].create_svg()
	return svg_string

def get_fsm_code(list_of_sentences):
	global rrp
	list_of_sentences = map( lambda sentence: (str(sentence)).lower(), list_of_sentences)
	list_of_sentences = map( lambda sentence:  re.sub(r'\..*', "", sentence ), list_of_sentences)
	list_of_parsed_strings = map( lambda sentence: rrp.simple_parse(sentence) , list_of_sentences)
	list_of_codified_parse_strings = map( lambda parse_string: ParseForest.codify_parse_string(parse_string) , list_of_parsed_strings)
	list_of_parse_forests = map( lambda codified_parse_string: ParseForest(codified_parse_string),  list_of_codified_parse_strings)
	# list_of_parse_forests = map( lambda codified_parse_string: ParseForest(codified_parse_string),  list_of_parsed_strings)