def run(self):
    global rrp
    print "Reloading"
    rrp = RerankingParser()
    rrp.load_parser_model(os.path.join(os.path.dirname(__file__),
                                       '../../lib/bllip/DATA/EN'))
    rrp.load_reranker_model(
        os.path.join(os.path.dirname(__file__),
                     '../../lib/bllip/models/ec50spfinal/features.gz'),
        os.path.join(os.path.dirname(__file__),
                     '../../lib/bllip/models/ec50spfinal/cvlm-l1c10P1-weights.gz'))
    print "Done loading model"
def test_parse(self):
    path = bllip_wrapper.init_model()
    rrp = RerankingParser.from_unified_model_dir(path)
    tree = bllip_wrapper.parse(rrp, 'hello world!')
    self.assertIsNotNone(tree)
    print tree.ptb_parse
    self.assertEqual(str(tree.ptb_parse),
                     '(S1 (S (NP (NN hello) (NN world) (NN !))))')
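For context, a minimal sketch of what a helper like the test's bllip_wrapper.parse might do; the helper name and return convention are hypothetical, but rrp.parse and n-best indexing are standard bllipparser API:

# Hypothetical wrapper: parse one sentence and return the top-scoring
# item from the n-best list, or None if the parser produced nothing.
def parse(rrp, sentence):
    nbest = rrp.parse(sentence)
    return nbest[0] if len(nbest) > 0 else None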
def parse(self, sent_filename):
    """Use the Charniak parser to parse sentences, then convert the
    results to Stanford Dependencies."""
    from bllipparser.ModelFetcher import download_and_install_model
    from bllipparser import RerankingParser
    #path_to_model = './bllip-parser/models/WSJ+Gigaword'
    #if not os.path.exists(path_to_model):
    model_type = 'WSJ+Gigaword'
    path_to_model = download_and_install_model(model_type, './bllip-parser/models')
    print "Loading Charniak parser model: %s ..." % (model_type)
    rrp = RerankingParser.from_unified_model_dir(path_to_model)

    print "Begin Charniak parsing ..."
    parsed_filename = sent_filename + '.charniak.parse'
    parsed_trees = ''
    with open(sent_filename, 'r') as f:
        for l in f:
            parsed_trees += rrp.simple_parse(l.strip().split())
            parsed_trees += '\n'
    with open(parsed_filename, 'w') as of:
        of.write(parsed_trees)

    # convert parse trees to dependency trees
    print "Convert Charniak parse tree to Stanford Dependency tree ..."
    subprocess.call('./scripts/stdconvert.sh ' + parsed_filename, shell=True)
def load_biomodel(self):
    rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    for did in self.documents:
        for sentence in self.documents[did].sentences:
            sentence_text = [t.text for t in sentence.tokens]
            #echocall = Popen(["echo", sentence_text], stdout=PIPE, stderr=PIPE)
            #nc_params = ["nc", "localhost", "4449"]
            #echocall.wait()
            #call = check_output(nc_params, shell=True, stdin=echocall.stdout)
            #res = call.communicate()
            #res = netcat("localhost", 4449, sentence_text)
            #print res.strip()
            #print
            res = rrp.parse(sentence_text)
            if len(res) > 0:
                print res[0].ptb_parse
                print sentence.parsetree
                print
                #print
                sentence.bio_parse = str(res[0].ptb_parse)
            else:
                print sentence_text
                print "no parse"
                sentence.bio_parse = sentence.parsetree
                print
def __init__(self):
    if CharniakParser.parser is None:
        from bllipparser.ModelFetcher import download_and_install_model
        from bllipparser import RerankingParser
        model_type = 'WSJ+Gigaword'
        path_to_model = download_and_install_model(model_type, './bllip-parser/models')
        print "Loading Charniak parser model: %s ..." % (model_type)
        CharniakParser.parser = RerankingParser.from_unified_model_dir(path_to_model)
def __init__(self, nbest=10, overparsing=10, only_parse=False, stop_words=None):
    self.parser = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=False)
    """create a Reranking Parser from BllipParser"""
    self.parser.set_parser_options(nbest=nbest, overparsing=overparsing)
    """set parser options"""
    self.only_parse = only_parse
    """whether features should be used from the BllipParser"""
    self.stemmer = LancasterStemmer()
    """an instance of LancasterStemmer from NLTK"""
    self.stop_words = stop_words
    if self.stop_words is None:
        self.stop_words = stopwords.words('english')
def features(docList):
    import time
    # download model (only needs to be done once)
    model_dir = download_and_install_model('WSJ', '/tmp/models')
    # Loading the model is slow, but only needs to be done once
    rrp = RerankingParser.from_unified_model_dir(model_dir)
    rrp.set_parser_options(nbest=5)
    features = []
    scores = []
    # pickle files must be opened in binary mode
    with open("output_log.txt", "w") as logF, \
         open("syn_feats.pkl", "wb") as synFile, \
         open("syn_scores.pkl", "wb") as scoresFile:
        for i, doc in enumerate(docList):
            start_time = time.time()
            features.append(defaultdict(float))
            scores.append(defaultdict(list))
            for sentence in doc:
                parses = rrp.parse(sentence, rerank=False)
                #print(len(parses))
                #print(sentence, file=logF)
                try:
                    parse_score = parses[0].parser_score
                    rerank_score = parses[0].reranker_score
                    scores[i]['parse'].append(parse_score)
                    scores[i]['rerank'].append(rerank_score)
                    scores[i]['sent_length'].append(len(parses[0].ptb_parse.tokens()))
                    best_parse = parses[0].ptb_parse
                    # print(best_parse, file=logF)
                    for t in best_parse.all_subtrees():
                        levels = buildSubtrees(t)
                        for l in levels:
                            features[i][l] += 1.0
                except:
                    print("No parse available - skipping")
            features[i] = {x: v for x, v in features[i].items()}
            print("{0}".format(sorted(features[i].items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)), file=logF)
            print("--- {0} seconds for {1} sentences ---"
                  .format(time.time() - start_time, len(doc)))
        pickle.dump(features, synFile)
        pickle.dump(scores, scoresFile)
    # t_bllip = Timer(lambda: rrp.parse(sentence))
    # print("bllip", t_bllip.timeit(number=5))
def __init__(
    self,
    parser_model=None,
    reranker_features=None,
    reranker_weights=None,
    parser_options=None,
    reranker_options=None,
):
    """
    Load a BLLIP Parser model from scratch. You'll typically want to
    use the ``from_unified_model_dir()`` class method to construct
    this object.

    :param parser_model: Path to parser model directory
    :type parser_model: str

    :param reranker_features: Path to the reranker model's features file
    :type reranker_features: str

    :param reranker_weights: Path to the reranker model's weights file
    :type reranker_weights: str

    :param parser_options: optional dictionary of parser options, see
        ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
        for more information.
    :type parser_options: dict(str)

    :param reranker_options: optional dictionary of reranker options, see
        ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
        for more information.
    :type reranker_options: dict(str)
    """
    _ensure_bllip_import_or_error()

    parser_options = parser_options or {}
    reranker_options = reranker_options or {}

    self.rrp = RerankingParser()
    self.rrp.load_parser_model(parser_model, **parser_options)
    if reranker_features and reranker_weights:
        self.rrp.load_reranker_model(
            features_filename=reranker_features,
            weights_filename=reranker_weights,
            **reranker_options
        )
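A minimal sketch of driving this constructor directly, for the rare case where the reranker files live outside a unified model directory; the paths are placeholders and BllipParser is the NLTK wrapper class shown in the next example:

# Hypothetical paths; from_unified_model_dir() is usually easier.
parser = BllipParser(
    parser_model='/path/to/model/parser',
    reranker_features='/path/to/model/reranker/features.gz',
    reranker_weights='/path/to/model/reranker/weights.gz',
)
for tree in parser.parse('I saw the man with the telescope'.split()):
    print(tree)  # trees are yielded most-likely first
    break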
class BllipParser(ParserI):
    """
    Interface for parsing with BLLIP Parser. BllipParser objects can be
    constructed with the ``BllipParser.from_unified_model_dir`` class
    method or manually using the ``BllipParser`` constructor.
    """

    def __init__(
        self,
        parser_model=None,
        reranker_features=None,
        reranker_weights=None,
        parser_options=None,
        reranker_options=None,
    ):
        """
        Load a BLLIP Parser model from scratch. You'll typically want to
        use the ``from_unified_model_dir()`` class method to construct
        this object.

        :param parser_model: Path to parser model directory
        :type parser_model: str

        :param reranker_features: Path to the reranker model's features file
        :type reranker_features: str

        :param reranker_weights: Path to the reranker model's weights file
        :type reranker_weights: str

        :param parser_options: optional dictionary of parser options, see
            ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
            for more information.
        :type parser_options: dict(str)

        :param reranker_options: optional dictionary of reranker options, see
            ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
            for more information.
        :type reranker_options: dict(str)
        """
        _ensure_bllip_import_or_error()

        parser_options = parser_options or {}
        reranker_options = reranker_options or {}

        self.rrp = RerankingParser()
        self.rrp.load_parser_model(parser_model, **parser_options)
        if reranker_features and reranker_weights:
            self.rrp.load_reranker_model(
                features_filename=reranker_features,
                weights_filename=reranker_weights,
                **reranker_options
            )

    def parse(self, sentence):
        """
        Use BLLIP Parser to parse a sentence. Takes a sentence as a list
        of words; it will be automatically tagged with this BLLIP Parser
        instance's tagger.

        :return: An iterator that generates parse trees for the sentence
            from most likely to least likely.

        :param sentence: The sentence to be parsed
        :type sentence: list(str)
        :rtype: iter(Tree)
        """
        _ensure_ascii(sentence)
        nbest_list = self.rrp.parse(sentence)
        for scored_parse in nbest_list:
            yield _scored_parse_to_nltk_tree(scored_parse)

    def tagged_parse(self, word_and_tag_pairs):
        """
        Use BLLIP to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized
        and tagged. BLLIP will attempt to use the tags provided but may
        use others if it can't come up with a complete parse subject to
        those constraints. You may also specify a tag as ``None`` to
        leave a token's tag unconstrained.

        :return: An iterator that generates parse trees for the sentence
            from most likely to least likely.

        :param sentence: Input sentence to parse as (word, tag) pairs
        :type sentence: list(tuple(str, str))
        :rtype: iter(Tree)
        """
        words = []
        tag_map = {}
        for i, (word, tag) in enumerate(word_and_tag_pairs):
            words.append(word)
            if tag is not None:
                tag_map[i] = tag
        _ensure_ascii(words)
        nbest_list = self.rrp.parse_tagged(words, tag_map)
        for scored_parse in nbest_list:
            yield _scored_parse_to_nltk_tree(scored_parse)

    @classmethod
    def from_unified_model_dir(
        cls, model_dir, parser_options=None, reranker_options=None
    ):
        """
        Create a ``BllipParser`` object from a unified parsing model
        directory. Unified parsing model directories are a standardized
        way of storing BLLIP parser and reranker models together on disk.
        See ``bllipparser.RerankingParser.get_unified_model_parameters()``
        for more information about unified model directories.

        :return: A ``BllipParser`` object using the parser and reranker
            models in the model directory.

        :param model_dir: Path to the unified model directory.
        :type model_dir: str
        :param parser_options: optional dictionary of parser options, see
            ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
            for more information.
        :type parser_options: dict(str)
        :param reranker_options: optional dictionary of reranker options, see
            ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
            for more information.
        :type reranker_options: dict(str)
        :rtype: BllipParser
        """
        (
            parser_model_dir,
            reranker_features_filename,
            reranker_weights_filename,
        ) = get_unified_model_parameters(model_dir)
        return cls(
            parser_model_dir,
            reranker_features_filename,
            reranker_weights_filename,
            parser_options,
            reranker_options,
        )
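A usage sketch for the wrapper above, assuming a unified model has been installed at the (hypothetical) path shown; note that tagged_parse accepts None to leave a token's tag unconstrained:

bparser = BllipParser.from_unified_model_dir('/path/to/WSJ-PTB3')  # hypothetical path
# unconstrained parse
for tree in bparser.parse('British left waffles on Falklands .'.split()):
    print(tree)
    break
# constrain "left" to be a past-tense verb; None leaves the other tags open
pairs = [('British', None), ('left', 'VBD'), ('waffles', None),
         ('on', None), ('Falklands', None), ('.', None)]
print(next(bparser.tagged_parse(pairs)))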
import bllipparser
from bllipparser import RerankingParser
from nltk import Tree
from practnlptools.tools import Annotator

score = 0
annotator = Annotator()
rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)

query = "Describe steps taken and worldwide reaction prior to introduction of the Euro on January 1, 1999."
candidate = "Europe's new currency, the euro, will rival the U.S. dollar as an international currency over the long term, Der Speigel magazine reported Sunday."

qListOfDict = annotator.getAnnotations(query)['srl']
cListOfDict = annotator.getAnnotations(candidate)['srl']

qParsed = ['(S1 ']
cParsed = ['(S1 ']

for list in qListOfDict:
parser.add_argument("-v", "--verbose", action="store_true", help="print debug information") args = parser.parse_args() if args.verbose: logging.basicConfig(level=logging.DEBUG) if not os.path.isfile(args.input): sys.stderr.write('Cannot find input file: %s\n' % args.input) sys.exit(2) logging.info('Input file: %s' % args.input) logging.info('Output file: %s' % args.output) return args.input, args.output if __name__ == "__main__": inputfilename, outputfilename = parse_argv() model_dir = init_model() logging.info('loading model %s ...' % model_dir) rrp = RerankingParser.from_unified_model_dir(model_dir) collection = parse(inputfilename) collection.clear_infons() collection.infons['tool'] = 'Bllip' collection.infons['process'] = 'parse' parse_bioc(rrp, collection) collection.tobiocfile(outputfilename)
    except:
        strJsonObj = '{}'
        dictTotal = {}
        traceback.print_exc()
    return dictTotal


fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopMixVersion = fopRoot + 'step4_mixCode/'
fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
fpPseudocodeBeforePOS = fopRoot + 'pseudocode_before_pos.txt'
fpCachedFilePath = fopRoot + 'cachedFilePaths.txt'
createDirIfNotExist(fopMixVersion)
model_dir = find('models/bllip_wsj_no_aux').path
parser = RerankingParser.from_unified_model_dir(model_dir)
print('before traverse')
lstFpJsonFiles = []
if not os.path.isfile(fpCachedFilePath):
    lstFop1 = sorted(glob.glob(fopMixVersion + '*/'))
    for fop1 in lstFop1:
        lstFop2 = sorted(glob.glob(fop1 + '*/'))
        for fop2 in lstFop2:
            lstFop3 = sorted(glob.glob(fop2 + 'v_*_label.txt'))
            # print(fp3)
            for fp3 in lstFop3:
                lstFpJsonFiles.append(fp3)
        print('end {}'.format(fop1))
    # sorted(glob.glob(fopMixVersion+'**/**/a_json.txt'))
print('after {} '.format(len(lstFpJsonFiles)))
    print 'e.g: python parse.py paraphrases.csv 0 bllip'
    sys.exit(0)

if sys.argv[3] == 'bllip':  # bllip
    parser = '/pro/dpg/dc65/models/WSJ+QB'
    print 'basic:', parser
elif sys.argv[3] == 'self':  # self-trained
    parser = '/pro/dpg/dc65/models/WSJ+Gigaword'
    print 'self-trained:', parser
else:
    print 'parser options: bllip, self'
    sys.exit(0)

rrp = RerankingParser()
rrp.load_parser_model(parser + '/parser')
print 'reranker: /pro/dpg/dc65/models/WSJ/'
rrp.load_reranker_model('/pro/dpg/dc65/models/WSJ/reranker/features.gz',
                        '/pro/dpg/dc65/models/WSJ/reranker/weights.gz')

mode = int(sys.argv[2])  # 0: gold, 1: 1best, 2: nbest
f = open('tmp/trees', 'w')
if mode == 2:
    g = open('tmp/scores', 'w')
with open(sys.argv[1], 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    iter = 0
    for row in reader:
        iter += 1
        if iter % 3 == 1:
            if mode == 0:
from bllipparser import RerankingParser, tokenize

print "start loading model..."
rrp = RerankingParser.from_unified_model_dir('/home/yukang/selftrained')
print "finish loading model"

inputfile = "wsjtest"
outputfile = "wsjtest.reparse"
count = 0
data = open(inputfile)
output = open(outputfile, 'w')
sentence = []
for line in data:
    if len(line.split()) == 0:
        if len(sentence) == 0:
            continue
        count += 1
        print "start solving", count
        # last line of the file must be a blank line to terminate
        # the last sentence.
        l = [word[0].replace("(", "-LRB-").replace(")", "-RRB-")
             for word in sentence]
        ans = rrp.parse(l)
        output.write(str(ans[0].ptb_parse) + "\n")
        # if count > 1:
        #     break
        sentence = []
    else:
        parts = line.split()
        sentence.append(parts)
output.close()
import copy
import re
from bllipparser import RerankingParser
import itertools
import urllib
import pydot
import os
from bllipparser.ModelFetcher import download_and_install_model

if not os.path.exists(os.path.join(os.getcwd(), "bllip", "models", "WSJ")):
    print "Downloading the BLLIP model ... "
    download_and_install_model('WSJ', os.path.join(os.getcwd(), "bllip", "models"))
    print "Done Downloading."

rrp = RerankingParser.from_unified_model_dir('bllip/models/WSJ')


def get_svg(data):
    graphs = pydot.graph_from_dot_data(data)
    svg_string = graphs[0].create_svg()
    return svg_string


def get_fsm_code(list_of_sentences):
    global rrp
    list_of_sentences = map(lambda sentence: (str(sentence)).lower(), list_of_sentences)
    list_of_sentences = map(lambda sentence: re.sub(r'\..*', "", sentence), list_of_sentences)
    list_of_parsed_strings = map(lambda sentence: rrp.simple_parse(sentence), list_of_sentences)
    list_of_codified_parse_strings = map(
        lambda parse_string: ParseForest.codify_parse_string(parse_string),
        list_of_parsed_strings)
    list_of_parse_forests = map(
        lambda codified_parse_string: ParseForest(codified_parse_string),
        list_of_codified_parse_strings)
    # list_of_parse_forests = map(lambda codified_parse_string:
    #     ParseForest(codified_parse_string), list_of_parsed_strings)
def parse_constrained(rrp, sentence, constraints):
    def consistent(tree, constraints):
        mapping = {}
        for subtree in tree.all_subtrees():
            mapping[subtree.span()] = subtree.label
        for span, allowed_labels in constraints.items():
            if mapping.get(span) not in allowed_labels:
                return False
        return True

    nbest_list = rrp.parse(sentence)
    for item in nbest_list:
        if consistent(item.ptb_parse, constraints):
            return item.ptb_parse
    else:
        return None


if __name__ == "__main__":
    # this needs to be run from the root of the repository since it has
    # a relative path to the parsing model
    from bllipparser import RerankingParser
    rrp = RerankingParser()
    rrp.load_parser_model('first-stage/DATA/EN')

    # the constraint means: there must be a VP from [1,5)
    # (i.e., left ... Falklands)
    # this encourages the parser to pick "left" as the main verb
    constraints = {(1, 5): ['VP']}
    print parse_constrained(rrp, 'British left waffles on Falklands .'.split(),
                            constraints)

    # if we parse without constraints, we get that the main verb is "waffles"
    print parse_constrained(rrp, 'British left waffles on Falklands .'.split(), {})
from bllipparser import RerankingParser

RerankingParser.fetch_and_load('GENIA+PubMed')
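fetch_and_load downloads the model on first use and returns the loaded parser, so the one-off download above extends naturally into a working example; the sample sentence is arbitrary:

from bllipparser import RerankingParser

rrp = RerankingParser.fetch_and_load('GENIA+PubMed')
nbest = rrp.parse('The protein binds the receptor.')
print(len(nbest))          # number of candidate parses
print(nbest[0].ptb_parse)  # best tree after reranking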
def main(transcript):
    # results = {"0": "1.0", "1": "0.9747",
    #            "2": "0.968", "3": "0.8859", "4": "0.7071"}
    # print(json.dumps(results))
    results = {}
    sentences = sent_tokenize(transcript)

    ''' Declaration of constants and functions '''
    CONS_SATIRIC = 0
    CONS_RELIABLE = 1
    rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=False)
    foo = TripletExtraction()
    bar = SemanticSimilarityAnalysis()

    ''' 2 database tables for comparison of input '''
    c.execute('SELECT title FROM reliable_news')
    reliable_news = [tup[0] for tup in c.fetchall()]
    c.execute('SELECT title FROM satirical_news')
    satirical_news = [tup[0] for tup in c.fetchall()]

    t = len(sentences)
    correct_classifications = 0
    for i in range(t):
        max_similarity = 0
        classification = -1
        max_sentence = ""
        inp = sentences[i]

        ''' generates the tree and gets the SVO of the input sentence '''
        tree_inp = Tree(rrp.simple_parse(inp))
        svo_inp = foo.getSVO(tree_inp[0])

        ''' comparison for satirical and reliable news '''
        for title in satirical_news:
            for subj in svo_inp['subject']:
                if subj[2] == 0:
                    continue
                words = [x.lower() for x in sentence_tokenizer.tokenize(title)]
                if subj[0] in words or singularize(subj[0]) in words:
                    tree_data = Tree(rrp.simple_parse(title))
                    svo_data = foo.getSVO(tree_data[0])
                    similarity_score1 = bar.get_similarities(svo_inp, svo_data)

                    ''' object and subject swapped to provide more possible comparisons '''
                    svo_data['subject'], svo_data['object'] = svo_data['object'], svo_data['subject']
                    similarity_score2 = bar.get_similarities(svo_inp, svo_data)

                    similarity_score = similarity_score1 if similarity_score1 > similarity_score2 else similarity_score2
                    if similarity_score > max_similarity:
                        classification = 0
                        max_similarity = similarity_score
                        max_sentence = title
                    break
        for title in reliable_news:
            for sht in satiric_shits:
                title = title.replace(sht, "")
            for subj in svo_inp['subject']:
                if subj[2] == 0:
                    continue
                words = [x.lower() for x in sentence_tokenizer.tokenize(title)]
                if subj[0] in words or singularize(subj[0]) in words:
                    tree_data = Tree(rrp.simple_parse(title))
                    svo_data = foo.getSVO(tree_data[0])
                    similarity_score1 = bar.get_similarities(svo_inp, svo_data)

                    ''' object and subject swapped to provide more possible comparisons '''
                    svo_data['subject'], svo_data['object'] = svo_data['object'], svo_data['subject']
                    similarity_score2 = bar.get_similarities(svo_inp, svo_data)

                    similarity_score = similarity_score1 if similarity_score1 > similarity_score2 else similarity_score2
                    if similarity_score > max_similarity:
                        classification = 1
                        max_similarity = similarity_score
                        max_sentence = title
                    break
        if classification == CONS_RELIABLE:
            results[str(i)] = str(round(max_similarity, 4))
        elif classification == CONS_SATIRIC:
            results[str(i)] = str(round(-max_similarity, 4))
        else:
            results[str(i)] = "0"
    print(json.dumps(results))
def __init__(self, model="WSJ-PTB3"): super().__init__(RerankingParser.fetch_and_load(model, verbose=True))
from bllipparser import RerankingParser
from kasami import TreeScorer
from kasami.normalizers import bllip

# Load the WSJ-PTB3 model into bllip's RerankingParser
bllip_rrp = RerankingParser.fetch_and_load('WSJ-PTB3')
bllip_parse = lambda s: bllip.normalize_tree(bllip_rrp.parse(s)[0].ptb_parse)

tree = bllip_parse("I am a little teapot")
print(tree)
print(tree.format(depth=1))

for production in tree:
    print(str(production))

sentences = [
    "I am a little teapot",
    "Here is my handle",
    "Here is my spout",
    "When I get all steamed up I just shout tip me over and pour me out",
    "I am a very special pot",
    "It is true",
    "Here is an example of what I can do",
    "I can turn my handle into a spout",
    "Tip me over and pour me out"
]

teapot_grammar = TreeScorer.from_tree_bank(bllip_parse(s) for s in sentences)

teapot_grammar.score(bllip_parse("Here is a little teapot"))
teapot_grammar.score(bllip_parse("It is my handle"))
teapot_grammar.score(bllip_parse("I am a spout"))
teapot_grammar.score(bllip_parse("Your teapot is gay"))
teapot_grammar.score(bllip_parse("Your mom's teapot is asldasnldansldal"))
from bllipparser import RerankingParser

rrp = RerankingParser.from_unified_model_dir(
    '/home/kashefi/.local/share/bllipparser/WSJ-PTB3')
sentence = "In the 3rd level I would place my little brother in. because my little brother is a very greedy little bot he always wants something."
pcfg = rrp.simple_parse(sentence.split(' '))
pcfg = pcfg[4:len(pcfg) - 1]  # strip the outer "(S1 " ... ")" wrapper
print pcfg
'''
pcfg = rrp.simple_parse(sentence)
pcfg = pcfg[4:len(pcfg)-1]
print(pcfg)
'''
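The string slicing works because simple_parse always wraps its output in "(S1 ... )". A sketch of the same trick using bllipparser's Tree class, which parses the bracketing instead of counting characters; the model path is the same assumption as above:

from bllipparser import RerankingParser, Tree

rrp = RerankingParser.from_unified_model_dir(
    '/home/kashefi/.local/share/bllipparser/WSJ-PTB3')
tree = Tree(rrp.simple_parse('This is an example.'))
inner = tree.subtrees()[0]  # first child of the S1 root
print(inner)                # the parse without the S1 wrapper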
if __name__ == "__main__":
    # needs to be run from the root of the repository for the parser
    # model path below to work
    from bllipparser import RerankingParser, Tree

    rrp = RerankingParser()
    rrp.load_parser_model('first-stage/DATA/EN', heads_only=True)
    tree1 = Tree('''(S1 (SQ (VBZ Swears) (NP (PRP she)) (VP (VBD recognized)
        (NP (PRP$ his) (NN voice)) (, ,) (SBAR (IN that) (S (NP (NNP Tim))
        (VP (VBD fired)))) (, ,) ('' ') (S (S (NP (PRP It)) (VP (VBZ 's)
        (NP (PRP$ my) (NN money)))) (CC and) (S (NP (PRP I)) (VP (VBP want)
        (S (NP (PRP it)) (VP (POS ')))))) (. !)))''')

    head = tree1.head()
    print 'head word of sentence:', head.token
    print 'head tree of sentence:', head
    print

    # print all syntactic dependencies
    for governor, dependent in tree1.dependencies():
        print '%s -> %s' % (governor.token, dependent.token)
def __enter__(self):
    self.bllip = RerankingParser.fetch_and_load(self.model_name, verbose=True)
    return self
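A sketch of the surrounding context manager this __enter__ implies; the class name, the model_name attribute, and the __exit__ behavior are all assumptions, since only __enter__ appears in the source:

from bllipparser import RerankingParser

class BllipContext(object):  # hypothetical wrapper class
    def __init__(self, model_name='WSJ-PTB3'):
        self.model_name = model_name

    def __enter__(self):
        self.bllip = RerankingParser.fetch_and_load(self.model_name, verbose=True)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        return False  # propagate exceptions; the model needs no cleanup

# usage
with BllipContext('WSJ-PTB3') as ctx:
    print(ctx.bllip.simple_parse('Parsers load once per context.'))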
if __name__ == "__main__":
    # needs to be run from the root of the repository
    from bllipparser import RerankingParser, Tree

    rrp = RerankingParser()
    rrp.load_parser_model('first-stage/DATA/EN', terms_only=True)
    tree1 = Tree('''(S1 (INTJ (UH Oh) (JJ sure) (. !)))''')
    tree2 = Tree('''(S1 (FRAG (INTJ (UH Oh) (INTJ (JJ sure))) (. !)))''')
    print tree1.evaluate(tree2)
    print tree2.evaluate(tree1)
from multiprocessing import Pool
from bllipparser import RerankingParser
import string


def multi_phrase_parse(s, rrp):
    file_in = open('./reverb_out%s' % s)
    file_out = open('./phrase_out%s' % s, 'w')
    for line in file_in:
        sep_line = line.split('\t')
        sentence = sep_line[12]
        file_out.write(rrp.simple_parse(sentence) + '\n')
    file_out.close()


def multi_phrase(arg):
    return multi_phrase_parse(*arg)


if __name__ == '__main__':
    p = Pool()
    parameter = []
    parse = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    suffix = list(string.lowercase[0:10])
    for ele in suffix:
        parameter.append((ele, parse))
    p.map(multi_phrase, parameter)
import fileinput
from bllipparser import RerankingParser, Tree

if __name__ == '__main__':
    rrp = RerankingParser()
    parser = 'wsj/WSJ-PTB3/parser'
    rrp.load_parser_model(parser)
    for line in fileinput.input():
        tokens = Tree(line).tokens()
        nbest = rrp.parse(tokens)
        print len(nbest)
        for tree in nbest:
            print tree.ptb_parse
def set_assertions_for_yesno_questions(data):
    rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    yesno = data.get_questions_of_type('yesno')
    for q in tqdm(yesno):
        q.assertion_pos = q2s(q.question, rrp)
def train_svm(kernel_type):
    trainDF = fex.read_data('/home/baseline_AC/train_AC_combined_models.csv')
    testDF = fex.read_data(
        '/home/baseline_AC/test_AC_combined_models_duplicate_included.csv')
    model_dir = find('models/bllip_wsj_no_aux').path
    parser = RerankingParser.from_unified_model_dir(model_dir)
    #ctx = mx.gpu(0)
    #bert = BertEmbedding(ctx=ctx)
    #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    #model = BertModel.from_pretrained('bert-base-uncased')
    #tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    #model = RobertaModel.from_pretrained('roberta-base')
    #tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    #model = XLNetModel.from_pretrained('xlnet-base-cased')
    model_path = '/home/AC_models_Argument_corpus/roberta/'
    model = ClassificationModel(
        'roberta', model_path, num_labels=4,
        args={"config": {"output_hidden_states": True}})

    #trainDF_x = fex.extract_features(trainDF, parser)
    #trainDF_x = fex.extract_features(trainDF, parser, tokenizer, model)
    trainDF_x = fex.extract_features(trainDF, parser, model)
    feature_train_x = fex.make_feature_vector(trainDF_x)

    #testDF_x = fex.extract_features(testDF, parser)
    #testDF_x = fex.extract_features(testDF, parser, tokenizer, model)
    testDF_x = fex.extract_features(testDF, parser, model)
    feature_test_x = fex.make_feature_vector(testDF_x)

    # label encode the target variable
    train_y = []
    test_y = []
    for index, row in trainDF.iterrows():
        if row['label'] == 'Claim':
            train_y.append(1)
        elif row['label'] == 'Premise':
            train_y.append(0)
        elif row['label'] == 'MajorClaim':
            train_y.append(3)
        else:
            train_y.append(2)
    for index, row in testDF.iterrows():
        if row['label'] == 'Claim':
            test_y.append(1)
        elif row['label'] == 'Premise':
            test_y.append(0)
        elif row['label'] == 'MajorClaim':
            test_y.append(3)
        else:
            test_y.append(2)
    #train_y = encoder.fit_transform(train_y)

    svmclassifier = svm.SVC(kernel=kernel_type)
    svmclassifier.fit(feature_train_x, train_y)
    filename = 'finalized_model_linear.sav'
    #joblib.dump(svmclassifier, filename)
    y_pred = svmclassifier.predict(feature_test_x)

    print("argument corpus results for test:")
    print(confusion_matrix(test_y, y_pred))
    print(classification_report(test_y, y_pred))

    print("two law set results for test:")
    testlawDF = fex.read_data(
        '/home/baseline_AC/test_judgement_AC_combined_models_duplicate_included.csv')
    #testlawDF_x = fex.extract_features(testlawDF, parser)
    #testlawDF_x = fex.extract_features(testlawDF, parser, tokenizer, model)
    testlawDF_x = fex.extract_features(testlawDF, parser, model)
    feature_test_law = fex.make_feature_vector(testlawDF_x)
    test_y_law = []
    for index, row in testlawDF.iterrows():
        if row['label'] == 'Claim':
            test_y_law.append(1)
        elif row['label'] == 'Premise':
            test_y_law.append(0)
        elif row['label'] == 'MajorClaim':
            test_y_law.append(3)
        else:
            test_y_law.append(2)
    y_pred_2 = svmclassifier.predict(feature_test_law)
    print(confusion_matrix(test_y_law, y_pred_2))
    print(classification_report(test_y_law, y_pred_2))

    filename = 'finalized_model_svm_roberta_finetuned_embedding.sav'
    joblib.dump(svmclassifier, filename)
def main(sentence):
    model_dir = find('models/bllip_wsj_no_aux').path
    parser = RerankingParser.from_unified_model_dir(model_dir)
    # {'language': 'En', 'case_insensitive': False, 'nbest': 5,
    #  'small_corpus': True, 'overparsing': 21, 'debug': 0, 'smooth_pos': 0}
    parser.set_parser_options(case_insensitive=True)
    nbest = parser.parse(sentence)
    trees = [nbest.get_reranker_best().ptb_parse,
             nbest.get_parser_best().ptb_parse]
    synlist = []
    try:
        for x in range(2):
            synlist += find_syn(trees[x], 1)[1]
    except:
        pass
    synlist = rem_dupl(synlist)
    synlist = list(map(lambda x: x[1], synlist))
    #print(synlist)

    # done to split punctuation into separate tokens
    for i in puncts:
        sentence = sentence.replace(i, ' ' + i + ' ')
    # sentence = sentence.replace('\n', ' ')
    WORDS = sentence.split()
    # WORDS = list(map(lambda x: x.lower(), WORDS))
    # now WORDS = list of puncts and lower-cased words in arg text
    #print(WORDS)

    Dict = {}
    it = 1
    for w in synlist:
        while WORDS[it - 1] != w:
            Dict[WORDS[it - 1]] = []
            it = it + 1
        if w in puncts:
            continue
        if w.lower() in iitb_lingo:
            print([it, w, [iitb_lingo[w.lower()]]])
        else:
            synonyms = []
            q = "https://api.datamuse.com/words?ml=" + w
            # building a trigram such that adjacent words shouldn't be
            # IITB lingo or punctuation
            if it > 1 and not (WORDS[it - 2] in iitb_lingo) and not (WORDS[it - 2] in puncts):
                q = q + '&lc=' + WORDS[it - 2]
            if it < len(WORDS) and not (WORDS[it] in iitb_lingo) and not (WORDS[it] in puncts):
                q = q + '&rc=' + WORDS[it]
            response = requests.get(q)
            l = response.json()
            for i in l:
                synonyms.append(i["word"])
            # phrase finder
            # freq = []
            # for i in synonyms:
            #     phrase = i
            #     if w > 0 and not(WORDS[w-1] in iitb_lingo) and not(WORDS[w-1] in puncts):
            #         phrase = WORDS[w-1] + ' ' + phrase
            #     if w < len(WORDS)-1 and not(WORDS[w+1] in iitb_lingo) and not(WORDS[w+1] in puncts):
            #         phrase = phrase + ' ' + WORDS[w+1]
            #     encoded_query = urllib.parse.quote(phrase)
            #     params = {'corpus': 'eng-gb', 'query': encoded_query}
            #     params = '&'.join('{}={}'.format(name, value) for name, value in params.items())
            #     response = requests.get('https://api.phrasefinder.io/search?' + params)
            #     assert response.status_code == 200
            #     if len(response.json()["phrases"]) > 0:
            #         freq.append(response.json()["phrases"][0]["mc"])
            #     else:
            #         freq.append(0)
            # zipped = list(zip(synonyms, freq))
            # zipped = sorted(zipped, key=lambda x: x[1], reverse=True)
            # res = []
            # for i in range(min(3, len(zipped))):
            #     res.append(zipped[i][0])
            # print(res)
            # top 3 synonyms
            Dict[WORDS[it - 1]] = synonyms[:3]
            print([it, w, synonyms[:3]])
        it = it + 1
    return Dict
""" Create the Semantic Representation """ sentenceList = [] # a list for the SemanticRepresentation objects workPath = os.getcwd() dependencyInputFile = workPath+'/senna/input.txt' with open(dependencyInputFile) as f: dlines = f.readlines() #Load model to parse PENN TreeBank print 'Loading parsing model...' # only for the first run (uncomment the following line): # rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True) # when it is not the first run: rrp = RerankingParser.from_unified_model_dir('/Users/evania/.local/share/bllipparser/WSJ-PTB3') # Load model to parse PENN TreeBank - is finished #Now try to parse the text: print 'Parsing the dependency for the sentence(s)...' len_dlines = len(dlines) count_dlines = 1 for l in dlines: if l != '\n': # if not an empty line theDependencyResult = getDependency(l) theID = 0 semList = [] for token in theDependencyResult: #print token stringToken = str(token) sem = SemanticRepresentation()
    return []


warnings.filterwarnings('ignore')

rel_summary_all_doc = np.load("/home/yld8809/all_rel/tp_all_train.npy")
raw_doc_folder = "/home/yld8809/all_rel/txt_all_train/"

rel_summary_all_doc_test = np.load("/home/yld8809/all_rel/tp_all_test.npy")
raw_doc_folder_test = "/home/yld8809/all_rel/txt_all_test/"

model = gensim.models.KeyedVectors.load_word2vec_format(
    "/home/yld8809/semrel/mimic3_pp300.txt", binary=False)
model_size = 300

rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)

raw_ind = glob.glob(raw_doc_folder + '/*.txt')
raw_ind.sort()

raw_ind_test = glob.glob(raw_doc_folder_test + '/*.txt')
raw_ind_test.sort()

num_doc = len(raw_ind)
num_doc_test = len(raw_ind_test)

word_embedding_all = np.empty(shape=[0, model_size + 7])
dep_mat_all = np.empty(shape=[0, 0])
de_parse_last = []
last_sent = []
def __init__(self):
    self.rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
    self.sd = StanfordDependencies.get_instance(backend='subprocess')
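A sketch of how these two handles are typically combined, assuming the PyStanfordDependencies package; its convert_tree method takes the PTB bracketing string that the parser produces:

import StanfordDependencies
from bllipparser import RerankingParser

rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
sd = StanfordDependencies.get_instance(backend='subprocess')

# parse to a PTB tree, then convert it to Stanford Dependencies
ptb = str(rrp.parse('The dog barked.')[0].ptb_parse)
for token in sd.convert_tree(ptb):
    print('%d\t%s\thead=%d\t%s' % (token.index, token.form, token.head, token.deprel))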
def __init__(self, biomodel):
    self.parser = RerankingParser.from_unified_model_dir(biomodel.encode('utf-8'))
from bllipparser import RerankingParser
from nltk.tree import Tree
from nltk.data import find

model_dir = find('models/bllip_wsj_no_aux').path
bllip = RerankingParser.from_unified_model_dir(model_dir)

f = open("../Fragments_for_testing/text2", "r")
sentence = f.read()

all_parses = bllip.parse(sentence)
ptb = all_parses[0].ptb_parse
tree = Tree.fromstring(str(ptb))
tree.draw()
def __init__(self):
    super(PatternLearner, self).__init__()
    self.rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
                break
        if good:
            new_nbest.append(t)
    return new_nbest


if __name__ == '__main__':
    if len(sys.argv) != 3 and len(sys.argv) != 4:
        print 'usage: python traversal.py vocab.gz gold.gz [nbest.gz]'
        sys.exit(0)

    words = read_vocab(sys.argv[1])
    if len(sys.argv) == 3:
        for line in open_file(sys.argv[2]):
            print ptb(line[:-1], words)
    else:
        rrp = RerankingParser()
        parser = 'wsj/WSJ-PTB3/parser'
        rrp.load_parser_model(parser)
        for gold, nbest in zip(open_file(sys.argv[2]),
                               generate_nbest(open_file(sys.argv[3]))):
            for tree in nbest:
                tree['seq'] = ptb(tree['ptb'], words)
            nbest = remove_duplicates(nbest)
            gold = Tree(gold)
            print len(nbest)
            for t in nbest:
                scores = Tree(t['ptb']).evaluate(gold)
                print scores['gold'], scores['test'], scores['matched']
                print t['seq']