Example #1
 def run(self):
     global rrp
     print "Reloading"
     rrp = RerankingParser()
     rrp.load_parser_model(os.path.join(os.path.dirname(__file__), '../../lib/bllip/DATA/EN'))
     rrp.load_reranker_model(os.path.join(os.path.dirname(__file__), '../../lib/bllip/models/ec50spfinal/features.gz'), os.path.join(os.path.dirname(__file__), '../../lib/bllip/models/ec50spfinal/cvlm-l1c10P1-weights.gz'))
     print "Done loading model"
Example #2
 def test_parse(self):
     path = bllip_wrapper.init_model()
     rrp = RerankingParser.from_unified_model_dir(path)
     tree = bllip_wrapper.parse(rrp, 'hello world!')
     self.assertIsNotNone(tree)
     print tree.ptb_parse
     self.assertEqual(str(tree.ptb_parse), '(S1 (S (NP (NN hello) (NN world) (NN !))))')
Example #3
    def parse(self,sent_filename):
        """
        use Charniak parser to parse sentences then convert results to Stanford Dependency
        """
        from bllipparser.ModelFetcher import download_and_install_model
        from bllipparser import RerankingParser
        #path_to_model = './bllip-parser/models/WSJ+Gigaword'
        #if not os.path.exists(path_to_model):
        model_type = 'WSJ+Gigaword'
        path_to_model = download_and_install_model(model_type,'./bllip-parser/models')
        print "Loading Charniak parser model: %s ..." % (model_type)
        rrp = RerankingParser.from_unified_model_dir(path_to_model)
        print "Begin Charniak parsing ..."
        parsed_filename = sent_filename+'.charniak.parse'
        parsed_trees = ''
        with open(sent_filename,'r') as f:
            for l in f:
                parsed_trees += rrp.simple_parse(l.strip().split())
                parsed_trees += '\n'

        with open(parsed_filename,'w') as of:
            of.write(parsed_trees)
                

        # convert parse tree to dependency tree
        print "Convert Charniak parse tree to Stanford Dependency tree ..."
        subprocess.call('./scripts/stdconvert.sh '+parsed_filename,shell=True)
Example #4
    def load_biomodel(self):
        rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
        for did in self.documents:
            for sentence in self.documents[did].sentences:
                sentence_text = [t.text for t in sentence.tokens]
                #echocall = Popen(["echo", sentence_text] , stdout=PIPE, stderr=PIPE)
                #nc_params = ["nc", "localhost", "4449"]
                #echocall.wait()
                #call = check_output(nc_params , shell=True, stdin=echocall.stdout)

                #res = call.communicate()
                #res = netcat("localhost", 4449, sentence_text)
                #print res.strip()
                #print
                res = rrp.parse(sentence_text)
                if len(res) > 0:
                    print res[0].ptb_parse
                    print sentence.parsetree
                    print
                    #print
                    sentence.bio_parse = str(res[0].ptb_parse)
                else:
                    print sentence_text
                    print "no parse"
                    sentence.bio_parse = sentence.parsetree
                    print
Example #5
 def __init__(self):
     if CharniakParser.parser is None:
         from bllipparser.ModelFetcher import download_and_install_model
         from bllipparser import RerankingParser
         model_type = 'WSJ+Gigaword'
         path_to_model = download_and_install_model(model_type,'./bllip-parser/models')
         print "Loading Charniak parser model: %s ..." % (model_type)
         CharniakParser.parser = RerankingParser.from_unified_model_dir(path_to_model)
Example #6
 def __init__(self, nbest=10, overparsing=10, only_parse=False, stop_words=None):
     self.parser = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=False)
     """create a Reranking Parser from BllipParser"""
     self.parser.set_parser_options(nbest=nbest, overparsing=overparsing)
     """set parser options"""
     self.only_parse = only_parse
     """whether features should be used from the BllipParser"""
     self.stemmer = LancasterStemmer()
     """an instance of LancasterStemmer from NLTK"""
     self.stop_words = stop_words
     if self.stop_words is None:
         self.stop_words = stopwords.words('english')
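set_parser_options() tunes the n-best search as used above; a minimal sketch of its effect (the sentence is illustrative):

from bllipparser import RerankingParser

parser = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=False)
# nbest bounds how many parses parse() returns; overparsing trades speed
# for a more thorough first-stage search
parser.set_parser_options(nbest=10, overparsing=10)
nbest_list = parser.parse('The protein binds DNA .')
print(len(nbest_list))  # at most 10 scored parses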
Example #7
def features(docList):
    import time

    
    # download model (only needs to be done once)
    model_dir = download_and_install_model('WSJ', '/tmp/models')
    # Loading the model is slow, but only needs to be done once
    rrp = RerankingParser.from_unified_model_dir(model_dir)
    rrp.set_parser_options(nbest=5)
    features = []
    scores = []
    with open("output_log.txt", "w") as logF, open("syn_feats.pkl", "w")  as synFile, open("syn_scores.pkl", "w")  as scoresFile:

        for i, doc in enumerate(docList):
            start_time = time.time()

            features.append(defaultdict(float))
            scores.append(defaultdict(list))

            for sentence in doc:
                
                parses = rrp.parse(sentence, rerank=False)
                #print(len(parses))
                #print(sentence, file = logF)
                try:
                    parse_score = parses[0].parser_score
                    rerank_score = parses[0].reranker_score
                    scores[i]['parse'].append(parse_score)
                    scores[i]['rerank'].append(rerank_score)
                    scores[i]['sent_length'].append(len(parses[0].ptb_parse.tokens()))
    
                    best_parse = parses[0].ptb_parse
                    # print(best_parse, file = logF)
                
                    for t in best_parse.all_subtrees():
                        levels = buildSubtrees(t)
                        for l in levels:
                            features[i][l] += 1.0
                except IndexError:  # rrp.parse() returned an empty n-best list
                    print("No parse available - skipping")
            features[i] = {x:v for x,v in features[i].items()}
            print("{0}".format(sorted(features[i].items(), key=operator.itemgetter(1), reverse=True)), file = logF)
            print("--- {0} seconds for {1} sentences ---" .format(time.time() - start_time, len(doc)))

        pickle.dump(features, synFile)
        pickle.dump(scores, scoresFile)


#     t_bllip = Timer(lambda: rrp.parse(sentence))
#     print ("bllip", t_bllip.timeit(number=5))
    
    pass
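A hypothetical driver for features(): it expects docList to be a list of documents, each a list of sentences (the sentences below are illustrative only):

docs = [
    ["The cat sat on the mat .", "It purred ."],
    ["Parsing is fun ."],
]
features(docs)  # writes output_log.txt, syn_feats.pkl and syn_scores.pkl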
Example #8
    def __init__(
        self,
        parser_model=None,
        reranker_features=None,
        reranker_weights=None,
        parser_options=None,
        reranker_options=None,
    ):
        """
        Load a BLLIP Parser model from scratch. You'll typically want to
        use the ``from_unified_model_dir()`` class method to construct
        this object.

        :param parser_model: Path to parser model directory
        :type parser_model: str

        :param reranker_features: Path the reranker model's features file
        :type reranker_features: str

        :param reranker_weights: Path the reranker model's weights file
        :type reranker_weights: str

        :param parser_options: optional dictionary of parser options, see
        ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
        for more information.
        :type parser_options: dict(str)

        :param reranker_options: optional
        dictionary of reranker options, see
        ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
        for more information.
        :type reranker_options: dict(str)
        """
        _ensure_bllip_import_or_error()

        parser_options = parser_options or {}
        reranker_options = reranker_options or {}

        self.rrp = RerankingParser()
        self.rrp.load_parser_model(parser_model, **parser_options)
        if reranker_features and reranker_weights:
            self.rrp.load_reranker_model(
                features_filename=reranker_features,
                weights_filename=reranker_weights,
                **reranker_options
            )
Example #9
class BllipParser(ParserI):
    """
    Interface for parsing with BLLIP Parser. BllipParser objects can be
    constructed with the ``BllipParser.from_unified_model_dir`` class
    method or manually using the ``BllipParser`` constructor.
    """

    def __init__(
        self,
        parser_model=None,
        reranker_features=None,
        reranker_weights=None,
        parser_options=None,
        reranker_options=None,
    ):
        """
        Load a BLLIP Parser model from scratch. You'll typically want to
        use the ``from_unified_model_dir()`` class method to construct
        this object.

        :param parser_model: Path to parser model directory
        :type parser_model: str

        :param reranker_features: Path the reranker model's features file
        :type reranker_features: str

        :param reranker_weights: Path the reranker model's weights file
        :type reranker_weights: str

        :param parser_options: optional dictionary of parser options, see
        ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
        for more information.
        :type parser_options: dict(str)

        :param reranker_options: optional
        dictionary of reranker options, see
        ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
        for more information.
        :type reranker_options: dict(str)
        """
        _ensure_bllip_import_or_error()

        parser_options = parser_options or {}
        reranker_options = reranker_options or {}

        self.rrp = RerankingParser()
        self.rrp.load_parser_model(parser_model, **parser_options)
        if reranker_features and reranker_weights:
            self.rrp.load_reranker_model(
                features_filename=reranker_features,
                weights_filename=reranker_weights,
                **reranker_options
            )

    def parse(self, sentence):
        """
        Use BLLIP Parser to parse a sentence. Takes a sentence as a list
        of words; it will be automatically tagged with this BLLIP Parser
        instance's tagger.

        :return: An iterator that generates parse trees for the sentence
        from most likely to least likely.

        :param sentence: The sentence to be parsed
        :type sentence: list(str)
        :rtype: iter(Tree)
        """
        _ensure_ascii(sentence)
        nbest_list = self.rrp.parse(sentence)
        for scored_parse in nbest_list:
            yield _scored_parse_to_nltk_tree(scored_parse)

    def tagged_parse(self, word_and_tag_pairs):
        """
        Use BLLIP to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized
        and tagged. BLLIP will attempt to use the tags provided but may
        use others if it can't come up with a complete parse subject
        to those constraints. You may also specify a tag as ``None``
        to leave a token's tag unconstrained.

        :return: An iterator that generates parse trees for the sentence
        from most likely to least likely.

        :param sentence: Input sentence to parse as (word, tag) pairs
        :type sentence: list(tuple(str, str))
        :rtype: iter(Tree)
        """
        words = []
        tag_map = {}
        for i, (word, tag) in enumerate(word_and_tag_pairs):
            words.append(word)
            if tag is not None:
                tag_map[i] = tag

        _ensure_ascii(words)
        nbest_list = self.rrp.parse_tagged(words, tag_map)
        for scored_parse in nbest_list:
            yield _scored_parse_to_nltk_tree(scored_parse)

    @classmethod
    def from_unified_model_dir(
        cls, model_dir, parser_options=None, reranker_options=None
    ):
        """
        Create a ``BllipParser`` object from a unified parsing model
        directory. Unified parsing model directories are a standardized
        way of storing BLLIP parser and reranker models together on disk.
        See ``bllipparser.RerankingParser.get_unified_model_parameters()``
        for more information about unified model directories.

        :return: A ``BllipParser`` object using the parser and reranker
        models in the model directory.

        :param model_dir: Path to the unified model directory.
        :type model_dir: str
        :param parser_options: optional dictionary of parser options, see
        ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
        for more information.
        :type parser_options: dict(str)
        :param reranker_options: optional dictionary of reranker options, see
        ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
        for more information.
        :type reranker_options: dict(str)
        :rtype: BllipParser
        """
        (
            parser_model_dir,
            reranker_features_filename,
            reranker_weights_filename,
        ) = get_unified_model_parameters(model_dir)
        return cls(
            parser_model_dir,
            reranker_features_filename,
            reranker_weights_filename,
            parser_options,
            reranker_options,
        )
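A minimal usage sketch for the class above as shipped in nltk.parse.bllip, assuming the bllip_wsj_no_aux model has been fetched with nltk.download('bllip_wsj_no_aux'):

from nltk.data import find
from nltk.parse.bllip import BllipParser

model_dir = find('models/bllip_wsj_no_aux').path
bllip = BllipParser.from_unified_model_dir(model_dir)
# parse() yields nltk.Tree parses from most to least likely
best = next(bllip.parse('British left waffles on Falklands .'.split()))
print(best)
# tags can be partially constrained; None leaves a token's tag free
best_tagged = next(bllip.tagged_parse([('British', 'JJ'), ('left', 'VBD'),
                                       ('waffles', None), ('on', None),
                                       ('Falklands', None), ('.', '.')]))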
Example #10
class BllipParser(ParserI):
    """
    Interface for parsing with BLLIP Parser. BllipParser objects can be
    constructed with the ``BllipParser.from_unified_model_dir`` class
    method or manually using the ``BllipParser`` constructor.
    """
    def __init__(
        self,
        parser_model=None,
        reranker_features=None,
        reranker_weights=None,
        parser_options=None,
        reranker_options=None,
    ):
        """
        Load a BLLIP Parser model from scratch. You'll typically want to
        use the ``from_unified_model_dir()`` class method to construct
        this object.

        :param parser_model: Path to parser model directory
        :type parser_model: str

        :param reranker_features: Path the reranker model's features file
        :type reranker_features: str

        :param reranker_weights: Path the reranker model's weights file
        :type reranker_weights: str

        :param parser_options: optional dictionary of parser options, see
        ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
        for more information.
        :type parser_options: dict(str)

        :param reranker_options: optional
        dictionary of reranker options, see
        ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
        for more information.
        :type reranker_options: dict(str)
        """
        _ensure_bllip_import_or_error()

        parser_options = parser_options or {}
        reranker_options = reranker_options or {}

        self.rrp = RerankingParser()
        self.rrp.load_parser_model(parser_model, **parser_options)
        if reranker_features and reranker_weights:
            self.rrp.load_reranker_model(features_filename=reranker_features,
                                         weights_filename=reranker_weights,
                                         **reranker_options)

    def parse(self, sentence):
        """
        Use BLLIP Parser to parse a sentence. Takes a sentence as a list
        of words; it will be automatically tagged with this BLLIP Parser
        instance's tagger.

        :return: An iterator that generates parse trees for the sentence
        from most likely to least likely.

        :param sentence: The sentence to be parsed
        :type sentence: list(str)
        :rtype: iter(Tree)
        """
        _ensure_ascii(sentence)
        nbest_list = self.rrp.parse(sentence)
        for scored_parse in nbest_list:
            yield _scored_parse_to_nltk_tree(scored_parse)

    def tagged_parse(self, word_and_tag_pairs):
        """
        Use BLLIP to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized
        and tagged. BLLIP will attempt to use the tags provided but may
        use others if it can't come up with a complete parse subject
        to those constraints. You may also specify a tag as ``None``
        to leave a token's tag unconstrained.

        :return: An iterator that generates parse trees for the sentence
        from most likely to least likely.

        :param sentence: Input sentence to parse as (word, tag) pairs
        :type sentence: list(tuple(str, str))
        :rtype: iter(Tree)
        """
        words = []
        tag_map = {}
        for i, (word, tag) in enumerate(word_and_tag_pairs):
            words.append(word)
            if tag is not None:
                tag_map[i] = tag

        _ensure_ascii(words)
        nbest_list = self.rrp.parse_tagged(words, tag_map)
        for scored_parse in nbest_list:
            yield _scored_parse_to_nltk_tree(scored_parse)

    @classmethod
    def from_unified_model_dir(cls,
                               model_dir,
                               parser_options=None,
                               reranker_options=None):
        """
        Create a ``BllipParser`` object from a unified parsing model
        directory. Unified parsing model directories are a standardized
        way of storing BLLIP parser and reranker models together on disk.
        See ``bllipparser.RerankingParser.get_unified_model_parameters()``
        for more information about unified model directories.

        :return: A ``BllipParser`` object using the parser and reranker
        models in the model directory.

        :param model_dir: Path to the unified model directory.
        :type model_dir: str
        :param parser_options: optional dictionary of parser options, see
        ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
        for more information.
        :type parser_options: dict(str)
        :param reranker_options: optional dictionary of reranker options, see
        ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
        for more information.
        :type reranker_options: dict(str)
        :rtype: BllipParser
        """
        (
            parser_model_dir,
            reranker_features_filename,
            reranker_weights_filename,
        ) = get_unified_model_parameters(model_dir)
        return cls(
            parser_model_dir,
            reranker_features_filename,
            reranker_weights_filename,
            parser_options,
            reranker_options,
        )
Example #11
import bllipparser
from bllipparser import RerankingParser
from nltk import Tree
from practnlptools.tools import Annotator

score = 0


annotator = Annotator()

rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)

query = "Describe steps taken and worldwide reaction prior to introduction of the Euro on January 1, 1999."
candidate = "Europe's new currency, the euro, will rival the U.S. dollar as an international currency over the long term, Der Speigel magazine reported Sunday."

qListOfDict = annotator.getAnnotations(query)['srl']
cListOfDict = annotator.getAnnotations(candidate)['srl']


qParsed = ['(S1 ']
cParsed = ['(S1 ']

for list in qListOfDict:
Example #12
    parser.add_argument("-v", "--verbose", action="store_true", help="print debug information")
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    if not os.path.isfile(args.input):
        sys.stderr.write('Cannot find input file: %s\n' % args.input)
        sys.exit(2)

    logging.info('Input file: %s' % args.input)
    logging.info('Output file: %s' % args.output)
    return args.input, args.output


if __name__ == "__main__":
    inputfilename, outputfilename = parse_argv()

    model_dir = init_model()
    logging.info('loading model %s ...' % model_dir)
    rrp = RerankingParser.from_unified_model_dir(model_dir)

    collection = parse(inputfilename)
    collection.clear_infons()
    collection.infons['tool'] = 'Bllip'
    collection.infons['process'] = 'parse'

    parse_bioc(rrp, collection)

    collection.tobiocfile(outputfilename)
Example #13
    except:
        strJsonObj = '{}'
        dictTotal = {}
        traceback.print_exc()
    return dictTotal


fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopMixVersion = fopRoot + 'step4_mixCode/'
fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
fpPseudocodeBeforePOS = fopRoot + 'pseudocode_before_pos.txt'
fpCachedFilePath = fopRoot + 'cachedFilePaths.txt'
createDirIfNotExist(fopMixVersion)

model_dir = find('models/bllip_wsj_no_aux').path
parser = RerankingParser.from_unified_model_dir(model_dir)

print('before traverse')
lstFpJsonFiles = []
if not os.path.isfile(fpCachedFilePath):
    lstFop1 = sorted(glob.glob(fopMixVersion + '*/'))
    for fop1 in lstFop1:
        lstFop2 = sorted(glob.glob(fop1 + '*/'))
        for fop2 in lstFop2:
            lstFop3 = sorted(glob.glob(fop2 + 'v_*_label.txt'))
            # print(fp3)
            for fp3 in lstFop3:
                lstFpJsonFiles.append(fp3)
        print('end {}'.format(fop1))
    # sorted(glob.glob(fopMixVersion+'**/**/a_json.txt'))
    print('after {} '.format(len(lstFpJsonFiles)))
Example #14
	print 'e.g: python parse.py paraphrases.csv 0 bllip'	
	sys.exit(0)

if sys.argv[3] == 'bllip':
	# bllip
	parser = '/pro/dpg/dc65/models/WSJ+QB'
	print 'basic:', parser
elif sys.argv[3] == 'self':
	# self-trained
	parser = '/pro/dpg/dc65/models/WSJ+Gigaword'
	print 'self-trained:', parser
else:
	print 'parser options: bllip, self'
	sys.exit(0)

rrp = RerankingParser()
rrp.load_parser_model(parser + '/parser')
print 'reranker: /pro/dpg/dc65/models/WSJ/'
rrp.load_reranker_model('/pro/dpg/dc65/models/WSJ/reranker/features.gz', '/pro/dpg/dc65/models/WSJ/reranker/weights.gz')
	
mode = int(sys.argv[2]) # 0: gold, 1: 1best, 2: nbest
f = open('tmp/trees', 'w')
if mode == 2:
	g = open('tmp/scores', 'w')
with open(sys.argv[1], 'rb') as csvfile:
	reader = csv.reader(csvfile, delimiter=',', quotechar='"')
	iter = 0
	for row in reader:
		iter += 1
		if iter % 3 == 1:
			if mode == 0:
Example #15
from bllipparser import RerankingParser, tokenize
print "start loading model..."
rrp = RerankingParser.load_unified_model_dir('/home/yukang/selftrained')
print "finish loading model"
inputfile = "wsjtest"
outputfile = "wsjtest.reparse"
count = 0
data = open(inputfile)
output = open(outputfile, 'w')
sentence = []
for line in data:
	if len(line.split())==0:
		if len(sentence)==0:
			continue
		count+=1
		print "start solving", count
		#last line of the file must be a blank line to terminate the last sentence.
		l = [word[0].replace("(", "-LRB-").replace(")", "-RRB-") for word in sentence]
		ans = rrp.parse(l)
		output.write(str(ans[0].ptb_parse)+"\n")
		# if count > 1:
			# break
		sentence = []
	else:
		parts = line.split()
		sentence.append(parts)
output.close()
Example #16
import copy
import re
from bllipparser import RerankingParser
import itertools
import urllib
import pydot
import os
from bllipparser.ModelFetcher import download_and_install_model

if not os.path.exists(  os.path.join( os.getcwd(), "bllip", "models", "WSJ")  ):
	print "Downloading the BLLIP model ... "
	download_and_install_model('WSJ', os.path.join( os.getcwd(), "bllip", "models") )
	print "Done Downloading."

rrp = RerankingParser.from_unified_model_dir('bllip/models/WSJ')


def get_svg(data):
	graphs = pydot.graph_from_dot_data( data )
	svg_string = graphs[0].create_svg()
	return svg_string

def get_fsm_code(list_of_sentences):
	global rrp
	list_of_sentences = map( lambda sentence: (str(sentence)).lower(), list_of_sentences)
	list_of_sentences = map( lambda sentence:  re.sub(r'\..*', "", sentence ), list_of_sentences)
	list_of_parsed_strings = map( lambda sentence: rrp.simple_parse(sentence) , list_of_sentences)
	list_of_codified_parse_strings = map( lambda parse_string: ParseForest.codify_parse_string(parse_string) , list_of_parsed_strings)
	list_of_parse_forests = map( lambda codified_parse_string: ParseForest(codified_parse_string),  list_of_codified_parse_strings)
	# list_of_parse_forests = map( lambda codified_parse_string: ParseForest(codified_parse_string),  list_of_parsed_strings)
Example #17
            mapping[subtree.span()] = subtree.label
        for span, allowed_labels in constraints.items():
            if mapping.get(span) not in allowed_labels:
                return False
        return True

    nbest_list = rrp.parse(sentence)
    for item in nbest_list:
        if consistent(item.ptb_parse, constraints):
            return item.ptb_parse
    else:
        return None

if __name__ == "__main__":
    # this needs to be run from the root of the repository since it has
    # a relative path to the parsing model

    from bllipparser import RerankingParser
    rrp = RerankingParser()
    rrp.load_parser_model('first-stage/DATA/EN')

    # the constraint means: there must be a VP from [1,5)
    # (i.e., left ... Falklands)
    # this encourages the parser to pick "left" as the main verb
    constraints = {(1, 5): ['VP']}
    print parse_constrained(rrp, 'British left waffles on Falklands .'.split(),
                            constraints)
    # if we parse without constraints, we get that the main verb is "waffles"
    print parse_constrained(rrp, 'British left waffles on Falklands .'.split(),
                            {})
Example #18
from bllipparser import RerankingParser
RerankingParser.fetch_and_load('GENIA+PubMed')
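fetch_and_load() downloads the named model on first use (caching it locally) and returns a loaded parser, so the two lines above are a complete setup. A sketch of what typically follows (the sentence is illustrative):

from bllipparser import RerankingParser

rrp = RerankingParser.fetch_and_load('GENIA+PubMed')
print(rrp.simple_parse('The protein binds DNA .'))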
Example #19
        for span, allowed_labels in constraints.items():
            if mapping.get(span) not in allowed_labels:
                return False
        return True

    nbest_list = rrp.parse(sentence)
    for item in nbest_list:
        if consistent(item.ptb_parse, constraints):
            return item.ptb_parse
    else:
        return None


if __name__ == "__main__":
    # this needs to be run from the root of the repository since it has
    # a relative path to the parsing model

    from bllipparser import RerankingParser
    rrp = RerankingParser()
    rrp.load_parser_model('first-stage/DATA/EN')

    # the constraint means: there must be a VP from [1,5)
    # (i.e., left ... Falklands)
    # this encourages the parser to pick "left" as the main verb
    constraints = {(1, 5): ['VP']}
    print parse_constrained(rrp, 'British left waffles on Falklands .'.split(),
                            constraints)
    # if we parse without constraints, we get that the main verb is "waffles"
    print parse_constrained(rrp, 'British left waffles on Falklands .'.split(),
                            {})
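Recent bllipparser releases expose this pattern directly as RerankingParser.parse_constrained(); a minimal sketch, assuming such a release is installed:

from bllipparser import RerankingParser

rrp = RerankingParser.fetch_and_load('WSJ-PTB3')
# require a VP over the token span [1, 5), as in the example above
constraints = {(1, 5): ['VP']}
nbest = rrp.parse_constrained('British left waffles on Falklands .'.split(),
                              constraints)
print(nbest[0].ptb_parse)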
Example #20
def main(transcript):

    # results = {"0": "1.0", "1": "0.9747",
    #            "2": "0.968", "3": "0.8859", "4": "0.7071"}
    # print(json.dumps(results))

    results = {}
    sentences = sent_tokenize(transcript)
    '''
        Declaration of constants and functions
    '''

    CONS_SATIRIC = 0
    CONS_RELIABLE = 1
    rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=False)
    foo = TripletExtraction()
    bar = SemanticSimilarityAnalysis()
    '''
        2 database tables for comparison of input 
    '''
    c.execute('SELECT title FROM reliable_news')
    reliable_news = [tup[0] for tup in c.fetchall()]

    c.execute('SELECT title FROM satirical_news')
    satirical_news = [tup[0] for tup in c.fetchall()]

    t = len(sentences)
    correct_classifications = 0
    for i in range(t):
        max_similarity = 0
        classification = -1
        max_sentence = ""

        inp = sentences[i]
        ''' 
            generates the tree and gets the SVO of the input sentence
        '''
        tree_inp = Tree(rrp.simple_parse(inp))
        svo_inp = foo.getSVO(tree_inp[0])
        '''
            comparison for satirical and reliable news
        '''
        for title in satirical_news:
            for subj in svo_inp['subject']:
                if subj[2] == 0:
                    continue
                words = [x.lower() for x in sentence_tokenizer.tokenize(title)]
                if subj[0] in words or singularize(subj[0]) in words:
                    tree_data = Tree(rrp.simple_parse(title))
                    svo_data = foo.getSVO(tree_data[0])

                    similarity_score1 = bar.get_similarities(svo_inp, svo_data)
                    '''
                        object and subject swapped to provide more possible comparisons
                    '''
                    svo_data['subject'], svo_data['object'] = svo_data[
                        'object'], svo_data['subject']
                    similarity_score2 = bar.get_similarities(svo_inp, svo_data)

                    similarity_score = similarity_score1 if similarity_score1 > similarity_score2 else similarity_score2
                    if similarity_score > max_similarity:
                        classification = 0
                        max_similarity = similarity_score
                        max_sentence = title
                    break

        for title in reliable_news:
            for sht in satiric_shits:
                title = title.replace(sht, "")
            for subj in svo_inp['subject']:
                if subj[2] == 0:
                    continue
                words = [x.lower() for x in sentence_tokenizer.tokenize(title)]
                if subj[0] in words or singularize(subj[0]) in words:
                    tree_data = Tree(rrp.simple_parse(title))
                    svo_data = foo.getSVO(tree_data[0])

                    similarity_score1 = bar.get_similarities(svo_inp, svo_data)
                    '''
                        object and subject swapped to provide more possible comparisons
                    '''
                    svo_data['subject'], svo_data['object'] = svo_data[
                        'object'], svo_data['subject']
                    similarity_score2 = bar.get_similarities(svo_inp, svo_data)

                    similarity_score = similarity_score1 if similarity_score1 > similarity_score2 else similarity_score2
                    if similarity_score > max_similarity:
                        classification = 1
                        max_similarity = similarity_score
                        max_sentence = title
                    break
        if classification == CONS_RELIABLE:
            results[str(i)] = str(round(max_similarity, 4))
        elif classification == CONS_SATIRIC:
            results[str(i)] = str(round(-max_similarity, 4))
        else:
            results[str(i)] = "0"
    print(json.dumps(results))
Example #21
 def __init__(self, model="WSJ-PTB3"):
     super().__init__(RerankingParser.fetch_and_load(model, verbose=True))
Example #22
from bllipparser import RerankingParser

from kasami import TreeScorer
from kasami.normalizers import bllip

# Loading WSJ-PTB3 treebank into bllip's RerankingParser
bllip_rrp = RerankingParser.fetch_and_load('WSJ-PTB3')
bllip_parse = lambda s: bllip.normalize_tree(bllip_rrp.parse(s)[0].ptb_parse)

tree = bllip_parse("I am a little teapot")
print(tree)
print(tree.format(depth=1))

for production in tree:
    print(str(production))

sentences = [
    "I am a little teapot", "Here is my handle", "Here is my spout",
    "When I get all steamed up I just shout tip me over and pour me out",
    "I am a very special pot", "It is true",
    "Here is an example of what I can do", "I can turn my handle into a spout",
    "Tip me over and pour me out"
]

teapot_grammar = TreeScorer.from_tree_bank(bllip_parse(s) for s in sentences)

teapot_grammar.score(bllip_parse("Here is a little teapot"))
teapot_grammar.score(bllip_parse("It is my handle"))
teapot_grammar.score(bllip_parse("I am a spout"))
teapot_grammar.score(bllip_parse("Your teapot is gay"))
teapot_grammar.score(bllip_parse("Your mom's teapot is asldasnldansldal"))
Example #23
from bllipparser import RerankingParser

rrp = RerankingParser.from_unified_model_dir(
    '/home/kashefi/.local/share/bllipparser/WSJ-PTB3')
sentence = "In the 3rd level I would place my little brother in. because my little brother is a very greedy little bot he always wants something."

pcfg = rrp.simple_parse(sentence.split(' '))
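# strip the leading "(S1 " and the trailing ")" so only the inner tree remains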
pcfg = pcfg[4:len(pcfg) - 1]
print pcfg
'''
pcfg = rrp.simple_parse(sentence)
pcfg = pcfg[4:len(pcfg)-1]
print(pcfg)
'''
Example #24
if __name__ == "__main__":
    # needs to be run from the root of the repository for the parser
    # model path below to work

    from bllipparser import RerankingParser, Tree

    rrp = RerankingParser()
    rrp.load_parser_model('first-stage/DATA/EN', heads_only=True)

    tree1 = Tree('''(S1 (SQ (VBZ Swears) (NP (PRP she)) (VP (VBD
    recognized) (NP (PRP$ his) (NN voice)) (, ,) (SBAR (IN that) (S
    (NP (NNP Tim)) (VP (VBD fired)))) (, ,) ('' ') (S (S (NP (PRP It))
    (VP (VBZ 's) (NP (PRP$ my) (NN money)))) (CC and) (S (NP (PRP I))
    (VP (VBP want) (S (NP (PRP it)) (VP (POS '))))))) (. !)))''')

    head = tree1.head()
    print 'head word of sentence:', head.token
    print 'head tree of sentence:', head
    print

    # print all syntactic dependencies
    for governor, dependent in tree1.dependencies():
        print governor.token, '->', dependent.token
Example #25
 def __enter__(self):
     self.bllip = RerankingParser.fetch_and_load(self.model_name,
                                                 verbose=True)
     return self
Example #26
if __name__ == "__main__":
    # needs to be run from the root of the repository

    from bllipparser import RerankingParser, Tree

    rrp = RerankingParser()
    rrp.load_parser_model('first-stage/DATA/EN', terms_only=True)

    tree1 = Tree('''(S1 (INTJ (UH Oh) (JJ sure) (. !)))''')

    tree2 = Tree('''(S1 (FRAG (INTJ (UH Oh) (INTJ (JJ sure))) (. !)))''')

    print tree1.evaluate(tree2)
    print tree2.evaluate(tree1)
Example #27
from multiprocessing import Pool
from bllipparser import RerankingParser
import string

def multi_phrase_parse(s, rrp):
    file_in = open('./reverb_out%s' % s)
    file_out = open('./phrase_out%s' % s, 'w')
    for line in file_in:
        sep_line = line.split('\t')
        sentence = sep_line[12]
        file_out.write(rrp.simple_parse(sentence) + '\n')
    file_in.close()
    file_out.close()

def multi_phrase(arg):
    return multi_phrase_parse(*arg)

if __name__ == '__main__':
    p = Pool()
    parameter = []
    parse = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    suffix = list(string.ascii_lowercase[:10])
    for ele in suffix:
        parameter.append((ele, parse))
    p.map(multi_phrase, parameter)
Example #28
if __name__ == "__main__":
    # needs to be run from the root of the repository

    from bllipparser import RerankingParser, Tree

    rrp = RerankingParser()
    rrp.load_parser_model("first-stage/DATA/EN", terms_only=True)

    tree1 = Tree("""(S1 (INTJ (UH Oh) (JJ sure) (. !)))""")

    tree2 = Tree("""(S1 (FRAG (INTJ (UH Oh) (INTJ (JJ sure))) (. !)))""")

    print tree1.evaluate(tree2)
    print tree2.evaluate(tree1)
Example #29
import fileinput
from bllipparser import RerankingParser, Tree

if __name__ == '__main__':
    rrp = RerankingParser()
    parser = 'wsj/WSJ-PTB3/parser'
    rrp.load_parser_model(parser)
    for line in fileinput.input():
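        # each input line is a PTB tree; extract its terminals and re-parse them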
        tokens = Tree(line).tokens()
        nbest = rrp.parse(tokens)
        print len(nbest)
        for tree in nbest:
            print tree.ptb_parse
Example #30
def set_assertions_for_yesno_questions(data):
    rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    yesno = data.get_questions_of_type('yesno')
    for q in tqdm(yesno):
        q.assertion_pos = q2s(q.question, rrp)
Example #31
def train_svm(kernel_type):
    trainDF = fex.read_data('/home/baseline_AC/train_AC_combined_models.csv')
    testDF = fex.read_data(
        '/home/baseline_AC/test_AC_combined_models_duplicate_included.csv')
    model_dir = find('models/bllip_wsj_no_aux').path
    parser = RerankingParser.from_unified_model_dir(model_dir)

    #ctx = mx.gpu(0)
    #bert = BertEmbedding(ctx=ctx)

    #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    #model = BertModel.from_pretrained('bert-base-uncased')

    #tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    #model = RobertaModel.from_pretrained('roberta-base')

    #tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    #model = XLNetModel.from_pretrained('xlnet-base-cased')

    model_path = '/home/AC_models_Argument_corpus/roberta/'
    model = ClassificationModel(
        'roberta',
        model_path,
        num_labels=4,
        args={"config": {
            "output_hidden_states": True
        }})

    #trainDF_x = fex.extract_features(trainDF,parser)
    #trainDF_x = fex.extract_features(trainDF, parser, tokenizer, model)
    trainDF_x = fex.extract_features(trainDF, parser, model)
    feature_train_x = fex.make_feature_vector(trainDF_x)

    #testDF_x = fex.extract_features(testDF,parser)
    #testDF_x = fex.extract_features(testDF, parser, tokenizer, model)
    testDF_x = fex.extract_features(testDF, parser, model)
    feature_test_x = fex.make_feature_vector(testDF_x)

    # label encode the target variable
    train_y = []
    test_y = []

    for index, row in trainDF.iterrows():
        if row['label'] == 'Claim':
            train_y.append(1)
        elif row['label'] == 'Premise':
            train_y.append(0)
        elif row['label'] == 'MajorClaim':
            train_y.append(3)
        else:
            train_y.append(2)

    for index, row in testDF.iterrows():
        if row['label'] == 'Claim':
            test_y.append(1)
        elif row['label'] == 'Premise':
            test_y.append(0)
        elif row['label'] == 'MajorClaim':
            test_y.append(3)
        else:
            test_y.append(2)

    #train_y = encoder.fit_transform(train_y)

    svmclassifier = svm.SVC(kernel=kernel_type)
    svmclassifier.fit(feature_train_x, train_y)
    filename = 'finalized_model_linear.sav'
    #joblib.dump(svmclassifier, filename)

    y_pred = svmclassifier.predict(feature_test_x)
    print("argument corpus results for test:")
    print(confusion_matrix(test_y, y_pred))
    print(classification_report(test_y, y_pred))

    print("two law set results for test:")
    testlawDF = fex.read_data(
        '/home/baseline_AC/test_judgement_AC_combined_models_duplicate_included.csv'
    )
    #testlawDF_x = fex.extract_features(testlawDF,parser)
    #testlawDF_x = fex.extract_features(testlawDF, parser, tokenizer, model)
    testlawDF_x = fex.extract_features(testlawDF, parser, model)
    feature_test_law = fex.make_feature_vector(testlawDF_x)

    test_y_law = []
    for index, row in testlawDF.iterrows():
        if row['label'] == 'Claim':
            test_y_law.append(1)
        elif row['label'] == 'Premise':
            test_y_law.append(0)
        elif row['label'] == 'MajorClaim':
            test_y_law.append(3)
        else:
            test_y_law.append(2)

    y_pred_2 = svmclassifier.predict(feature_test_law)
    print(confusion_matrix(test_y_law, y_pred_2))
    print(classification_report(test_y_law, y_pred_2))
    filename = 'finalized_model_svm_roberta_finetuned_embedding.sav'
    joblib.dump(svmclassifier, filename)
Example #32
from bllipparser import RerankingParser, tokenize
print "start loading model..."
rrp = RerankingParser.load_unified_model_dir('/home/yukang/selftrained')
print "finish loading model"
inputfile = "wsjtest"
outputfile = "wsjtest.reparse"
count = 0
data = open(inputfile)
output = open(outputfile, 'w')
sentence = []
for line in data:
    if len(line.split()) == 0:
        if len(sentence) == 0:
            continue
        count += 1
        print "start solving", count
        #last line of the file must be a blank line to terminate the last sentence.
        l = [
            word[0].replace("(", "-LRB-").replace(")", "-RRB-")
            for word in sentence
        ]
        ans = rrp.parse(l)
        output.write(str(ans[0].ptb_parse) + "\n")
        # if count > 1:
        # break
        sentence = []
    else:
        parts = line.split()
        sentence.append(parts)
output.close()
Example #33
def main(sentence):

    model_dir = find('models/bllip_wsj_no_aux').path
    parser = RerankingParser.from_unified_model_dir(model_dir)
    #{'language': 'En', 'case_insensitive': False, 'nbest': 5, 'small_corpus': True, 'overparsing': 21, 'debug': 0, 'smooth_pos': 0}
    parser.set_parser_options(case_insensitive=True)
    l = parser.parse(sentence)
    trees = [l.get_reranker_best().ptb_parse, l.get_parser_best().ptb_parse]

    synlist = []
    try:
        for tree in trees:
            synlist += find_syn(tree, 1)[1]
    except Exception:
        pass
    synlist = rem_dupl(synlist)
    synlist = list(map(lambda x: x[1], synlist))

    #print(synlist)

    #done to split puncts separately
    for i in puncts:
        sentence = sentence.replace(i, ' ' + i + ' ')
    # sentence = sentence.replace('\n', ' ')

    WORDS = sentence.split()
    # WORDS = list(map(lambda x: x.lower(), WORDS))

    #now WORDS = list of puncts and lower-cased words in arg text

    #print(WORDS)

    Dict = {}

    it = 1
    for w in synlist:

        while (WORDS[it - 1] != w):
            Dict[WORDS[it - 1]] = []
            it = it + 1

        if w in puncts:
            continue

        if w.lower() in iitb_lingo:
            print([it, w, [iitb_lingo[w.lower()]]])

        else:

            synonyms = []

            q = "https://api.datamuse.com/words?ml=" + w

            #building trigram such that adjacent words shouldn't be iitb lingo or punctuation
            if it > 1 and not (WORDS[it - 2] in iitb_lingo) and not (
                    WORDS[it - 2] in puncts):
                q = q + '&lc=' + WORDS[it - 2]
            if it < len(WORDS) and not (WORDS[it] in iitb_lingo) and not (
                    WORDS[it] in puncts):
                q = q + '&rc=' + WORDS[it]

            response = requests.get(q)
            l = response.json()

            for i in l:
                synonyms.append(i["word"])

            #phrase finder
            # freq = []

            # for i in synonyms:

            # 	phrase = i
            # 	if w > 0 and not(WORDS[w-1] in iitb_lingo) and not(WORDS[w-1] in puncts):
            # 		phrase = WORDS[w-1] + ' ' + phrase
            # 	if w < len(WORDS)-1 and not(WORDS[w+1] in iitb_lingo) and not(WORDS[w+1] in puncts):
            # 		phrase = phrase + ' ' + WORDS[w+1]

            # 	encoded_query = urllib.parse.quote(phrase)
            # 	params = {'corpus': 'eng-gb', 'query': encoded_query}
            # 	params = '&'.join('{}={}'.format(name, value) for name, value in params.items())

            # 	response = requests.get('https://api.phrasefinder.io/search?' + params)
            # 	assert response.status_code == 200

            # 	if len(response.json()["phrases"]) > 0:
            # 		freq.append(response.json()["phrases"][0]["mc"])
            # 	else:
            # 		freq.append(0)

            # zipped = list(zip(synonyms, freq))
            # zipped = sorted(zipped, key = lambda x: x[1], reverse = True)
            # res = []
            # for i in range(min(3,len(zipped))):
            # 	res.append(zipped[i][0])
            # print(res)

            #top 5 synonyms
            Dict[WORDS[it - 1]] = synonyms[:3]
            print([it, w, synonyms[:3]])
            it = it + 1

    return Dict
Example #34
"""
Create the Semantic Representation
"""

sentenceList = [] # a list for the SemanticRepresentation objects

workPath = os.getcwd()
dependencyInputFile = workPath+'/senna/input.txt'
with open(dependencyInputFile) as f:
    dlines = f.readlines()
#Load model to parse PENN TreeBank
print 'Loading parsing model...'
# only for the first run (uncomment the following line):
# rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
# when it is not the first run:
rrp = RerankingParser.from_unified_model_dir('/Users/evania/.local/share/bllipparser/WSJ-PTB3')
# Load model to parse PENN TreeBank - is finished
#Now try to parse the text:
print 'Parsing the dependency for the sentence(s)...'
len_dlines = len(dlines)
count_dlines = 1

for l in dlines:
    if l != '\n': # if not an empty line
        theDependencyResult = getDependency(l)
        theID = 0
        semList = []
        for token in theDependencyResult:
            #print token
            stringToken = str(token)
            sem = SemanticRepresentation()
Example #35
    return []


warnings.filterwarnings('ignore')

rel_summary_all_doc = np.load("/home/yld8809/all_rel/tp_all_train.npy")
raw_doc_folder = "/home/yld8809/all_rel/txt_all_train/"

rel_summary_all_doc_test = np.load("/home/yld8809/all_rel/tp_all_test.npy")
raw_doc_folder_test = "/home/yld8809/all_rel/txt_all_test/"

model = gensim.models.KeyedVectors.load_word2vec_format(
    "/home/yld8809/semrel/mimic3_pp300.txt", binary=False)
model_size = 300

rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)

raw_ind = glob.glob(raw_doc_folder + '/*.txt')
raw_ind.sort()

raw_ind_test = glob.glob(raw_doc_folder_test + '/*.txt')
raw_ind_test.sort()

num_doc = len(raw_ind)
num_doc_test = len(raw_ind_test)

word_embedding_all = np.empty(shape=[0, model_size + 7])
dep_mat_all = np.empty(shape=[0, 0])
de_parse_last = []
last_sent = []
Example #36
 def __init__(self):
     self.rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
     self.sd = StanfordDependencies.get_instance(backend='subprocess')
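A minimal sketch of how the two objects above are typically combined, using PyStanfordDependencies' convert_tree() (the sentence is illustrative):

import StanfordDependencies
from bllipparser import RerankingParser

rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
sd = StanfordDependencies.get_instance(backend='subprocess')
parse = rrp.simple_parse('The dog barked .')
for token in sd.convert_tree(parse):  # CoNLL-style Token tuples
    print(token.index, token.form, token.head, token.deprel)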
Example #37
 def __init__(self, biomodel):
     self.parser = RerankingParser.from_unified_model_dir(biomodel.encode('utf-8'))
Example #38
from bllipparser import RerankingParser as rrp
from nltk.parse.api import ParserI
from nltk.tree import Tree
from nltk.data import find

model_dir = find('models/bllip_wsj_no_aux').path
bllip = rrp.from_unified_model_dir(model_dir)

f = open("../Fragments_for_testing/text2", "r")
sentence = f.read()
all_parses = bllip.parse(sentence)

ptb = all_parses[0].ptb_parse
tree = Tree.fromstring(str(ptb))
tree.draw()
Example #39
	def __init__(self):
		super(PatternLearner, self).__init__()
		self.rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
Example #40
                break
        if good:
            new_nbest.append(t)
    return new_nbest


if __name__ == '__main__':
    if len(sys.argv) != 3 and len(sys.argv) != 4:
        print 'usage: python traversal.py vocab.gz gold.gz [nbest.gz]'
        sys.exit(0)

    words = read_vocab(sys.argv[1])
    if len(sys.argv) == 3:
        for line in open_file(sys.argv[2]):
            print ptb(line[:-1], words)
    else:
        rrp = RerankingParser()
        parser = 'wsj/WSJ-PTB3/parser'
        rrp.load_parser_model(parser)
        for gold, nbest in zip(open_file(sys.argv[2]),
                               generate_nbest(open_file(sys.argv[3]))):
            for tree in nbest:
                tree['seq'] = ptb(tree['ptb'], words)
            nbest = remove_duplicates(nbest)
            gold = Tree(gold)
            print len(nbest)
            for t in nbest:
                scores = Tree(t['ptb']).evaluate(gold)
                print scores['gold'], scores['test'], scores['matched']
                print t['seq']