Example #1
import os

import supar
from supar import Parser


def test_parse():
    sents = {'en': 'She enjoys playing tennis.',
             'zh': '她喜欢打网球.',
             'de': 'Sie spielt gerne Tennis.',
             'fr': 'Elle aime jouer au tennis.',
             'ru': 'Она любит играть в теннис.',
             'he': 'היא נהנית לשחק טניס.'}
    tokenized_sents = {'en': ['She', 'enjoys', 'playing', 'tennis', '.'],
                       'zh': ['她', '喜欢', '打', '网球', '.'],
                       'de': ['Sie', 'spielt', 'gerne', 'Tennis', '.'],
                       'fr': ['Elle', 'aime', 'jouer', 'au', 'tennis', '.'],
                       'ru': ['Она', 'любит', 'играть', 'в', 'теннис', '.'],
                       'he': ['היא', 'נהנית', 'לשחק', 'טניס', '.']}
    for name, model in supar.NAME.items():
        if 'xlmr' in name or 'roberta' in name or 'electra' in name:
            continue
        parser = Parser.load(name, reload=True)
        if name.endswith(('en', 'zh')):
            lang = name[-2:]
            parser.predict(sents[lang], prob=True, lang=lang)
            parser.predict(tokenized_sents[lang], prob=True, lang=None)
        else:
            for lang in sents:
                parser.predict(sents[lang], prob=True, lang=lang)
            parser.predict(list(tokenized_sents.values()), prob=True, lang=None)
        os.remove(os.path.join(os.path.expanduser('~/.cache/supar'), model))
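The test above sweeps every registered model and deletes each checkpoint after use. For orientation, here is a minimal sketch of the same predict API against a single model (assuming the 'biaffine-dep-en' checkpoint is available; the result attributes are the ones documented in the supar README):

from supar import Parser

parser = Parser.load('biaffine-dep-en')
# raw text plus lang='en' lets supar run its own tokenizer
dataset = parser.predict('She enjoys playing tennis.', lang='en', prob=True, verbose=False)
print(dataset.sentences[0])  # CoNLL-style analysis of the first sentence
print(dataset.arcs[0])       # predicted head index for each token
print(dataset.rels[0])       # predicted dependency relation for each token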
Example #2
import nltk

from supar import Parser


def test_bert():
    nltk.download('punkt')
    sentence = nltk.word_tokenize('''
        No, it wasn't Black Monday.
        But while the New York Stock Exchange didn't fall apart Friday as the Dow Jones Industrial Average
        plunged 190.58 points - most of it in the final hour - it barely managed to stay this side of chaos.
        Some "circuit breakers" installed after the October 1987 crash failed their first test, traders say,
        unable to cool the selling panic in both stocks and futures.
        The 49 stock specialist firms on the Big Board floor - the buyers and sellers of last resort
        who were criticized after the 1987 crash - once again couldn't handle the selling pressure.
        Big investment banks refused to step up to the plate to support the beleaguered floor traders
        by buying big blocks of stock, traders say.
        Heavy selling of Standard & Poor's 500-stock index futures in Chicago relentlessly beat stocks downward.
        Seven Big Board stocks - UAL, AMR, BankAmerica, Walt Disney, Capital Cities/ABC,
        Philip Morris and Pacific Telesis Group - stopped trading and never resumed.
        The finger-pointing has already begun. "The equity market was illiquid.
        Once again {the specialists} were not able to handle the imbalances on the floor of the New York Stock Exchange,"
        said Christopher Pedersen, senior vice president at Twenty-First Securities Corp.
        Countered James Maguire, chairman of specialists Henderson Brothers Inc.:
        "It is easy to say the specialist isn't doing his job.
        When the dollar is in a free-fall, even central banks can't stop it.
        Speculators are calling for a degree of liquidity that is not there in the market."
        Many money managers and some traders had already left their offices early Friday afternoon on a warm autumn day -
        because the stock market was so quiet.
        Then in a lightning plunge,
        the Dow Jones industrials in barely an hour surrendered about a third of their gains this year,
        chalking up a 190.58-point, or 6.9%, loss on the day in gargantuan trading volume.
        Final-hour trading accelerated to 108.1 million shares, a record for the Big Board.
        At the end of the day, 251.2 million shares were traded.
        The Dow Jones industrials closed at 2569.26.
        The Dow's decline was second in point terms only to the 508-point Black Monday crash that occurred Oct. 19, 1987.
        ''')
    parser = Parser.load('biaffine-dep-bert-en')
    parser.predict([sentence], prob=True)
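Because prob=True is passed, the returned dataset also carries marginal probabilities. A small sketch of reading them back, reusing the sentence list from the test above (the probs attribute is the one shown in the supar README; treat the exact tensor shape as model-dependent):

dataset = parser.predict([sentence], prob=True)
print(dataset.probs[0])  # head-probability tensor for the (single) sentence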
Example #3
def __init__(self, bert, danku, vocab):
    from supar import Parser
    self.bert = bert
    self.vocab = vocab
    self.simplify = {}
    if bert.startswith("guwenbert"):
        from suparkanbun.simplify import simplify
        self.simplify = simplify
    d = os.path.join(DOWNLOAD_DIR, bert + ".pos")
    self.tagger = AutoModelTagger(d)
    f = os.path.join(d, bert + ".supar")
    self.supar = Parser.load(f)
    if danku:
        d = os.path.join(DOWNLOAD_DIR, bert + ".danku")
        self.danku = AutoModelTagger(d, ["B", "E", "E2", "E3", "M", "S"])
    else:
        self.danku = None
    self.gloss = MakeGloss()
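Note that Parser.load is called here with a local file path (the downloaded .supar checkpoint) rather than a registered model name; supar supports both forms. DOWNLOAD_DIR, AutoModelTagger and MakeGloss come from the surrounding suparkanbun package.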
Example #4
import io, os, re
from supar import Parser
# parser = Parser.load("crf-con-electra-zh")
parser = Parser.load("biaffine-dep-electra-zh")

plain_file_path = "../snacs_plain.txt"
supar_ctb_path = "../snacs_supar.ctb"

with io.open(plain_file_path, "r", encoding="utf8") as f:
    lines = [re.split(r" +", x.strip()) for x in f.read().strip().split("\n")]

# lines = lines[:10]

## Method 1: as a whole
# parsed = parser.predict(lines, lang=None, verbose=False)
# parsed = [str(x) for x in parsed]

## Method 2: sentence by sentence
parsed = []
for line_id, line in enumerate(lines):
    # skip sentences already handled (presumably parsed in an earlier, interrupted run)
    if line_id <= 610:
        continue
    tmp = str(parser.predict([line], lang=None, verbose=False)[0])
    parsed.append(tmp)
    print(tmp)
    print("o Done with line: %d" % line_id)

with io.open(supar_ctb_path, "w", encoding="utf8") as f:
    f.write("\n\n".join(parsed) + "\n\n")
Example #5
from supar import Parser

parser = Parser.load('biaffine-dep-en')

print('Done')
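Even this two-line script may be slow on first run: Parser.load fetches the named checkpoint into a local cache on first use (Example #1 above deletes files from ~/.cache/supar after testing).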
Example #6
import supar
from supar import Parser


def test_parse():
    sentence = ['The', 'dog', 'chases', 'the', 'cat', '.']
    for name in supar.PRETRAINED:
        parser = Parser.load(name)
        parser.predict([sentence], prob=True)
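Iterating over supar.PRETRAINED runs the same toy sentence through every released checkpoint, so unlike Example #1 this test downloads all models and keeps them on disk.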
Example #7
#%%

import sys
from supar import Parser

#%%

### getting plain text for dependency parsing
# sents_output = [" ".j???oin([token[3] for token in sent]) for sent in sents]
# with open("{}.plain".format(section), "w") as f:
# 	for sent in sents_output:
# 		f.write(sent)
# 		f.write('\n')

#%%
parser_biaffine = Parser.load('biaffine-dep-en')
# parser_biaffine_bert = Parser.load('biaffine-dep-bert-en')
parser_crf_dep_en = Parser.load('crf-dep-en')
parser_crf2o_dep_en = Parser.load('crf2o-dep-en')

#%%
# sents = sents

#%%
dataset_biaffine = parser_biaffine.predict(sents, prob=True, verbose=False)
# dataset_biaffine_bert = parser_biaffine_bert.predict(sents, prob=True, verbose=False)
dataset_crf2o_dep_en = parser_crf2o_dep_en.predict(sents, prob=True, verbose=False)
dataset_crf_dep_en = parser_crf_dep_en.predict(sents, prob=True, verbose=False)

#%%
with open("{}.parsed.biaffine".format(section), "w") as f:
Example #8
 def __init__(self, model="crf-con-en"):
     super().__init__(Parser.load(model))
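'crf-con-en' is a constituency model, so the wrapped parser returns bracketed trees rather than dependency arcs. A minimal sketch of calling it directly (indexing the result as in Example #4):

from supar import Parser

con = Parser.load('crf-con-en')
tree = con.predict([['She', 'enjoys', 'playing', 'tennis', '.']], verbose=False)[0]
print(tree)  # a bracketed constituency tree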
Example #9
def __init__(self):
    super().__init__()
    self.parser = Parser.load("crf2o-dep-en")
Example #10
        sents += [container]
        container = []

output = sents
sents_output = [[token[3] for token in sent] for sent in sents]
print(sents_output)
#%%
### getting plain text for dependency parsing
# sents_output = [" ".join([token[3] for token in sent]) for sent in sents]
# with open("{}.plain".format(section), "w") as f:
# 	for sent in sents_output:
# 		f.write(sent)
# 		f.write('\n')

import numpy as np
from supar import Parser
parser_biaffine = Parser.load('crf-dep-en')
dataset_biaffine = parser_biaffine.predict(sents_output,
                                           prob=True,
                                           verbose=False)

# overwrite the gold head (CoNLL column 6) with the predicted head for a random ~half of the sentences
for sent, arc in zip(output, dataset_biaffine.arcs):
    if np.random.rand() > 0.5:
        for line, head in zip(sent, arc):
            line[6] = str(head)

print(output[0])
lines = []
for sent in output:
    for line in sent:
        lines.append(line)  # assumed body: the original snippet breaks off here

model_ind = sys.argv[3]
dep_type = sys.argv[4]
if dep_type == "sd":
	dep_type_ind = "sdeps"
else:
	dep_type_ind = "udeps"
with open("{}.plain".format(section)) as f:
	lines = f.readlines()
	


if parser_ind == 'supar':
	sents = [line.strip().replace('(', '-LRB-').replace(')', '-RRB-').split(' ') for line in lines]

	from supar import Parser
	parser = Parser.load(model_ind)
	dataset = parser.predict(sents, prob=True, verbose=False)

	with open("{}.parsed.{}.{}".format(section, dep_type_ind, model_ind), "w") as f:
		for arc in dataset.arcs:
			f.write(' '.join(map(str, arc)))
			f.write('\n')
		print("finished {} {}".format(parser_ind, "heads"))
	with open("{}.parsed.{}.{}.labels".format(section, dep_type_ind, model_ind), "w") as f:
		for rel in dataset.rels:
			f.write(' '.join(map(str, rel)))
			f.write('\n')
		print("finished {} {}".format(parser_ind, "labels"))

if parser_ind == 'benepar':
	sents = [line.strip().split(' ') for line in lines]