Example #1
def annotate(sent):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words = []
    for sent in client.annotate(sent).sentences:
        for tok in sent:
            words.append(tok.word)
    return words
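A minimal usage sketch, assuming the stanza import and a module-level client = None (both outside the snippet above), plus a running CoreNLP server reachable at the client's default address:

from stanza.nlp.corenlp import CoreNLPClient

client = None  # the function above creates the client lazily on first use

print(annotate('Stanford CoreNLP splits sentences. It also tokenizes them.'))
# expected shape: one flat list of word strings spanning both sentences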
Example #2
class NLPParser(object):
    """
    NLP parse, including Part-Of-Speech tagging and dependency parse.
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Staford Core NLP parser
    """
    def __init__(self):
        self.parser = CoreNLPClient(
            default_annotators=['ssplit', 'tokenize', 'pos', 'depparse'])

    def parse(self, sent):
        """
        Part-Of-Speech tagging and dependency parse.
        :param sent: string
        :return: a list of tuple (word, pos, dependency)
        """
        result = self.parser.annotate(sent)
        tuples = []
        for s in result.sentences:
            word, pos, dep = [], [], []
            for token in s:
                word += [token.word]
                pos += [token.pos]
            edges = s.depparse(mode='enhanced').to_json()
            for e in edges:
                dep.append({
                    'type': e['dep'],
                    'dep': e['dependent'] - 1,
                    'gov': e['governor'] - 1
                })
            tuples.append((word, pos, dep))
        return tuples
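A usage sketch for the parser above (illustrative only; it assumes the same stanza CoreNLPClient import and a CoreNLP server with the depparse annotator loaded):

nlp = NLPParser()
for words, tags, deps in nlp.parse('The cat sat on the mat.'):
    for d in deps:
        # 'gov' and 'dep' are 0-based token indices into words; the root
        # edge has governor 0 in CoreNLP, so it becomes -1 here.
        head = words[d['gov']] if d['gov'] >= 0 else 'ROOT'
        print(d['type'], head, '->', words[d['dep']])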
Example #3
class NLPParser(object):
    """
    NLP parse, including Part-Of-Speech tagging.
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Staford Core NLP parser
    """
    def __init__(self):
        self.parser = CoreNLPClient(
            default_annotators=['ssplit', 'tokenize', 'pos', 'ner'],
            server="http://localhost:9000")

        #self.parser = POSTagger(corenlp_dir+'/models/english-bidirectional-distsim.tagger', corenlp_dir+'/stanford-postagger.jar')
    def parse(self, sent):
        result = self.parser.annotate(sent)
        tuples = []
        for sent in result.sentences:
            tokens, pos, ner = [], [], []
            for token in sent:
                tokens += [token.word]
                pos += [token.pos]
                ner += [token.ner]
            tuples.append((tokens, pos, ner))
        return tuples
Example #4
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
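Because the returned dict keeps each token's original surface form ('gloss') and the whitespace that followed it ('after'), the raw input can be approximately reassembled; a small sketch, assuming the same global client setup as in Example #1:

d = annotate('Dr. Smith arrived.  He sat down.', lower=True)
# zip the surface forms with their trailing whitespace to rebuild the text
# (leading whitespace before the first token is not recorded)
reconstructed = ''.join(g + a for g, a in zip(d['gloss'], d['after']))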
Example #5
def tokenize(df):
    from stanza.nlp.corenlp import CoreNLPClient
    parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize'],
                           server='http://localhost:9000')
    parsed = []
    for item in df['posts']:
        temp = []
        for sentence in item.strip().split('|||'):
            try:
                result = parser.annotate(sentence)
                tokens = []
                for i in range(len(result.sentences)):
                    tokens += result.sentences[i].tokens
                temp.append(' '.join([token.word for token in tokens]))
            except Exception:
                print('error', sentence)
        parsed.append(' <RETURN> '.join(temp))
    df['posts'] = parsed
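A usage sketch with a toy DataFrame (hypothetical data; the real input is expected to have a 'posts' column whose entries separate individual posts with '|||'), assuming pandas is installed and a CoreNLP server is listening on localhost:9000:

import pandas as pd

df = pd.DataFrame({'posts': ['I like NLP.|||Parsing is fun!']})
tokenize(df)
# df['posts'] now holds space-joined tokens, with ' <RETURN> ' between posts
print(df['posts'][0])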
Example #6
def do_command(args):
    reader = csv.reader(args.input, delimiter="\t")
    header = next(reader)
    assert all(field in header for field in ("id", "text"))

    Tweet = namedtuple("Tweet", header)
    client = CoreNLPClient()
    annotators = "tokenize ssplit lemma pos".split()

    writer = csv.writer(args.output, delimiter="\t")
    writer.writerow(["id", "tokens", "lemmas", "pos_tags"])

    for tweet in tqdm(Tweet(*row) for row in reader):
        doc = client.annotate(tweet.text, annotators)
        tokens, lemmas, pos_tags = [], [], []
        for sentence in doc:
            tokens += sentence.words
            lemmas += sentence.lemmas
            pos_tags += sentence.pos_tags
        writer.writerow([tweet.id, to_psql_array(tokens), to_psql_array(lemmas), to_psql_array(pos_tags)])
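The do_command example relies on a to_psql_array helper that is not shown; a plausible minimal sketch (an assumption, not the original implementation) that renders a Python list as a PostgreSQL array literal:

def to_psql_array(values):
    # assumed helper: quote and escape each item, then wrap in {...}
    # so the column can later be loaded into a PostgreSQL text[] field
    escaped = ('"' + str(v).replace('\\', '\\\\').replace('"', '\\"') + '"'
               for v in values)
    return '{' + ','.join(escaped) + '}'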
Example #7
def get_annotate(sentence, lower=True):
    # note: returns four parallel lists
    # todo: handle [Salmonella spp.] -> ['salmonella', 'spp.', '.']
    global client
    if client is None:
        client = CoreNLPClient(
            server='http://localhost:9000',
            default_annotators=['ssplit', 'tokenize', 'pos'])
    tokenize, origin, pos_tag, after = [], [], [], []
    for s in client.annotate(sentence):
        for t in s:
            if lower:
                tokenize.append(t.word.lower())
                origin.append(t.originalText.lower())
                pos_tag.append(t.pos)
            else:
                tokenize.append(t.word)
                origin.append(t.originalText)
                pos_tag.append(t.pos)
            after.append(t.after)
    return tokenize, origin, pos_tag, after
Example #8
class NLPParser(object):
    """
    NLP parser, including named-entity recognition.
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford CoreNLP parser
    """
    def __init__(self):
        self.parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'ner'])

    def get_ner(self, tokens):
        sent = ' '.join(tokens)
        result = self.parser.annotate(sent)
        ner = []
        for token in result.sentences[0]:
            ner.append(token.ner)
        return ner
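A brief usage sketch (assuming a CoreNLP server with the ner annotator); note that get_ner joins the tokens with spaces and only reads the first sentence of the result:

tagger = NLPParser()
tags = tagger.get_ner(['Barack', 'Obama', 'visited', 'Paris', '.'])
# one NER label per token of the first sentence,
# e.g. something like PERSON, PERSON, O, LOCATION (or CITY), O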
Example #9
class NLPParser(object):
    """
    NLP parse, including Part-Of-Speech tagging.
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Staford Core NLP parser
    """
    def __init__(self):
        self.parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'ner'])

    def parse(self, sent):
        result = self.parser.annotate(sent)
        tokens_list, ner_list = [], []
        for sent in result.sentences:
            tokens, ner = [], []
            currNERType = 'O'
            currNER = ''
            for token in sent:
                token_ner = token.ner
                if token_ner not in INTERESTED_STANFORD_EM_TYPES:
                    token_ner = 'O'
                tokens += [token.word]
                if token_ner == 'O':
                    if currNER != '':
                        ner.append(currNER.strip())
                    currNER = ''
                elif token_ner == currNERType:
                    currNER += token.word + ' '
                else:
                    if currNER != '':
                        ner.append(currNER.strip())
                    currNERType = token_ner
                    currNER = token.word + ' '
            if currNER != '':
                ner.append(currNER.strip())
            if len(tokens) == 0 or len(ner) == 0:
                continue
            tokens_list.append(tokens)
            ner_list.append(ner)
        return tokens_list, ner_list
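The snippet filters on an INTERESTED_STANFORD_EM_TYPES constant that is not shown; a plausible definition (an assumption, chosen from CoreNLP's standard entity labels) and a usage sketch:

# assumed constant: keep only these CoreNLP entity types when grouping mentions
INTERESTED_STANFORD_EM_TYPES = {'PERSON', 'LOCATION', 'ORGANIZATION', 'MISC'}

extractor = NLPParser()
tokens_list, ner_list = extractor.parse('Barack Obama visited Paris.')
# tokens_list: per-sentence token lists; ner_list: per-sentence multi-word
# entity strings such as 'Barack Obama' and 'Paris'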
Example #10
                    type=int,
                    default=50,
                    help="Maximum sequence length")
parser.add_argument('-shuffle', type=int, default=1, help="Shuffle data")
parser.add_argument('-seed', type=int, default=3435, help="Random seed")

parser.add_argument('-lower', action='store_true', help='lowercase data')

parser.add_argument('-report_every',
                    type=int,
                    default=100000,
                    help="Report status every this many sentences")

opt = parser.parse_args()

corenlp = CoreNLPClient(default_annotators=['tokenize', 'ssplit'])


def annotate_sentence(corenlp, gloss):
    try:
        parse = corenlp.annotate(gloss)
    except Exception:
        time.sleep(10)
        parse = corenlp.annotate(gloss)
    token_str = ' '.join([
        token['word'] for sentence in parse.json['sentence']
        for token in sentence['token']
    ])
    #return parse.json['sentence'][0]['token']
    return token_str
Example #11
            'text': para_string,
            'fbEMs': entities_fb,
            'dbEMs': entities_db,
            'sentences': sentences
        }
        paragraph_count += 1
        output_file.write(json.dumps(paragraph) + "\n")
    wiki_file.close()
    output_file.close()


''' Takes path to directory with folders and files extracted using WikiExtractor
    and path to output directory as inputs. Writes the sentences with required extracted information
    as Json files in output directory. 
    The Wikipedia dumps are extracted using the following command (preserve links)-

    python WikiExtractor.py -l enwiki-20160920-pages-articles-multistream.xml

'''

if __name__ == "__main__":
    input_file_path = sys.argv[1]
    output_file_path = sys.argv[2]

    corenlp_client = CoreNLPClient(
        server='http://localhost:9000',
        default_annotators=['ssplit', 'tokenize', 'lemma', 'pos', 'ner'])

    sparql = SPARQLWrapper.SPARQLWrapper("http://localhost:8890/sparql/")
    process_file(input_file_path, sparql, corenlp_client, output_file_path)
Example #12
import unicodedata

#consider using notebooks

from stanza.nlp.corenlp import CoreNLPClient
client = CoreNLPClient(server='http://localhost:9000',
                       default_annotators=['ssplit', 'tokenize'])

marquez = 'marquez/marquez'  # encoded in latin-1
ulysses = 'ulysses/ulysses'  # encoded in utf-8
emma = 'emma/emma'  # encoded in utf-8

with open(marquez + "token", 'w') as writeFile:
    for i in range(1, 21):
        filename = marquez + "%02d" % i
        with open(filename, 'r') as myfile:
            chapter = unicode(myfile.read(), encoding='latin-1')
            chapter = unicodedata.normalize('NFD', chapter)
            chapter = chapter.encode('ascii', 'ignore')

        annotated = client.annotate(chapter)
        for sentence in annotated:
            for token in sentence:
                writeFile.write(token.word.encode('ascii', 'ignore'))
                writeFile.write("\n")
Example #13
 def __init__(self):
     self.parser = CoreNLPClient(
         default_annotators=['ssplit', 'tokenize', 'ner'],
         server='http://localhost:9001')
Example #14
                state = {'model': model.state_dict()}
                torch.save(state, os.path.join('.', 'model_best.pt'))

                state = {'model_bert': model_bert.state_dict()}
                torch.save(state, os.path.join('.', 'model_bert_best.pt'))

            print(f" Best Dev lx acc: {acc_lx_t_best} at epoch: {epoch_best}")

    if args.do_infer:
        # To use recent corenlp: https://github.com/stanfordnlp/python-stanford-corenlp
        # 1. pip install stanford-corenlp
        # 2. download the required Java version
        # 3. export CORENLP_HOME=/Users/wonseok/utils/stanford-corenlp-full-2018-10-05

        from stanza.nlp.corenlp import CoreNLPClient
        client = CoreNLPClient(server='http://localhost:9000',
                               default_annotators='ssplit,tokenize'.split(','))

        # import corenlp
        #
        # client = corenlp.CoreNLPClient(annotators='ssplit,tokenize'.split(','))

        nlu1 = "Which company have more than 100 employees?"
        path_db = './data_and_model'
        db_name = 'dev'
        data_table = load_jsonl('./data_and_model/dev.tables.jsonl')
        table_name = 'table_10015132_11'  # change table here depending on the questions.
        n_Q = 100000 if args.infer_loop else 1
        for i in range(n_Q):
            if n_Q > 1:
                nlu1 = input('Type question: ')
            pr_sql_i, pr_ans = infer(nlu1,
Example #15
 def __init__(self):
     self.parser = CoreNLPClient(
         default_annotators=['ssplit', 'tokenize', 'pos', 'depparse'])
Example #16
 def _client(cls):
     if cls.__client is None:
         cls.__client = CoreNLPClient(server=config.CORENLP_SERVER, default_annotators=config.CORENLP_ANNOTATORS)
     return cls.__client
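The classmethod reads two settings from a config module that is not shown; a minimal sketch of what it is assumed to contain (values are illustrative):

# assumed contents of config.py
CORENLP_SERVER = 'http://localhost:9000'
CORENLP_ANNOTATORS = ['ssplit', 'tokenize', 'pos', 'ner']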
Example #17
from stanza.nlp.corenlp import CoreNLPClient
client = CoreNLPClient(server='http://localhost:9000', default_annotators=['ssplit', 'tokenize', 'lemma', 'pos', 'ner'])
annotated = client.annotate('This is an example document. Here is a second sentence')
for sentence in annotated.sentences:
    print('sentence', sentence)
    for token in sentence:
        print(token.word, token.lemma, token.pos, token.ner)
Example #18
def corenlp_tokenize(s: str) -> Sequence[str]:
    global corenlp_client
    if "corenlp_client" not in globals():
        corenlp_client = CoreNLPClient(
            default_annotators=["ssplit", "tokenize"])
    return [t.word for s in corenlp_client.annotate(s).sentences for t in s]
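A usage sketch (assuming the stanza CoreNLPClient import, typing.Sequence, and a reachable CoreNLP server); the client is created lazily on the first call and then reused via the global:

print(corenlp_tokenize('CoreNLP tokenizes this. And this too.'))
# -> a flat list of word strings drawn from both sentences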