Code example #1
from stanza.nlp.corenlp import CoreNLPClient

client = None


def annotate(sent):
    """Tokenize a string with CoreNLP and return a flat list of token strings."""
    global client
    if client is None:
        # Lazily create the client so the CoreNLP server is only contacted on first use.
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words = []
    for sentence in client.annotate(sent).sentences:
        for tok in sentence:
            words.append(tok.word)
    return words
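A minimal usage sketch, assuming a CoreNLP server is reachable at the stanza client's default address (the example sentence and output are illustrative):

tokens = annotate('Stanford CoreNLP splits and tokenizes this text.')
# e.g. ['Stanford', 'CoreNLP', 'splits', 'and', 'tokenizes', 'this', 'text', '.']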
Code example #2
class NLPParser(object):
    """
    NLP parse, including Part-Of-Speech tagging and dependency parse.
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford CoreNLP parser
    """
    def __init__(self):
        self.parser = CoreNLPClient(
            default_annotators=['ssplit', 'tokenize', 'pos', 'depparse'])

    def parse(self, sent):
        """
        Part-Of-Speech tagging and dependency parse.
        :param sent: string
        :return: a list of tuple (word, pos, dependency)
        """
        result = self.parser.annotate(sent)
        tuples = []
        for s in result.sentences:
            word, pos, dep = [], [], []
            for token in s:
                word += [token.word]
                pos += [token.pos]
            edges = s.depparse(mode='enhanced').to_json()
            for e in edges:
                dep.append({
                    'type': e['dep'],
                    'dep': e['dependent'] - 1,
                    'gov': e['governer'] - 1
                })
            tuples.append((word, pos, dep))
        return tuples
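A hedged usage sketch for the class above, assuming a CoreNLP server with the depparse annotator is running and reachable by the client:

parser = NLPParser()
for words, pos_tags, deps in parser.parse('The cat sat on the mat.'):
    print(words)     # token strings of one sentence
    print(pos_tags)  # one POS tag per token
    print(deps)      # dicts: 'type' (relation), 'dep' and 'gov' (0-based token indices)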
Code example #3
class NLPParser(object):
    """
    NLP parse, including Part-Of-Speech tagging and named entity recognition.
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford CoreNLP parser
    """
    def __init__(self):
        self.parser = CoreNLPClient(
            default_annotators=['ssplit', 'tokenize', 'pos', 'ner'],
            server="http://localhost:9000")

        #self.parser = POSTagger(corenlp_dir+'/models/english-bidirectional-distsim.tagger', corenlp_dir+'/stanford-postagger.jar')
    def parse(self, sent):
        result = self.parser.annotate(sent)
        tuples = []
        for sent in result.sentences:
            tokens, pos, ner = [], [], []
            for token in sent:
                tokens += [token.word]
                pos += [token.pos]
                ner += [token.ner]
            tuples.append((tokens, pos, ner))
        return tuples
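As with the previous variant, a short sketch of how parse might be called, assuming the CoreNLP server on localhost:9000 is up:

parser = NLPParser()
for tokens, pos_tags, ner_tags in parser.parse('Barack Obama was born in Hawaii.'):
    print(list(zip(tokens, pos_tags, ner_tags)))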
Code example #4
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
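A hypothetical call, assuming the same module-level client setup as in code example #1:

ann = annotate('Hello World!')
# ann['words'] holds the (lowercased) tokens, ann['gloss'] the original surface forms,
# and ann['after'] the whitespace that followed each token.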
Code example #5
File: cleandata.py Project: JacobLau0513/676-MBTI
def tokenize(df):
    from stanza.nlp.corenlp import CoreNLPClient
    parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize'],
                           server='http://localhost:9000')
    parsed = []
    for item in df['posts']:
        temp = []
        for sentence in item.strip().split('|||'):
            try:
                result = parser.annotate(sentence)
                tokens = []
                for i in range(len(result.sentences)):
                    tokens += result.sentences[i].tokens
                temp.append(' '.join([token.word for token in tokens]))
            except Exception:
                print('error', sentence)
        parsed.append(' <RETURN> '.join(temp))
    df['posts'] = parsed
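A sketch of how the function might be applied, assuming pandas is installed and a CoreNLP server is listening on localhost:9000 (the DataFrame below is illustrative):

import pandas as pd

df = pd.DataFrame({'posts': ['First post.|||Second post!']})
tokenize(df)           # rewrites df['posts'] in place
print(df['posts'][0])  # tokenized posts joined by ' <RETURN> '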
Code example #6
File: annotate.py Project: arunchaganty/aeschines
def do_command(args):
    reader = csv.reader(args.input, delimiter="\t")
    header = next(reader)
    assert all(field in header for field in ("id", "text"))

    Tweet = namedtuple("Tweet", header)
    client = CoreNLPClient()
    annotators = "tokenize ssplit lemma pos".split()

    writer = csv.writer(args.output, delimiter="\t")
    writer.writerow(["id", "tokens", "lemmas", "pos_tags"])

    for tweet in tqdm(Tweet(*row) for row in reader):
        doc = client.annotate(tweet.text, annotators)
        tokens, lemmas, pos_tags = [], [], []
        for sentence in doc:
            tokens += sentence.words
            lemmas += sentence.lemmas
            pos_tags += sentence.pos_tags
        writer.writerow([tweet.id, to_psql_array(tokens), to_psql_array(lemmas), to_psql_array(pos_tags)])
Code example #7
def get_annotate(sentence, lower=True):
    # Returns four parallel lists: tokens, original surface forms, POS tags,
    # and the whitespace following each token.
    # todo: handle [Salmonella spp.] -> ['salmonella', 'spp.', '.']
    global client
    if client is None:
        client = CoreNLPClient(
            server='http://localhost:9000',
            default_annotators=['ssplit', 'tokenize', 'pos'])
    tokenize, origin, pos_tag, after = [], [], [], []
    for s in client.annotate(sentence):
        for t in s:
            word, orig = t.word, t.originalText
            if lower:
                word, orig = word.lower(), orig.lower()
            tokenize.append(word)
            origin.append(orig)
            pos_tag.append(t.pos)
            after.append(t.after)
    return tokenize, origin, pos_tag, after
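A minimal sketch of the four parallel lists returned, assuming the module-level client is initialized to None elsewhere and a CoreNLP server is running on localhost:9000:

tokens, originals, pos_tags, spaces = get_annotate('Salmonella spp. are bacteria.')
print(list(zip(tokens, pos_tags)))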
Code example #8
class NLPParser(object):
	"""
	NLP parse, including named entity recognition (NER).
	Attributes
	==========
	parser: StanfordCoreNLP
		the Stanford CoreNLP parser
	"""
	def __init__(self):
		self.parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'ner'])

	def get_ner(self, tokens):
		sent = ' '.join(tokens)
		result = self.parser.annotate(sent)
		ner = []
		for token in result.sentences[0]:
			ner.append(token.ner)
		return ner
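A usage sketch, assuming the tokens come from a compatible tokenizer so that joining them with spaces round-trips cleanly (the NER labels shown are only indicative):

parser = NLPParser()
tags = parser.get_ner(['Barack', 'Obama', 'visited', 'Paris', '.'])
# e.g. ['PERSON', 'PERSON', 'O', 'LOCATION', 'O'], one tag per token of the first sentence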
Code example #9
class NLPParser(object):
    """
    NLP parse, including named entity recognition (NER).
    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford CoreNLP parser
    """
    def __init__(self):
        self.parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'ner'])

    def parse(self, sent):
        result = self.parser.annotate(sent)
        tokens_list, ner_list = [], []
        for sent in result.sentences:
            tokens, ner = [], []
            currNERType = 'O'
            currNER = ''
            for token in sent:
                token_ner = token.ner
                if token_ner not in INTERESTED_STANFORD_EM_TYPES:
                    token_ner = 'O'
                tokens += [token.word]
                if token_ner == 'O':
                    if currNER != '':
                        ner.append(currNER.strip())
                    currNER = ''
                elif token_ner == currNERType:
                    currNER += token.word + ' '
                else:
                    if currNER != '':
                        ner.append(currNER.strip())
                    currNERType = token_ner
                    currNER = token.word + ' '
            if currNER != '':
                ner.append(currNER.strip())
            if len(tokens) == 0 or len(ner) == 0:
                continue
            tokens_list.append(tokens)
            ner_list.append(ner)
        return tokens_list, ner_list
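A sketch of the grouping behaviour, assuming INTERESTED_STANFORD_EM_TYPES is defined elsewhere in the module (for instance as {'PERSON', 'LOCATION', 'ORGANIZATION'}) and a CoreNLP server is running:

parser = NLPParser()
tokens_list, ner_list = parser.parse('Barack Obama met Angela Merkel in Berlin.')
# ner_list[0] could be ['Barack Obama', 'Angela Merkel', 'Berlin']: adjacent tokens of the
# same entity type are merged into one string.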
Code example #10
                    type=int,
                    default=50,
                    help="Maximum sequence length")
parser.add_argument('-shuffle', type=int, default=1, help="Shuffle data")
parser.add_argument('-seed', type=int, default=3435, help="Random seed")

parser.add_argument('-lower', action='store_true', help='lowercase data')

parser.add_argument('-report_every',
                    type=int,
                    default=100000,
                    help="Report status every this many sentences")

opt = parser.parse_args()

corenlp = CoreNLPClient(default_annotators=['tokenize', 'ssplit'])


def annotate_sentence(corenlp, gloss):
    try:
        parse = corenlp.annotate(gloss)
    except Exception:
        # Retry once after a pause in case the CoreNLP server was temporarily unavailable.
        time.sleep(10)
        parse = corenlp.annotate(gloss)
    token_str = ' '.join([
        token['word'] for sentence in parse.json['sentence']
        for token in sentence['token']
    ])
    #return parse.json['sentence'][0]['token']
    return token_str
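A hedged call sketch; it assumes the corenlp client created above can reach a running server and that time has been imported:

print(annotate_sentence(corenlp, 'Tokenize me, please.'))
# e.g. 'Tokenize me , please .'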
Code example #11
            'text': para_string,
            'fbEMs': entities_fb,
            'dbEMs': entities_db,
            'sentences': sentences
        }
        paragraph_count += 1
        output_file.write(json.dumps(paragraph) + "\n")
    wiki_file.close()
    output_file.close()


''' Takes the path to a directory of folders and files extracted with WikiExtractor
    and the path to an output directory as inputs. Writes the sentences, with the required
    extracted information, as JSON files in the output directory.
    The Wikipedia dump is extracted with the following command (the -l flag preserves links):

    python WikiExtractor.py -l enwiki-20160920-pages-articles-multistream.xml

'''

if __name__ == "__main__":
    input_file_path = sys.argv[1]
    output_file_path = sys.argv[2]

    corenlp_client = CoreNLPClient(
        server='http://localhost:9000',
        default_annotators=['ssplit', 'tokenize', 'lemma', 'pos', 'ner'])

    sparql = SPARQLWrapper.SPARQLWrapper("http://localhost:8890/sparql/")
    process_file(input_file_path, sparql, corenlp_client, output_file_path)
Code example #12
File: tkn.py Project: harshita-gupta/paper5
import unicodedata

#consider using notebooks

from stanza.nlp.corenlp import CoreNLPClient
client = CoreNLPClient(server='http://localhost:9000',
                       default_annotators=['ssplit', 'tokenize'])

marquez = 'marquez/marquez'  # encoded in latin-1
ulysses = 'ulysses/ulysses'  # encoded in utf-8
emma = 'emma/emma'  # encoded in utf-8

with open(marquez + "token", 'w') as writeFile:
    for i in range(1, 21):
        filename = marquez + "%02d" % i
        with open(filename, 'r') as myfile:
            # Python 2: decode the Latin-1 bytes into a unicode string.
            chapter = unicode(myfile.read(), encoding='latin-1')
            chapter = unicodedata.normalize('NFD', chapter)
            chapter = chapter.encode('ascii', 'ignore')

        annotated = client.annotate(chapter)
        for sentence in annotated:
            for token in sentence:
                writeFile.write(token.word.encode('ascii', 'ignore'))
                writeFile.write("\n")
Code example #13
 def __init__(self):
     self.parser = CoreNLPClient(
         default_annotators=['ssplit', 'tokenize', 'ner'],
         server='http://localhost:9001')
Code example #14
                state = {'model': model.state_dict()}
                torch.save(state, os.path.join('.', 'model_best.pt'))

                state = {'model_bert': model_bert.state_dict()}
                torch.save(state, os.path.join('.', 'model_bert_best.pt'))

            print(f" Best Dev lx acc: {acc_lx_t_best} at epoch: {epoch_best}")

    if args.do_infer:
        # To use recent corenlp: https://github.com/stanfordnlp/python-stanford-corenlp
        # 1. pip install stanford-corenlp
        # 2. download the Java version of CoreNLP
        # 3. export CORENLP_HOME=/Users/wonseok/utils/stanford-corenlp-full-2018-10-05

        from stanza.nlp.corenlp import CoreNLPClient
        client = CoreNLPClient(server='http://localhost:9000',
                               default_annotators='ssplit,tokenize'.split(','))

        # import corenlp
        #
        # client = corenlp.CoreNLPClient(annotators='ssplit,tokenize'.split(','))

        nlu1 = "Which company have more than 100 employees?"
        path_db = './data_and_model'
        db_name = 'dev'
        data_table = load_jsonl('./data_and_model/dev.tables.jsonl')
        table_name = 'table_10015132_11'  # change table here depending on the questions.
        n_Q = 100000 if args.infer_loop else 1
        for i in range(n_Q):
            if n_Q > 1:
                nlu1 = input('Type question: ')
            pr_sql_i, pr_ans = infer(nlu1,
Code example #15
 def __init__(self):
     self.parser = CoreNLPClient(
         default_annotators=['ssplit', 'tokenize', 'pos', 'depparse'])
Code example #16
 def _client(cls):
     if cls.__client is None:
         cls.__client = CoreNLPClient(server=config.CORENLP_SERVER, default_annotators=config.CORENLP_ANNOTATORS)
     return cls.__client
Code example #17
from stanza.nlp.corenlp import CoreNLPClient
client = CoreNLPClient(server='http://localhost:9000', default_annotators=['ssplit', 'tokenize', 'lemma', 'pos', 'ner'])
annotated = client.annotate('This is an example document. Here is a second sentence')
for sentence in annotated.sentences:
    print('sentence', sentence)
    for token in sentence:
        print(token.word, token.lemma, token.pos, token.ner)
Code example #18
from typing import Sequence

from stanza.nlp.corenlp import CoreNLPClient


def corenlp_tokenize(s: str) -> Sequence[str]:
    global corenlp_client
    if "corenlp_client" not in globals():
        # Create the client once and cache it at module level.
        corenlp_client = CoreNLPClient(
            default_annotators=["ssplit", "tokenize"])
    return [t.word for sent in corenlp_client.annotate(s).sentences for t in sent]
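A one-line usage sketch, assuming a local CoreNLP server that the default stanza client can reach:

print(corenlp_tokenize('One sentence. And another.'))
# e.g. ['One', 'sentence', '.', 'And', 'another', '.']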