Example #1
def annotate(sentence, lower=True):
    nlp = CoreNLPParser('http://localhost:9000')

    res = nlp.api_call(sentence, properties={'annotators': 'tokenize,ssplit'})

    words, gloss, after = [], [], []

    print(sentence)
    for t in res['sentences']:
        for token in t['tokens']:
            words.append(token['word'])
            gloss.append(token['originalText'])
            after.append(token['after'])
    if lower:
        words = [w.lower() for w in words]
    result = {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
    print(result)
    return result
Example #2
def formulate_question(question_sentence):
    """
    Formulates a Question object from question_sentence
    :param question_sentence: a string of the question sentence
    :return: a Question object representing the dependency structure of the question
    """
    # find the "question word" (see: "5 W's", "WH word") for the question
    q_parsed = next(CoreNLPParser().raw_parse(question_sentence))
    q_word = None
    # try out the normal constructions to find a question
    for subtree in q_parsed.subtrees():
        if subtree.label() in ["SBARQ", "SBAR", "SINV"]:
            for sub_subtree in subtree.subtrees():
                if sub_subtree.label()[0] == "W" and sub_subtree.label()[0:2] != "WH":
                    q_word = (sub_subtree.leaves()[0], sub_subtree.label())
                    break
            break
    # the normal constructions didn't work; just grab the first question word
    if q_word is None:
        for subtree in q_parsed.subtrees():
            if subtree.label()[0] == "W" and subtree.label()[0:2] != "WH":
                q_word = (subtree.leaves()[0], subtree.label())

    return Question(get_dependency_parse(question_sentence), q_word)
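The WH-word lookup used above can be tried in isolation; a minimal sketch, assuming a CoreNLP server on the default port http://localhost:9000:

from nltk.parse.corenlp import CoreNLPParser

# Sketch only: collect constituents whose label starts with "W" but not "WH".
parsed = next(CoreNLPParser().raw_parse("Where did Fred find the cookies?"))
wh_candidates = [(t.leaves()[0], t.label())
                 for t in parsed.subtrees()
                 if t.label()[0] == "W" and t.label()[0:2] != "WH"]
print(wh_candidates)  # e.g. [('Where', 'WRB')]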
Example #3
def start_testing(trained_model_file):
    parser = CoreNLPParser(url='http://localhost:9080')

    emotions = ['happiness', 'sadness', 'anger', 'disgust', 'surprise', 'fear']

    dirname = os.path.dirname(os.path.realpath(__file__)) + "/"

    glove_model = read_glove_vectors(dirname + "Pickle/gloveModel")

    hidden_size = 256
    num_layers = 2
    bidirectional = False
    batchnorm = False
    dropout_hidden = 0.3
    dropout_output = 0.9
    model = LSTM(300, hidden_size, num_layers, bidirectional, batchnorm,
                 dropout_hidden, dropout_output).to(device)

    with torch.no_grad():
        model.load_state_dict(torch.load(trained_model_file))
        print(model)
        model.eval()
        while True:
            test_sentence = input("Give a test sentence: ")
            sentence = list(parser.tokenize(test_sentence))
            input1, sent_length = get_input_vector(glove_model, sentence)
            class_pred = model(input1, sent_length)
            print("Sentence: " + test_sentence)
            _, pred = class_pred.max(dim=1)
            print("Prediction:\t" + emotions[pred[0]])
            print("Output Values:")
            percentages = torch.nn.functional.softmax(class_pred, dim=1) * 100
            for i in range(len(emotions)):
                print(emotions[i] + " %" +
                      str(percentages.data.tolist()[0][i]))
Example #4
def 提華語句法樹(bunji="我 喜歡 豬", url='http://localhost:9000'):
    try:
        句法分析器 = CoreNLPParser(url=url)
    except Warning as 錯誤:
        print('Warning=', 錯誤)

    分析結果指標 = 句法分析器.parse(simplify(bunji).split())
    該句結果字串 = next(分析結果指標)

    return 該句結果字串

    # NOTE: the demo below is unreachable because of the return above; it is
    # kept for reference.
    # Print the parse string
    # (ROOT (IP (NP (PN 我)) (VP (VV 喜欢) (NP (NN 猪)))))
    print('該句結果字串=', 該句結果字串)

    # Print a tree diagram from the string
    # ROOT
    #      |
    #      IP
    #   ___|____
    #  |        VP
    #  |    ____|___
    #  NP  |        NP
    #  |   |        |
    #  PN  VV       NN
    #  |   |        |
    #  我   喜欢       猪
    該句結果字串.pretty_print()

    ##### Extract the original tokens from a tree string
    a = Tree.fromstring("(ROOT (IP (NP (PN 我)) (VP (VV 喜欢) (NP (NN 猪)))))")
    # ['我', '喜欢', '猪']
    print(a.leaves())
    # (ROOT 我 喜欢 猪)
    print(a.flatten())
Example #5
    def run_nlp(self, language):
        # Make sure the server is running properly (as explained in https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK);
        # this might need root privileges.
        # English: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,sentiment -status_port 9000 -port 9000 -timeout 15000
        # The German implementation cannot do sentiment analysis, so its predictions do not bear any relevance;
        # the code is kept like this to make it easier to add some sentiment analysis of the parsed German text in the future.
        # If the service times out, increasing the timeout helps; this usually happens when a sentence is too
        # long to be handled within the given period.
        self.__check_language(language)
        util.time_log("starting NLP...")
        annotator_dict = {"annotators": "sentiment"}
        classifier = CoreNLPParser("http://localhost:9000")

        ret_list = []

        for k_iter in range(0, self.k):
            prediction = []
            for review in self.test_data_text(language, k_iter):
                response_dict = classifier.api_call(review,
                                                    properties=annotator_dict,
                                                    timeout=500)
                count = 0
                sentiment = 0.0
                for sentence in response_dict["sentences"]:
                    count += 1
                    sentiment += float(sentence["sentimentValue"])

                avg_sentiment = sentiment / count
                # a lot better results with >=2
                prediction.append(1 if avg_sentiment >= 2 else 0)
            ret_list.append(prediction)
        return ret_list
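The sentiment call in run_nlp can be exercised on its own; a minimal sketch, assuming the server from the comments above is running on http://localhost:9000 with the sentiment annotator preloaded:

from nltk.parse.corenlp import CoreNLPParser

classifier = CoreNLPParser("http://localhost:9000")
response = classifier.api_call("The movie was wonderful. The ending felt rushed.",
                               properties={"annotators": "sentiment"},
                               timeout=500)
for sentence in response["sentences"]:
    # sentimentValue ranges from 0 (very negative) to 4 (very positive)
    print(sentence["sentiment"], sentence["sentimentValue"])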
Example #6
def convert_eng_to_isl(input_string):

    if len(input_string.split(' ')) == 1:
        return input_string.split(' ')

    # Initializing stanford parser
    parser = CoreNLPParser()

    # Generate all possible parse trees, sorted by probability, for the sentence
    possible_parse_tree_list = list(parser.parse(input_string.split()))

    # Get most probable parse tree
    parse_tree = possible_parse_tree_list[0]
    # print(parse_tree)
    # output = '(ROOT
    #               (S
    #                   (PP (IN As) (NP (DT an) (NN accountant)))
    #                   (NP (PRP I))
    #                   (VP (VBP want) (S (VP (TO to) (VP (VB make) (NP (DT a) (NN payment))))))
    #                )
    #             )'

    # Convert into tree data structure
    parent_tree = ParentedTree.convert(parse_tree)    
    print("\n\nParse Tree:\n")
    print(parent_tree)   

    modified_parse_tree = modify_tree_structure(parent_tree)
    print("\n\nModified Parse Tree:\n")
    print(modified_parse_tree)

    isl_sentence = modified_parse_tree.leaves()
    return isl_sentence
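A hedged usage sketch of convert_eng_to_isl, using the sentence from the comment above; it assumes a CoreNLP server on the default port and the modify_tree_structure helper from the same project:

# Hypothetical call; requires a running CoreNLP server and modify_tree_structure.
isl_tokens = convert_eng_to_isl("As an accountant I want to make a payment")
print(isl_tokens)  # a reordered list of words following ISL grammar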
Example #7
def create_dataset_bin(annotation_file, data_file):
    parser = CoreNLPParser(url='http://localhost:9080')
    dirname = os.path.dirname(os.path.realpath(__file__)) + "/"

    dataset = []

    with open(annotation_file, "r") as file1, open(data_file, "r") as file2:
        for line_from_file_1, line_from_file_2 in zip(file1, file2):
            output = None
            line1 = line_from_file_1.split()
            line2 = line_from_file_2
            if line1[0] == "ne":
                output = 7
            elif line1[0] == "hp":
                output = 0
            elif line1[0] == "sd":
                output = 1
            elif line1[0] == "ag":
                output = 2
            elif line1[0] == "dg":
                output = 3
            elif line1[0] == "sp":
                output = 4
            elif line1[0] == "fr":
                output = 5
            elif line1[0] == "me":
                output = 6
            dataset.append((output, list(parser.tokenize(line2))))
    print(len(dataset))

    with open(dirname + "Pickle/dataset_ready", 'wb') as outfile:
        cPickle.dump(dataset, outfile)
Example #8
    def __init__(self, fo_lang_code):
        # Set up Stanford CoreNLP:
        # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,parse,depparse
        #     -status_port 9000 -port 9000 -timeout 15000 -serverProperties StanfordCoreNLP-chinese.properties
        self.parser = CoreNLPParser()
        self.fo_lang_code = fo_lang_code
        self.preprocessor = Preprocessor()
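A quick way to check that the Chinese-configured server from the comment above is answering; a sketch, assuming it was started with StanfordCoreNLP-chinese.properties on port 9000:

from nltk.parse.corenlp import CoreNLPParser

parser = CoreNLPParser()                        # default url http://localhost:9000
print(list(parser.tokenize('我喜欢猪')))          # e.g. ['我', '喜欢', '猪']
print(next(parser.parse(['我', '喜欢', '猪'])))   # constituency tree for the segmented sentence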
Example #9
    def parse_tree(self, s):
        parser = CoreNLPParser()

        parse = next(parser.raw_parse(s))
        # parse.draw()

        return parse
Example #10
    def test_parse_for_multiple(self):
        sentence = "George and Mary ate dinner."
        parsed = next(CoreNLPParser().raw_parse(sentence))
        self.assertEqual(
            (parse_for(parsed, "NNP")[0].leaves()[0],
             parse_for(parsed, "NNP")[1].leaves()[0]),
            ("George", "Mary"))
        self.assertIsNone(parse_for(parsed, "NOPE"))
Example #11
    def _create_parser(url):
        try:
            parser = CoreNLPParser(url=url)
            parser.raw_parse('This is a test sentence.')
        except Exception:
            parser = None
        return parser
Example #12
def get_bigram_and_deep_syntax_feature(review, speller, stop_words, ps, preprocess):
    res = ""
    productions = []

    parser = CoreNLPParser(url='http://localhost:9500')

    for sentence in re.split(r"[.!?]", review):
        try:
            tree = next(parser.raw_parse(sentence))

            # Optimize by creating Chomsky normal form
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()

        except StopIteration:
            # The parser yielded no tree for this fragment (e.g. the empty
            # string after the final punctuation mark); stop processing.
            break

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)

    count = 0
    for line in str(grammar).split("\n"):
        if count == 0:
            count += 1
            continue
        elif "'" in line:
            res += re.sub(r"[(->) `\'\"\[\d\]]", "", line) + " "

    res += bipos.get_bigrams_and_unigrams_of_sentence(
        bow.sanitize_sentence(review, speller, stop_words, ps, preprocess))

    return res
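The tree-normalisation and PCFG-induction steps can be seen on a single sentence; a minimal sketch, assuming a CoreNLP server on http://localhost:9000 (the function above uses port 9500):

from nltk import Nonterminal, induce_pcfg
from nltk.parse.corenlp import CoreNLPParser

tree = next(CoreNLPParser('http://localhost:9000').raw_parse("The cat sat on the mat."))
tree.collapse_unary(collapsePOS=False)     # remove unary chains
tree.chomsky_normal_form(horzMarkov=2)     # binarise with horizontal Markovisation
grammar = induce_pcfg(Nonterminal('S'), tree.productions())
for production in grammar.productions()[:5]:
    print(production)                      # rules with estimated probabilities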
Example #13
def getNERs(ws):
    from nltk.parse.corenlp import CoreNLPParser
    from textcrafts.corenlp_api import parserURL
    parser = CoreNLPParser(url=parserURL, tagtype='ner')
    ts = parser.tag(ws)
    for t in ts:
        if t[1] != 'O':
            yield t
Example #14
    def __init__(self, sentence):
        config = ApplicationConfig.get_corenlp_config()
        self._parser = CoreNLPParser(url=f"http://{config['host']}:{config['port']}")
        self._dependency = CoreNLPDependencyParser(url=f"http://{config['host']}:{config['port']}")
        sentence = sentence.replace('  ', ' ')
        sentence = sentence.replace('.', '')
        self._load(sentence)
        self.original = sentence
Example #15
def main():
    parser = ArgumentParser()
    parser.add_argument(
        '--data_file',
        default='../../data/facebook-maria/combined_group_data.tsv')
    args = vars(parser.parse_args())

    ## load data
    combined_data = pd.read_csv(args['data_file'], sep='\t', index_col=False)
    # only Spanish
    combined_data = combined_data[combined_data.loc[:, 'status_lang'] == 'es']
    # tmp debugging
    # combined_data = combined_data.head(100)

    ## clean data
    parser = CoreNLPParser(url='http://localhost:9003', tagtype='ner')
    combined_data.loc[:, 'status_message_clean'] = (
        combined_data.loc[:, 'status_message'].apply(lambda x: clean_txt(x, parser)))
    combined_data = combined_data[
        combined_data.loc[:, 'status_message_clean'].apply(lambda x: x != '')]

    ## tag NEs
    combined_data_tags = []
    for i, combined_data_i in combined_data.iterrows():
        x = combined_data_i.loc['status_message_clean']
        try:
            x_tags = parser.tag(parser.tokenize(x))
        except Exception as e:
            print('problem with status %s' % (x))
            print('original status %s' %
                  (combined_data_i.loc['status_message']))
            x_tags = []
        combined_data_tags.append(x_tags)
    combined_data_tags = pd.Series(combined_data_tags,
                                   index=combined_data.index)
    #     combined_data_tags = pd.Series(combined_data.loc[:, 'status_message_clean'].apply(lambda x: parser.tag(parser.tokenize(x))))
    combined_data_tags_ne = combined_data_tags.apply(extract_NEs)
    combined_data.loc[:, 'status_message_tags'] = combined_data_tags
    combined_data.loc[:, 'status_message_tags_ne'] = combined_data_tags_ne
    # sanity check: a float here means the tags are missing (NaN)
    for i, combined_data_i in combined_data.iterrows():
        if (type(combined_data_i.loc['status_message_tags']) is float):
            print('error with data %s' % (combined_data_i))

    ## generate tagged/stemmed statuses
    combined_data.loc[:, 'status_message_ne_tagged_stemmed'] = (
        combined_data.loc[:, 'status_message_tags'].apply(lambda x: process_status_tags(x, STEMMER)))

    ## write to file
    out_file = args['data_file'].replace('.tsv', '_es_tagged.tsv')
    combined_data.to_csv(out_file, sep='\t', index=False)
Example #16
    def convert_sentence_to_ids(self, sentence: Union[str, list]):
        if not self.parser:
            self.parser = CoreNLPParser(url='http://localhost:9000',
                                        tagtype='pos')

        tags = self.convert_sentence_to_tags(sentence)
        ids = self.convert_tags_to_ids(tags)
        print(type(sentence), len(sentence), len(tags), len(ids))
        return list(ids)
Example #17
    def __init__(self, tag_id_initialized=False, tag_id=None, uncased=True):
        self.uncased = uncased
        self.tag_id_initialized = tag_id_initialized
        if tag_id_initialized:
            self.tag_to_id = tag_id
        else:
            self.tag_to_id = {"CLSSEP": 0, "UNKNOWN": 1}
        self.parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        self.basic_tokenizer = BasicTokenizer()
Example #18
    def __init__(self):
        self.synset_example = {}
        self.tokenizer = CoreNLPParser(url='http://localhost:42636')

        self.use_babelnet = use_extended_gloss
        if self.use_babelnet:
            from py4j.java_gateway import JavaGateway
            gateway = JavaGateway()
            self.sense = gateway.entry_point
Example #19
def get_postagger_for_criterion(criterion):
    #ini_path = "/stanford/postagger"
    #os.environ['STANFORD_PARSER'] = ini_path
    #os.environ['STANFORD_MODELS'] = ini_path
    #os.environ['CLASSPATH'] = ini_path
    
    st = CoreNLPParser(url=os.environ['STANFORD_NLP_TOOLS'], tagtype='pos')
    postagger_list = st.tag(criterion)
    return postagger_list
Example #20
def build_vocab(json: str, threshold: int, keeppunctuation: bool, host_address: str, character_level: bool = False, zh: bool = True):
    """Build a vocabulary from a json file, dropping all words with counts < threshold.

    Args:
        json (string): Input json file. Should have a column named 'caption'.
        threshold (int): Threshold to drop all words with counts < threshold.
        keeppunctuation (bool): Includes or excludes punctuation.
        host_address (string): Address of the CoreNLP server used to tokenize Chinese captions.
        character_level (bool): Tokenize captions into single characters instead of words.
        zh (bool): Whether the captions are Chinese.

    Returns:
        vocab (Vocab): Object with the processed vocabulary
    """
    #df = pd.read_csv(csv, sep='\t')
    df = pd.read_json(json)
    counter = Counter()
    
    if zh:
        parser = CoreNLPParser(host_address)
        for i in tqdm(range(len(df)), leave=False):
            caption = str(df.loc[i]['caption'])
            # Remove all punctuations
            if not keeppunctuation:
                caption = re.sub("[{}]".format(punctuation),"",caption)
            if character_level:
                tokens = list(caption)
            else:
                tokens = list(parser.tokenize(caption))
            counter.update(tokens)
    else:
        punctuation = ',.()'
        for i in tqdm(range(len(df)), leave=False):
            caption = str(df.loc[i]['caption'])
            # Remove all punctuations
            if not keeppunctuation:
                caption = re.sub("[{}]".format(punctuation),"",caption)
            if character_level:
                tokens = list(caption)
            else:
                tokens = caption.split()
            counter.update(tokens)

    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
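A hedged usage sketch of build_vocab; the file name and threshold are hypothetical, and the Chinese branch assumes a CoreNLP server reachable at the given address:

# Hypothetical call: 'captions.json' must provide a 'caption' column.
vocab = build_vocab('captions.json', threshold=5, keeppunctuation=False,
                    host_address='http://localhost:9000', zh=True)
# vocab now contains <pad>, <start>, <end>, <unk> plus all words above the threshold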
Example #21
    def __init__(self):
        # Annotator dependencies, see https://stanfordnlp.github.io/CoreNLP/dependencies.html
        self.additional_properties = {
            'tokenize.options':
            'ptb3Escaping=false, unicodeQuotes=true, splitHyphenated=true, normalizeParentheses=false, normalizeOtherBrackets=false',
            'annotators': 'tokenize, ssplit, pos, lemma'
        }
        self.stanford_parser = CoreNLPParser()
        # '-xmx4G' raises the maximum allowable Java heap to 4GB instead of the default 512MB.
        internals.config_java(options='-xmx4G')
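The additional_properties dict above is presumably meant to be handed to the parser's api_call; a sketch under that assumption, using the NLTK method already shown in Examples #1 and #5:

from nltk.parse.corenlp import CoreNLPParser

parser = CoreNLPParser()                     # assumes a server on http://localhost:9000
properties = {
    'tokenize.options': 'ptb3Escaping=false, unicodeQuotes=true',
    'annotators': 'tokenize, ssplit, pos, lemma',
}
response = parser.api_call('The parsers were parsing.', properties=properties)
print([token['lemma'] for token in response['sentences'][0]['tokens']])
# e.g. ['the', 'parser', 'be', 'parse', '.']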
Example #22
    def __init__(self, url='http://localhost:9000', encoding='utf8'):
        """Start the parsers to make sure they're running before calling.

        CoreNLP runs by default on port 9000, but if an external server is used
          or a different port is selected when started, the url will need to be
          explicitly passed.
        """
        self.NERT = CoreNLPNERTagger(url=url)
        self.Parser = CoreNLPParser(url=url, encoding=encoding)
        self.dep_parser = DepParser(url=url)
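A sketch of pointing the parsers at an externally started server on a non-default port, as the docstring describes; the port here is hypothetical and the class names are the NLTK ones imported above:

from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser

url = 'http://localhost:9001'                # an externally started server
parser = CoreNLPParser(url=url, encoding='utf8')
dep_parser = CoreNLPDependencyParser(url=url)
print(next(parser.raw_parse('The cat sat on the mat.')))
print(next(dep_parser.raw_parse('The cat sat on the mat.')).to_conll(4))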
Example #23
def tokenize(Config):
    parser = CoreNLPParser(url='http://localhost:' + Config["servport"],
                           tagtype='pos')
    inPath = Config["home"] + "/" + Config["source_path"]
    outPath = Config["home"] + "/" + Config["target_path"]

    fds = datetime.datetime.now()
    tokenize_data(Config, parser, inPath, outPath)
    fde = datetime.datetime.now()
    print("Tokenization complited in %s" % (get_formatted_date(fds, fde)))
Example #24
    def test_parse_for_q_to_a(self):
        # parse question
        question = "Where did Fred find the cookies?"
        q_parsed = next(CoreNLPParser().raw_parse(question))
        q_named = parse_for(q_parsed, "NNP")
        q_vp = parse_for(q_parsed, "VP")
        q_vb = parse_for(q_vp[0], "VB")
        q_obj = parse_for(q_vp[0], "NP")
        self.assertEqual(1, len(q_named))
        self.assertEqual("Fred", " ".join(q_named[0].leaves()))
        self.assertEqual(1, len(q_vp))
        self.assertEqual("find the cookies", " ".join(q_vp[0].leaves()))
        self.assertEqual(1, len(q_vb))
        self.assertEqual("find", " ".join(q_vb[0].leaves()))
        self.assertEqual(1, len(q_obj))
        self.assertEqual("the cookies", " ".join(q_obj[0].leaves()))

        # parse answer
        answer = "Fred found the cookies in the cupboard."
        a_parsed = next(CoreNLPParser().raw_parse(answer))
        a_named = parse_for(a_parsed, "NNP")
        a_vp = parse_for(a_parsed, "VP")
        a_vbd = parse_for(a_vp[0], "VBD")
        a_nps = parse_for(a_vp[0], "NP")
        a_pp = parse_for(a_vp[0], "PP")
        self.assertEqual(1, len(a_named))
        self.assertEqual("Fred", " ".join(a_named[0].leaves()))
        self.assertEqual(1, len(a_vp))
        self.assertEqual("found the cookies in the cupboard",
                         " ".join(a_vp[0].leaves()))
        self.assertEqual(1, len(a_vbd))
        self.assertEqual("found", " ".join(a_vbd[0].leaves()))
        self.assertEqual(2, len(a_nps))
        self.assertEqual("the cookies", " ".join(a_nps[0].leaves()))
        self.assertEqual("the cupboard", " ".join(a_nps[1].leaves()))
        self.assertEqual(1, len(a_pp))
        self.assertEqual("in the cupboard", " ".join(a_pp[0].leaves()))

        # "find" (verify) answer of question
        self.assertEqual(q_named, a_named)
        # self.assertEqual(q_vbd, a_vbd)     # but past tense, though!
        self.assertEqual(q_obj[0], a_nps[0])
Example #25
    def tokenize(self):
        parser = CoreNLPParser(url='http://localhost:' +
                               self.Config["servport"],
                               tagtype='pos')
        inPath = self.Config["home"] + "/" + self.Config["sourcepath"]
        outPath = self.Config["home"] + "/" + self.Config["targetpath"]

        fds = datetime.datetime.now()
        self.tokenizeData(parser, inPath, outPath)
        fde = datetime.datetime.now()
        print("Tokenization complited in %s" % (showTime(fds, fde)))
Example #26
    def __init__(
        self,
        url: str = 'http://localhost:9000',
        encoding: str = 'utf8',
    ):
        """
        Parameters
        ----------

        see https://www.nltk.org/api/nltk.parse.html#nltk.parse.corenlp.CoreNLPParser
        """
        self._make_parser = lambda: CoreNLPParser(url, encoding, 'pos')
Example #27
def stanford_nlp():
    parser = CoreNLPParser()
    text1 = "There is still a place for mercenaries working for NGOs."
    text2 = "The Rich Poor Gap Silences the Political Voice of the Poor"
    text3 = "Legislation against mercenaries"
    for text in [text1, text2, text3]:
        parse = next(parser.raw_parse(text))
        print(parse)
        has_sent = False
        for item in parse.subtrees():
            if item.label() == "S":
                has_sent = True
        print(has_sent)
Example #28
def tokenize_and_write_to_tokenresult(text, dest):
    #https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK
    url = 'http://localhost:9000'
    dep_parser = CoreNLPDependencyParser(url=url)
    tokens = CoreNLPParser(url)

    res = tokens.tokenize(text)

    for token in res:
        if token == '.':
            dest.write(token.lower() + '\n')
        else:
            dest.write(token.lower() + ' ')
Example #29
def load_spanish_tagger(port='8893'):
    """
    Load the Spanish NER tagger.
    Defaults to the Stanford tagger because no Twitter pre-trained tagger is available.
    Assumes that the server is already running as follows:
    cd /hg190/corpora/StanfordCoreNLP/stanford-corenlp-full-2018-02-27/tmp/stanford-corenlp-full-2018-02-27/
    java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -serverProperties StanfordCoreNLP-spanish.properties -preload tokenize,ssplit,pos,ner,parse -status_port 9003  -port 9003 -timeout 15000

    :returns tagger:: Spanish NER tagger
    """
    parser = CoreNLPParser('http://localhost:9003', tagtype='ner')
    tagger = StanfordTaggerWrapper(parser)
    return tagger
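A quick check of the Spanish NER endpoint the wrapper relies on; a sketch, assuming the server from the docstring is running on port 9003:

from nltk.parse.corenlp import CoreNLPParser

parser = CoreNLPParser('http://localhost:9003', tagtype='ner')
print(list(parser.tag('Juan vive en San Juan'.split())))
# e.g. [('Juan', 'PERSON'), ('vive', 'O'), ('en', 'O'), ('San', 'LOCATION'), ('Juan', 'LOCATION')]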
Example #30
    def __init__(
        self,
        url: str = 'http://localhost:9000',
        encoding: str = 'utf-8',
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
    ):
        self._parser = CoreNLPParser(url, encoding, 'pos')

        self._start_tokens = start_tokens or []
        # We reverse the tokens here because we're going to insert them with `insert(0)` later;
        # this makes sure they show up in the right order.
        self._start_tokens.reverse()
        self._end_tokens = end_tokens or []
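The comment about reversing the start tokens can be illustrated in isolation; a short sketch of why reversing before repeated insert(0) keeps the original order:

tokens = ['hello', 'world']
start_tokens = ['<s1>', '<s2>']
start_tokens.reverse()          # ['<s2>', '<s1>']
for t in start_tokens:
    tokens.insert(0, t)         # '<s2>' goes in first, then '<s1>' is inserted in front of it
print(tokens)                   # ['<s1>', '<s2>', 'hello', 'world']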