Example 1
class NLTK_NLP():

    def __init__(self, ip_port):
        self.dep_parser = CoreNLPDependencyParser(url=ip_port)
        self.ner_parser = CoreNLPParser(url=ip_port, tagtype='ner')
        self.parser = CoreNLPParser(url=ip_port)
        self.pos_tagger = CoreNLPParser(url=ip_port, tagtype='pos')

    def generate_dependency_tree(self, sentence):
        '''what is the name of the asteroid ?'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return dependency_tree

    def generate_dependency_graph(self, sentence):
        '''12 {'address': 12, 'word': '.', 'lemma': '.', 'ctag': '.', 'tag': '.', 'feats': '', 'head': 1, 'deps': defaultdict(<class 'list'>, {}), 'rel': 'punct'}
        7-tuple, where the values are ``word, lemma, ctag, tag, feats, head, rel``.'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return DependencyGraph(dependency_tree.to_conll(10))

    def generate_constituency_tree(self, sentence):
        '''input: one question'''
        tree_list = list(self.parser.raw_parse(sentence=sentence))
        return tree_list[0]

    def get_pos(self, sentence):
        '''What is the airspeed of an unladen swallow ?
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
        '''
        pos_list = list(self.pos_tagger.tag(sentence.split()))
        # tokens = nltk.word_tokenize(sentence)
        # wordpos = nltk.pos_tag(tokens)
        return pos_list

    def get_pos_by_tokens(self, tokens):
        '''What is the airspeed of an unladen swallow ?'''
        pos_list = list(self.pos_tagger.tag(tokens))
        return pos_list

    def get_ner(self, sentence):
        # tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
        '''april the 26th, 1882 is the birth date of which athletes ?
        [('april', 'DATE'), ('the', 'DATE'), ('26th', 'DATE'), (',', 'DATE'), ('1882', 'DATE'),
        ('is', 'O'), ('the', 'O'), ('birth', 'O'), ('date', 'O'), ('of', 'O'), ('which', 'O'),
        ('athletes', 'O'), ('?', 'O')]'''
        sequence_ner_tuple_list = self.ner_parser.tag(sentence.split())
        return [ner_tag for _word, ner_tag in sequence_ner_tuple_list]

    def get_toknizer(self, sentence):
        return list(self.parser.tokenize(sentence))

    def find_phrases(self, tree, phrase_tag='NP'):
        return [subtree.leaves() for subtree in tree.subtrees(lambda t: t.label()==phrase_tag)]
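A minimal usage sketch for the class above, assuming the snippet's omitted imports (CoreNLPParser, CoreNLPDependencyParser and DependencyGraph from nltk.parse) and a Stanford CoreNLP server already listening on http://localhost:9000; the sample sentence is only illustrative.

nlp = NLTK_NLP('http://localhost:9000')
sentence = 'What is the airspeed of an unladen swallow ?'

tree = nlp.generate_constituency_tree(sentence)   # nltk.Tree for the question
print(nlp.get_pos(sentence))                      # [('What', 'WP'), ('is', 'VBZ'), ...]
print(nlp.get_ner(sentence))                      # one NER label per token, 'O' when none
print(nlp.find_phrases(tree, phrase_tag='NP'))    # leaves of every NP subtree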
Example 2
class CNLP:
    CNLPServerURL = 'http://localhost:9000'

    def __init__(self):
        self.parser = CoreNLPParser(url=self.CNLPServerURL)
        self.dep_parser = CoreNLPDependencyParser(url=self.CNLPServerURL)
        self.ner_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='ner')
        self.pos_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='pos')

    def getParse(self, sentence):
        if isinstance(sentence, list):
            return self.parser.parse(sentence)
        else:
            return self.parser.raw_parse(sentence)

    def getDepParse(self, sentence):
        if isinstance(sentence, list):
            return self.dep_parser.parse(sentence)
        else:
            return self.dep_parser.raw_parse(sentence)

    def getNERTags(self, sentence):
        if not isinstance(sentence, list):
            sentence = sentence.split()
        return self.ner_tagger.tag(sentence)

    def getPOSTags(self, sentence):
        # Use the POS tagger's tag() method so the result is a list of
        # (token, POS) pairs rather than a constituency parse tree.
        if not isinstance(sentence, list):
            sentence = sentence.split()
        return self.pos_tagger.tag(sentence)
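A small usage sketch for CNLP, assuming a CoreNLP server is reachable at CNLPServerURL; the sentences are illustrative.

cnlp = CNLP()
print(list(cnlp.getNERTags('Rami Eid is studying at Stony Brook University in NY')))
dep_graph = next(cnlp.getDepParse('Which album by Swift was released in 2012?'))
print(dep_graph.to_conll(4))   # one word, tag, head, relation per line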
Example 3
def getNERs(ws):
    from nltk.parse.corenlp import CoreNLPParser
    from textcrafts.corenlp_api import parserURL
    parser = CoreNLPParser(url=parserURL, tagtype='ner')
    ts = parser.tag(ws)
    for t in ts:
        if t[1] != 'O':
            yield t
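getNERs expects a pre-tokenized word list and yields only the (word, tag) pairs whose NER tag is not 'O'. A hedged usage sketch (the token list is illustrative, and textcrafts.corenlp_api.parserURL must point at a running CoreNLP server):

ws = 'Rami Eid is studying at Stony Brook University in NY'.split()
for word, tag in getNERs(ws):
    print(word, tag)   # only tokens recognised as named entities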
Example 4
def get_postagger_for_criterion(criterion):
    #ini_path = "/stanford/postagger"
    #os.environ['STANFORD_PARSER'] = ini_path
    #os.environ['STANFORD_MODELS'] = ini_path
    #os.environ['CLASSPATH'] = ini_path
    
    # STANFORD_NLP_TOOLS is assumed to hold the CoreNLP server URL
    # (e.g. http://localhost:9000); criterion should be a token list.
    st = CoreNLPParser(url=os.environ['STANFORD_NLP_TOOLS'], tagtype='pos')
    postagger_list = st.tag(criterion)
    return postagger_list
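A hedged sketch of calling the helper above: the server address and the criterion are illustrative, and the criterion is passed as a token list so that whole words (not characters) are tagged.

import os

os.environ['STANFORD_NLP_TOOLS'] = 'http://localhost:9000'    # assumed server address
criterion = ['patients', 'older', 'than', '65', 'years']      # illustrative criterion
print(list(get_postagger_for_criterion(criterion)))           # [(token, POS tag), ...]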
Example 5
class Lex_parser:
    def __init__(self, tag_id_initialized=False, tag_id=None, uncased=True):
        self.uncased = uncased
        self.tag_id_initialized = tag_id_initialized
        if tag_id_initialized:
            self.tag_to_id = tag_id
        else:
            self.tag_to_id = {"CLSSEP": 0, "UNKNOWN": 1}
        self.parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        self.basic_tokenizer = BasicTokenizer()

    def tokenize(self, sentence):
        return list(self.parser.tokenize(sentence))

    def convert_sentence_to_tags(self, sentence: Union[str, list]):
        if not isinstance(sentence, str):
            sentence = " ".join(sentence)
        if self.uncased:
            sentence = sentence.lower()

        sentence = self.basic_tokenizer.tokenize(sentence)

        # print("sentence here,", sentence)
        sentence = list(map(lambda x: x.upper() if x == 'i' else x, sentence))
        tags = self.parser.tag(sentence)
        # print("sentence here,", sentence)
        # print("tags here", tags)
        # exit(-2)
        if not self.tag_id_initialized:
            for tag in tags:
                if tag[1] not in self.tag_to_id:
                    self.tag_to_id[tag[1]] = len(self.tag_to_id)
        return tags

    def convert_tags_to_ids(self, tags):
        res = list(map(lambda x: self.tag_to_id[x[1]], tags))
        # print("to ids ==")
        # print(len(tags), tags)
        # print(len(res), res)
        return res

    def convert_sentence_to_ids(self, sentence: Union[str, list]):
        if not self.parser:
            self.parser = CoreNLPParser(url='http://localhost:9000',
                                        tagtype='pos')

        tags = self.convert_sentence_to_tags(sentence)
        ids = self.convert_tags_to_ids(tags)
        print(type(sentence), len(sentence), len(tags), len(ids))
        return list(ids)
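A minimal sketch of how Lex_parser might be used, assuming a CoreNLP server at http://localhost:9000 and whichever BasicTokenizer implementation the surrounding project imports (e.g. a BERT-style basic tokenizer):

lex = Lex_parser()
tags = lex.convert_sentence_to_tags('Where was Barack Obama born ?')
ids = lex.convert_tags_to_ids(tags)
print(tags)   # [(token, POS tag), ...]
print(ids)    # POS tags mapped to integer ids assigned on the fly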
Example 6
def get_entity_of_sentence(sentence):
    ner_tagger = CoreNLPParser(url='http://0.0.0.0:9000', tagtype='ner')
    entity_list = ner_tagger.tag(sentence.split())
    return entity_list
class TrueFalseQuestions:

    def __init__(self, filename, port):
        self.filename = filename
        self.port = port
        self.prepare_similars()

    def prepare_similars(self):
        with open(self.filename, 'r') as f:
            text = f.read()

        self.parser = CoreNLPParser('http://localhost:' + str(self.port), tagtype='ner')
        tokens = text.split(' ')
        ner_tagged = self.parser.tag(tokens)

        # Group consecutive tokens that share a NER tag into multi-word
        # entity strings, keyed by tag in self.all_ner_tags.
        self.all_ner_tags = {}
        last = 'O'
        sent = ''
        for w, tag in ner_tagged:
            if tag == 'O':
                # An entity just ended: store the accumulated surface form.
                if last != 'O' and len(sent) > 0:
                    if last in self.all_ner_tags.keys():
                        self.all_ner_tags[last].append(sent)
                    else:
                        self.all_ner_tags[last] = [sent]
                sent = ''
                continue
            if tag == last:
                # Same tag as the previous token: extend the current entity.
                if len(sent) > 0:
                    sent += ' ' + w
                else:
                    sent += w
            else:
                # Tag changed: store the previous entity and start a new one.
                if last != 'O' and len(sent) > 0:
                    if last in self.all_ner_tags.keys():
                        self.all_ner_tags[last].append(sent)
                    else:
                        self.all_ner_tags[last] = [sent]
                sent = w
            last = tag

        # Deduplicate the surface forms collected for each tag.
        for key, li in self.all_ner_tags.items():
            self.all_ner_tags[key] = list(set(li))

    def get_false_sentence(self, sentence):
        # Re-tag the sentence and group consecutive tokens into (entity, tag) pairs.
        ner_tag = self.parser.tag(sentence.split(' '))
        last = 'O'
        tagged_sentence = []
        sent = ''
        index = 0
        for w, tag in ner_tag:
            index += 1
            if tag == last:
                if sent:
                    sent += ' ' + w
                else:
                    sent += w
            else:
                if last != 'O':
                    tagged_sentence.append((sent, last))
                sent = w
            # Flush an entity that ends the sentence.
            if index == len(ner_tag) and tag != 'O':
                tagged_sentence.append((sent, tag))

            last = tag

        # For every entity in the sentence, substitute a different entity
        # of the same NER tag collected from the source text.
        candidate_wrong = []
        for gap, tag in tagged_sentence:
            if tag not in self.all_ner_tags.keys():
                continue
            length = len(self.all_ner_tags[tag])
            if length < 2:
                continue
            while True:
                rep = self.all_ner_tags[tag][randint(0, length - 1)]
                if rep != gap:
                    break
            candidate_wrong.append(sentence.replace(gap, rep))

        return candidate_wrong
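A hedged usage sketch for TrueFalseQuestions: the file name, port and sentence are placeholders, and the class additionally needs CoreNLPParser and random.randint imported at module level. get_false_sentence only substitutes entities whose NER tag also appears elsewhere in the source text.

tfq = TrueFalseQuestions('source_text.txt', 9000)   # hypothetical input file
for wrong in tfq.get_false_sentence('Barack Obama was born in Hawaii in 1961 .'):
    print(wrong)   # the original sentence with one entity swapped for another of the same tag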
Example 8
def extract_title_keywords():

    t1 = time.time()

    titles = open('titles_raw.txt', "r")

    # global ne_key
    global ne_value
    global feature_cols
    global stem_dict

    # ne_key = []
    # ne_value = ['LOCATION', 'PERSON', 'ORGANIZATION', 'MISC']

    prettt = [[], []]

    st = StanfordNERTagger(
        '/Users/yixuancui/Downloads/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/Users/yixuancui/Downloads/stanford-ner-2018-10-16/stanford-ner.jar',
        encoding='utf-8')

    titles_c_list = []
    for title in titles:
        titles_c_list.append(title)
    print("Length of titles_a_list: ", len(titles_c_list))

    label_list = []
    counter = 0

    titles_d_list = []
    titles_ner1_list = []
    titles_ner2_list = []
    words_in_between = []
    v_list = []
    n_list = []
    r_list = []
    j_list = []
    d_list = []
    i_list = []
    stem_list = []

    for c in titles_c_list:
        text = c
        if not text:
            continue

        print(counter)
        # label = input(c)
        # label_list.append(int(label))
        label_list = [1] * 50 + [2] * 50 + [3] * 50 + [4] * 50 + [5] * 50 + [6] * 50

        v_count = 0
        n_count = 0
        r_count = 0
        j_count = 0
        d_count = 0
        i_count = 0
        stem_check = 0
        # TODO: the NE indices are extracted here; next, all nouns and verbs before index 1,
        # between 1 and 2, and after 2 could be extracted, with POS tags; how to use the stemmer is still open.
        tokenized_text = word_tokenize(text)
        classified_text = st.tag(tokenized_text)
        pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        pos = list(pos_tagger.tag(tokenized_text))
        ss = SnowballStemmer("english")

        index = []
        for i in range(len(classified_text)):
            word = classified_text[i][0]
            tag = classified_text[i][1]
            if tag != 'O':
                # print('NE: ', word, ', Tag: ', tag, ', Index: ', i)
                # if word.lower() not in ne_key:
                #     ne_key.append(word.lower())
                index.append(i)
        pre_oh_classified_text_0 = [
            classified_text[i][0].lower() for i in index
        ]
        pre_oh_classified_text_1 = [classified_text[i][1] for i in index]
        if not pre_oh_classified_text_0:
            pre_oh_classified_text_0 = ['None']
        if not pre_oh_classified_text_1:
            pre_oh_classified_text_1 = ['None']
        prettt[0].append(pre_oh_classified_text_0)
        prettt[1].append(pre_oh_classified_text_1)
        counter += 1

        # for i in range(len(classified_text)):
        #     if ss.stem(word) in ['acquir', 'buy', 'purchas', 'acquisit']:
        #         stem_check = 1
        #         break
        for i in range(len(classified_text)):
            if pos[i][1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
                v_count += 1
                word_stem = ss.stem(pos[i][0].lower())
                stem_list.append(word_stem)
                if word_stem not in stem_dict:
                    stem_dict.append(word_stem)
            if pos[i][1] in ['NN', 'NNS', 'NNP', 'NNPS']:
                n_count += 1
                # word_stem = ss.stem(pos[i][0].lower())
                # stem_list.append(word_stem)
                # if word_stem not in stem_dict:
                #     stem_dict.append(word_stem)
            if pos[i][1] in ['RB', 'RBR', 'RBS', 'RP']:
                r_count += 1
            if pos[i][1] in ['JJ', 'JJR', 'JJS']:
                j_count += 1
            if pos[i][1] in ['DT']:
                d_count += 1
            if pos[i][1] in ['IN']:
                i_count += 1
        v_list.append(v_count)
        n_list.append(n_count)
        r_list.append(r_count)
        j_list.append(j_count)
        d_list.append(d_count)
        i_list.append(i_count)
        # stem_list.append(stem_check)

    # enc1 = OneHotEncoder(handle_unknown='ignore')
    # enc1.fit(array(ne_key).reshape(-1, 1))
    enc2 = OneHotEncoder(handle_unknown='ignore')
    enc2.fit(array(ne_value).reshape(-1, 1))
    enc3 = OneHotEncoder(handle_unknown='ignore')
    enc3.fit(array(stem_dict).reshape(-1, 1))

    onehot_list = []

    for j in range(len(titles_c_list)):
        # onehot_ner1 = enc1.transform(array(prettt[0][j]).reshape(-1, 1)).toarray().tolist()
        # nekeys = [sum(i) for i in zip(*onehot_ner1)]
        onehot_ner2 = enc2.transform(array(prettt[1][j]).reshape(
            -1, 1)).toarray().tolist()
        nevalues = [sum(i) for i in zip(*onehot_ner2)]
        onehot_ner3 = enc3.transform(array(stem_list[j]).reshape(
            -1, 1)).toarray().tolist()
        stemlists = [sum(i) for i in zip(*onehot_ner3)]
        # join = nekeys + nevalues + stemlists
        join = nevalues + stemlists
        onehot_list.append(join)

    # header_list = ne_key + ne_value + ["v_list", "n_list", "r_list", "j_list", "d_list", "i_list", "stem_list", "label"]
    # feature_cols = ne_key + ne_value + ["v_list", "n_list", "r_list", "j_list", "d_list", "i_list", "stem_list"]
    header_list = ne_value + stem_dict + [
        "v_list", "n_list", "r_list", "j_list", "d_list", "i_list", "label"
    ]
    feature_cols = ne_value + stem_dict + [
        "v_list", "n_list", "r_list", "j_list", "d_list", "i_list"
    ]

    ll = [
        onehot_list[i] + [v_list[i]] + [n_list[i]] + [r_list[i]] +
        [j_list[i]] + [d_list[i]] + [i_list[i]] + [label_list[i]]
        for i in range(len(onehot_list))
    ]
    # ll = [onehot_list[i] + [v_list[i]] + [n_list[i]] + [r_list[i]] + [j_list[i]]
    #       + [d_list[i]] + [i_list[i]] + [stem_list[i]] + [label_list[i]] for i in range(len(onehot_list))]
    # ll = [onehot_list[i] + [v_list[i]] + [n_list[i]] + [r_list[i]] + [j_list[i]]
    #       + [d_list[i]] + [i_list[i]] + [stem_list[i]] for i in range(len(onehot_list))]

    with open('DT2.csv', mode='w') as DT:
        dt_writer = csv.writer(DT,
                               delimiter=',',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        dt_writer.writerow(header_list)
        dt_writer.writerows(ll)

    with open('DT2.csv') as DT:
        dt_writer = csv.reader(DT, delimiter=',')
        counter = 0
        for row in dt_writer:
            counter += 1
            # print(row)
        print(counter)

    del titles_c_list
    del titles_d_list
    # del onehot_ner1
    del onehot_ner2

    # with open('ne_key.data', 'wb') as filehandle:
    #     # store the data as binary data stream
    #     pickle.dump(ne_key, filehandle)

    t2 = time.time()
    print(t2 - t1)
    # print(ne_key)  # ne_key is commented out above, so printing it would raise NameError
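The core feature step above one-hot encodes the NER labels found in each title and sums the rows into one count vector per title. A standalone sketch of that idea with scikit-learn, using the label set from the ne_value comment in the function:

from numpy import array
from sklearn.preprocessing import OneHotEncoder

ne_value = ['LOCATION', 'PERSON', 'ORGANIZATION', 'MISC']
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(array(ne_value).reshape(-1, 1))

title_tags = ['PERSON', 'PERSON', 'ORGANIZATION']   # NER tags found in one title
rows = enc.transform(array(title_tags).reshape(-1, 1)).toarray().tolist()
counts = [sum(col) for col in zip(*rows)]           # one count per known label
print(counts)   # [0.0, 0.0, 1.0, 2.0] -- categories are sorted: LOCATION, MISC, ORGANIZATION, PERSON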
Example 9
    if person != "":
        #wh_query=person
        if 'actor' in from_clause:
            wh_query += " and p.id=a.actor_id"
        elif 'director' in from_clause:
            wh_query += " and p.id=d.director_id"

    if 'oscar' in from_clause and 'movie' in from_clause:
        wh_query += " and m.id=o.movie_id "

    if 'oscar' in from_clause and 'person' in from_clause:
        wh_query += " o.person_id=p.id  and " + osc

    if 'movie' in from_clause and 'actor' in from_clause:
        wh_query += " a.movie_id=m.id"

    query = select_clause + " from " + from_clause[1:] + " where " + wh_query
    print(query)
    return query


qstn = "Which album by Swift was released in 2012?"
parser = CoreNLPDependencyParser()

depparse = next(parser.raw_parse(qstn))
for l in list(depparse.triples()):
    print(l)
ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
ner_tag = ner_tagger.tag(qstn.split())
generate_query(depparse, ner_tag, 'S')
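generate_query itself is only excerpted above; the lines below (continuing the same snippet, with the CoreNLP server still assumed at http://localhost:9000) only illustrate the shape of the data it receives.

# Each dependency triple is ((governor, its POS), relation, (dependent, its POS)).
for (gov, gov_pos), rel, (dep, dep_pos) in depparse.triples():
    print(gov, rel, dep)

# ner_tag pairs every surface token with an NER label; 'O' means no entity.
for word, tag in ner_tag:
    if tag != 'O':
        print(word, tag)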