def tag_tokens_using_stanford_corenlp(token_list, corenlp_server_address='http://localhost:9000'):
    # print("tag_tokens_using_stanford_corenlp started")
    tagger = CoreNLPPOSTagger(url=corenlp_server_address)
    # The code below exists to deal with a limitation of Stanford's CoreNLP server, which only
    # supports 100000 characters per call. It breaks the text into smaller pieces, sends them to
    # the server, and then joins the results back into one list of tagged words ('tagged_text').
    tagged_text = []
    txt_size = len(token_list)
    i = 0
    while i < txt_size:
        if i + 6000 >= txt_size:
            tokens_to_tag = token_list[i:txt_size]
            i = txt_size
        else:
            tokens_to_tag = token_list[i:i + 6000]
            i += 6000
        tagged_text += tagger.tag(tokens_to_tag)
    # print("tag_tokens_using_stanford_corenlp ended")
    return tagged_text
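
# A minimal usage sketch for the chunked tagger above. It assumes `CoreNLPPOSTagger` from
# `nltk.tag.stanford` is importable and that a CoreNLP server was started separately, e.g.:
#   java -cp "stanford-corenlp/*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
# The sample sentence is only illustrative.
from nltk.tag.stanford import CoreNLPPOSTagger  # import assumed by the function above

tokens = "The quick brown fox jumps over the lazy dog .".split()
tagged = tag_tokens_using_stanford_corenlp(tokens)
print(tagged)  # e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ...]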
def __init__(self, config_path=DEFAULT_CONFIG_PATH):
    self.config = load_config(config_path)
    corenlp_config = self.config["data"]["stanford_corenlp"]
    self.tagger = CoreNLPPOSTagger(
        url="http://%s:%d" % (corenlp_config["host"], corenlp_config["port"]))
    self.pos_map = self.config["model"]["STANFORD_POS_MAP"]
def get_pos_tag(techs, words):
    """ Get POS tag of words. ([str], [str]) -> ([str], [str]) """
    tags = []
    flag = False
    tagged_words = CoreNLPPOSTagger(url='http://localhost:9000').tag(words)
    if len(words) != len(tagged_words):
        tagged_words = pos_tag(words)
    words = []
    for (word, tag) in tagged_words:
        if flag:
            word = "." + word
            flag = False
        if tag == "IN" and word in cin:
            tags.append("CIN")
        elif word in cv:
            tags.append("CV")
        elif word in techs:
            tags.append("TECH")
        elif word == ".":
            flag = True
            continue
        elif tag[:2] == "VB":
            tags.append("VB")
        else:
            tags.append(tag)
        words.append(word)
    return (words, tags)
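
# A hedged illustration of how `get_pos_tag` could be called. `cin`, `cv`, and the `pos_tag`
# fallback are defined elsewhere in the project, so the placeholder values below are assumptions,
# not the project's real word lists.
from nltk import pos_tag  # fallback tagger used when CoreNLP's token count differs

cin = {"than", "over", "versus"}        # placeholder comparative prepositions
cv = {"beats", "outperforms", "prefer"}  # placeholder comparative verbs

words, tags = get_pos_tag(["python", "java"], "python is faster than java".split())
print(list(zip(words, tags)))
# e.g. [('python', 'TECH'), ('is', 'VB'), ('faster', 'JJR'), ('than', 'CIN'), ('java', 'TECH')]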
def count(file_name):
    with open(os.path.join(os.path.pardir, "out", "tech_v6", file_name)) as data3_file:
        num = 0
        for line in data3_file:
            if num % 4 == 2:
                words = line.split(" ")
                words[-1] = words[-1].strip()
                for (word, tag) in CoreNLPPOSTagger(
                        url='http://localhost:9000').tag(words):
                    if word not in stopwords_en and word not in modal_verbs and word not in synonyms:
                        if word in wf:
                            wf[word] += 1
                        else:
                            wf[word] = 1
                        if tag[:2] == "JJ":
                            if word in jj:
                                jj[word] += 1
                            else:
                                jj[word] = 1
                        elif tag[:2] == "NN":
                            if word in nn:
                                nn[word] += 1
                            else:
                                nn[word] = 1
                        elif tag[:2] == "RB":
                            if word in rb:
                                rb[word] += 1
                            else:
                                rb[word] = 1
            num += 1
def classify(no):
    num = 0
    compa_sent_count = 0
    current_id = 0
    try:
        nlp = spacy.load('en')
        matcher = Matcher(nlp.vocab)
        add_patterns(matcher)
        # with io.open(os.path.join(os.pardir, "out", "tech_v5", "{}.txt".format(no)), "r", encoding="utf-8") as data_file:
        with open(os.path.join(os.pardir, "out", "tech_v6", "{}.txt".format(no))) as data_file:
            compa_sent_count = 0
            for line in data_file:
                if num % 4 == 0:
                    current_id = line
                elif num % 4 == 1:
                    tech_pair = line.split("\t")
                    tech_pair[-1] = tech_pair[-1].strip()
                elif num % 4 == 2:
                    tag_list = []
                    # for token in doc:
                    #     tag = token.tag_
                    #     word = token.text
                    # print(line)
                    flag = False
                    for (word, tag) in CoreNLPPOSTagger(url='http://localhost:9000').tag(line.split(" ")):
                        # for (word, tag) in nltk.pos_tag(line.split(" ")):
                        if flag:
                            word = "." + word
                            flag = False
                        if tag == "IN" and word in cin:
                            tag_list.append("CIN")
                        elif word in cv:
                            tag_list.append("CV")
                        elif word in tech_pair:
                            tag_list.append("TECH")
                        elif word == ".":
                            flag = True
                        else:
                            tag_list.append(tag)
                    pos_tag = " ".join(tag_list)
                    patterns = matcher(nlp(pos_tag))
                    if patterns != []:
                        compa_sent_count += 1
                        data_file = open(os.path.join(os.pardir, "out", "tech_v6", "sentences_1.txt"), "a")
                        data_file.write("{}".format(current_id))
                        data_file.write("{}\n".format("\t".join(tech_pair)))
                        for pattern in patterns:
                            data_file.write("pattern" + str(pattern[0]) + "\t")
                        data_file.write(str("\n{}\n".format(line)))
                        data_file.close()
                num += 1
    finally:
        print("Proc {}: {}/{} from - to {}".format(os.getpid(), compa_sent_count, num / 4, current_id))
def create_parse_trees(sentences):
    """
    Create a parse tree for each sentence in the sentences list and return all trees in a list.

    Takes a sentence as a string; before parsing, it will be automatically tokenized and tagged
    by the CoreNLP Parser.

    :param sentences: Input sentences for parsing.
    :type sentences: list(str)
    :return: list(Tree)
    """
    # Create Stanford parser.
    stanford_parser = CoreNLPPOSTagger()
    # Create a list to store all sentences' parse trees.
    parsed_sentences_trees = []
    # Create parse trees and store them in the list.
    for sentence in sentences:
        for line in stanford_parser.raw_parse(sentence):
            temp_tree = Tree.fromstring(str(line))
            parsed_sentences_trees.append(temp_tree)
    return parsed_sentences_trees
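
# A brief usage sketch for `create_parse_trees`, assuming `Tree` is `nltk.tree.Tree` and that the
# tagger's default CoreNLP server URL is reachable; the sentence is illustrative.
from nltk.tree import Tree  # the Tree class used above

trees = create_parse_trees(["The cat sat on the mat."])
for t in trees:
    t.pretty_print()  # render the constituency tree as ASCII art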
def __init__(self, src_folder="../data/", freq_thre=100, corenlp_path="../stanford-corenlp/",
             ner_path="../stanford-ner/", verbose=False):
    # initialization
    self.src = os.path.join(src_folder, "reviews/")
    self.corenlp_path = os.path.normpath(corenlp_path) + "/"
    self.stanford_ner_path = os.path.normpath(ner_path) + "/"
    self.frequency_threshold = freq_thre
    self.dst = os.path.join(src_folder, "lexicon/candidates.json")
    self.dst_allReviews = os.path.join(src_folder, "allReviews/")
    self.dst_ner_tsv = os.path.join(src_folder, "ner_tsv/")
    self.dst_ne = os.path.join(src_folder, "ne/")
    self.verbose = verbose
    # pick up sentiment words
    self.pos_tags = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS"]
    self.pos_tagged_statistics = {}
    # it is based on CoreNLP, a new version of stanford pos tagger
    self.pos_tagger = CoreNLPPOSTagger()
    self.stemmer = SnowballStemmer("english")
    self.stopwords = set(stopwords.words("english"))
    # remove `not` because we need combine `not` and sentiment words
    self.stopwords.remove("not")
def modify(self):
    url = "http://localhost:9000/tregex"
    request_params = {"pattern": " SBAR|VP|NP=app $, /,/ "}
    # text = "Mexico City, the biggest city in the world, has many interesting archaeological sites."
    text = self.text
    # print(text)
    r = requests.post(url, data=text, params=request_params)
    json_data = json.loads(r.text)
    text1 = json_data['sentences'][0]['0']['match']
    tree = nltk.Tree.fromstring(text1, read_leaf=lambda x: x.split("/")[0])
    line = tree.leaves()
    appos = ''
    begin_text = ''
    for tag in line:
        appos = appos + tag + ' '
    text = text.replace(',', '')
    result = text.index(appos)
    text = text.replace(appos, '')
    for x in range(0, result):
        begin_text = begin_text + text[x]
    doc = nlp(begin_text)
    for ent in doc.ents:
        sub_ent = ent.label_
        if sub_ent == 'GPE' or sub_ent == 'LOC':
            text = text.replace(begin_text, '')
    text1 = CoreNLPPOSTagger(url='http://localhost:9000').tag(text.split())
    for tagg in text1:
        # line = 'She ate the fruits.'
        if tagg[1] == "VBD":
            tense = "was"
        # line = 'We eat the fruits.'
        if tagg[1] == "VBP":
            tense = "is"
        # line = 'She eats the fruits.'
        if tagg[1] == "VBZ":
            tense = "is"
    qts = "Which/Where"
    qts = qts + ' ' + tense + ' ' + appos + '?'
    return qts
class StanfordPOSAnnotator(Annotator):

    def __init__(self, config_path=DEFAULT_CONFIG_PATH):
        self.config = load_config(config_path)
        corenlp_config = self.config["data"]["stanford_corenlp"]
        self.tagger = CoreNLPPOSTagger(
            url="http://%s:%d" % (corenlp_config["host"], corenlp_config["port"]))
        self.pos_map = self.config["model"]["STANFORD_POS_MAP"]

    def annotate(self, annotable):
        if (annotable.__class__.__name__ == "Document"):
            return self.annotate_document(annotable)
        elif (annotable.__class__.__name__ == "Sentence"):
            return self.annotate_sentence(annotable)
        else:
            raise AnnotationError(
                "This annotator only accepts Document or Sentence annotables.")

    def annotate_document(self, document):
        for sentence in document.sentences:
            self.annotate_sentence(sentence)

    def annotate_sentence(self, sentence):
        token_list = [token.surface for token in sentence.tokens]
        tagged_tokens = self.tagger.tag(token_list)
        for i in range(len(token_list)):
            sentence.tokens[i].annotations["STANFORD_POS"] = tagged_tokens[i][1]
            for pos_rgx in self.pos_map:
                if (re.match(pos_rgx, tagged_tokens[i][1])):
                    sentence.tokens[i].annotations["POS"] = self.pos_map[pos_rgx].split("|")[0]
            if ("POS" not in sentence.tokens[i].annotations):
                sentence.tokens[i].annotations["POS"] = "x"
from projectFiles import utils
from projectFiles.Utils import xlsxUtils
import pandas as pd
import numpy as np
from nltk.tag.stanford import CoreNLPPOSTagger
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')
tagger = CoreNLPPOSTagger(url='http://localhost:9000')


def calculate_semantic_sim(word1, word2, pos1='n', pos2='n'):
    list_of_synsets1 = wn.synsets(word1, pos=pos1)
    list_of_synsets2 = wn.synsets(word2, pos=pos2)
    if not list_of_synsets1:
        return 0.0001
    if not list_of_synsets2:
        return 0.0001
    s1 = list_of_synsets1[0]
    s2 = list_of_synsets2[0]
    total_value = 0
    value = s1.wup_similarity(s2)
    value = utils.limit_value(value, 0.0001, 1.0)
    total_value += value
from nltk.tag.stanford import CoreNLPNERTagger, CoreNLPPOSTagger
from nltk.tokenize.stanford import CoreNLPTokenizer

stpos, stner = CoreNLPPOSTagger('http://localhost:9001'), CoreNLPNERTagger(
    'http://localhost:9001')
sttok = CoreNLPTokenizer('http://localhost:9001')

sttok.tokenize(u'你好')
stpos.tag(u'basf')
stpos.tag(sttok.tokenize(u'text'))
stner.tag(u'你好')
stner.tag(sttok.tokenize(u'你好'))
def main(): information = {} sentences = set() for items in relations[pair]: sentences.add(items[5]) information[items[5]] = (items[0], items[1], items[2], items[4]) sentences = list(sentences) l = len(sentences) corpus = [] topics = [] for sentence in sentences: if pos_flag: words = sentence.split() words[-1] = words[-1].strip() tagged_words = CoreNLPPOSTagger( url='http://localhost:9000').tag(words) if len(words) != len(tagged_words): tagged_words = pos_tag(words) # print(tagged_words) # print(sentence.strip()) for phrase in stop_phrases: n = len(phrase) for i in range(len(tagged_words) - n + 1): if phrase == words[i:i + n]: for j in range(i, i + n): tagged_words[j] = (None, tagged_words[j][1]) i = 0 indices = [] keywords = [] for (word, tag) in tagged_words: if word in pair: indices.append(i) keywords.append(word) i += 1 elif word not in stop_words and tag in pos_tag_set and word is not None: keywords.append(word) i += 1 # topics.append(" ".join(keywords)) # topics.append(sentence.strip()) if len(keywords) <= 10 and flag: ws = [w for w in keywords if w not in pair] else: ws = [] # if len(indices) == 2: # for j in range(len(keywords)): # # if j > indices[0] and j <= indices[0] + 4 and keywords[j] not in pair and j < indices[1]: # ws.append(keywords[j]) # elif j >= indices[1] - 2 and j <= indices[1] + 2 and keywords[j] not in pair: # ws.append(keywords[j]) # else: if True: for j in range(len(keywords)): for i in indices: if j >= i - 2 and j <= i + 2 and keywords[ j] not in pair and keywords[j] not in ws: ws.append(keywords[j]) break # with open(keywords_path, "a") as keywords_file: # keywords_file.write(",".join(ws)+"\n") # keywords_file.write(sentence+"\n") corpus.append(ws) topics.append(" ".join(ws)) else: corpus.append([w for w in sentence.split() if w not in stop_words]) if query_flag: with open(os.path.join(os.pardir, "keywords", "corpus.pkl"), 'wb') as corpus_file: pickle.dump(corpus, corpus_file) with open(os.path.join(os.pardir, "keywords", "sentences.pkl"), 'wb') as sentences_file: pickle.dump(sentences, sentences_file) else: # Prepare word2vector model fname = os.path.join(os.pardir, "data", "mymodel") model = gensim.models.Word2Vec.load(fname) model.init_sims(replace=True) # Build weighted graph # dictionary = Dictionary(corpus) # bow_corpus = [dictionary.doc2bow(document) for document in corpus] index = WmdSimilarity(corpus, model) G = nx.Graph() for i in range(l - 1): sims = index[corpus[i]] # print("query:") # print(corpus[i]) # print(sentences[i]) # print("sims:") for j in range(i + 1, l): # print(sims[j]) # print(corpus[j]) # print(sentences[j]) # print() shreshold = set_shreshold(len(corpus[i]), len(corpus[j])) if sims[j] >= shreshold: if i not in G: G.add_node(i) if j not in G: G.add_node(j) G.add_edge(i, j) # G.add_edge(i, j, weight=sims[j]) out_path = os.path.join( os.pardir, "{}_{}_{}.txt".format("&".join(pair), G.number_of_nodes(), l)) # image_path = os.path.join(os.pardir, com_dir, "{}_{}_{}.png".format("&".join(pair), G.number_of_nodes(), l)) # Draw graph pos = nx.spring_layout(G) plt.figure(figsize=(19, 12)) plt.axis('off') nx.draw_networkx_nodes(G, pos, node_size=50) nx.draw_networkx_edges(G, pos, width=0.75) #first compute the best partition communities = [] partition = community.best_partition(G) for com in set(partition.values()): list_nodes = [ nodes for nodes in partition.keys() if partition[nodes] == com ] communities.append(list_nodes) num = 0 graph_indices = set() bloblist = [] clusters = [] for com in communities: if len(com) > 1: doc = "" for i in com: 
doc += topics[i] + " " bloblist.append(tb(doc)) clusters.append(com) aspects[pair] = set() new_aspects[pair] = {} # if True: with open(out_path, "a") as out_file: for i, blob in enumerate(bloblist): # print("Top words in document {}".format(i + 1)) scores = {word: tfidf(word, blob, bloblist) for word in blob.words} sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True) # word_num = 0 aspect_keywords = [] for word, score in sorted_words[:3]: out_file.write(word + ", ") aspect_keywords.append(word) new_aspects[pair][" ".join(aspect_keywords)] = set() # for word, score in sorted_words: # if word_num == 3: # break # if tf(word, blob) >= 0.2: # word_num += 1 # out_file.write(word+", ") # print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5))) out_file.write( "---------------------------------------------------\n\n") for j in clusters[i]: temp = information[sentences[j]] new_aspects[pair][" ".join(aspect_keywords)].add( (temp[0], temp[1], temp[2], temp[3], sentences[j])) aspects[pair].add( (temp[0], temp[1], temp[2], " ".join(aspect_keywords), temp[3], sentences[j])) out_file.write(",".join(corpus[j]) + "\n") out_file.write(sentences[j] + "\n") graph_indices.add(j) num += 1 out_file.write( "other---------------------------------------------------\n\n") new_aspects[pair]["other"] = set() for j in range(len(sentences)): if j not in graph_indices: temp = information[sentences[j]] new_aspects[pair]["other"].add( (temp[0], temp[1], temp[2], temp[3], sentences[j])) aspects[pair].add( (temp[0], temp[1], temp[2], "", temp[3], sentences[j])) out_file.write(",".join(corpus[j]) + "\n") out_file.write(sentences[j] + "\n") plt.close('all')
import nltk
from nltk.parse.corenlp import CoreNLPParser
from nltk.tag.stanford import CoreNLPPOSTagger
from pycorenlp import StanfordCoreNLP

from brain import memory
from brain.conjugator import conjugator

DEFAULT_TAGS = ['NNP', 'NNPS', 'NN', 'NNS']
KEYWORD_TAGS = DEFAULT_TAGS[:]
KEYWORD_TAGS.extend(['VBG'])
#KEYWORD_TAGS.extend(['VBG', 'PRP', 'PRP$', 'WP', 'WP$', 'WRB', 'WDT'])

STANFORD_TAGGER = CoreNLPPOSTagger('http://localhost:9000/')
STANFORD_SERVER = StanfordCoreNLP('http://localhost:9000/')
STANFORD_PARSER = CoreNLPParser('http://localhost:9000/')


def combine_similar(input, tags):
    output = []
    curr = []
    tag = ""
    for x in input:
        if x[1] not in tags:
            if len(curr) > 0:
                output.append((" ".join([x[0] for x in curr]), tag))
                curr[:] = []
                tag = ""
            output.append(x)
        elif x[1] == tag:
            curr.append(x)
def modify(self): url = "http://localhost:9000/tregex" request_params = { "pattern": " RB=n1 > (ADVP >> (S=n2 > ROOT)) | > (ADJP >> (S=n2 > ROOT))" } text = self.text # print(text) r = requests.post(url, data=text, params=request_params) json_data = json.loads(r.text) text1 = json_data['sentences'][0]['0']['match'] tree = nltk.Tree.fromstring(text1, read_leaf=lambda x: x.split("/")[0]) str1 = tree.leaves() adverb = ' '.join(str1) text_pos = CoreNLPPOSTagger(url='http://localhost:9000').tag( text.split()) #print(text_pos) c = 0 for tagg in text_pos: if (c == 0 and tagg[1] != "NNP" and tagg[0] != 'I'): s = tagg[0].lower() text = text.replace(tagg[0], s) #line = 'He ran quickly.' if tagg[1] == "VBD" and text_pos[c][0] != 'had' and text_pos[ c + 1][1] != 'VBG': verb_tense = "did" root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v') text = text.replace(tagg[0], root_verb) #line = 'I run quickly.' if tagg[1] == "VBP" and text_pos[c][0] != 'is' and text_pos[c][ 0] != 'are' and text_pos[c][0] != 'have': verb_tense = "do" root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v') text = text.replace(tagg[0], root_verb) #line = 'John runs quickly.' if tagg[1] == "VBZ" and text_pos[c + 1][1] != 'VBN' and text_pos[ c + 1][1] != 'VBG': verb_tense = "does" root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v') text = text.replace(tagg[0], root_verb) #line = 'John is playing quietly.' #line = 'John was playing quietly.' #line = 'John is going to play quietly.' if tagg[1] == "VBG" and text_pos[c - 1][1] != 'VB' and text_pos[ c - 1][1] != 'VBN': verb_tense = text_pos[c - 1][0] text = text.replace(text_pos[c - 1][0] + " ", "") #line = 'John has ran quickly.' if tagg[1] == "VBZ" and text_pos[ c + 1][1] == 'VBN' and text_pos[c + 2][1] != "VBG": verb_tense = text_pos[c][0] text = text.replace(text_pos[c][0] + " ", "") #line = 'John will be playing quietly.' if tagg[1] == "VBG" and text_pos[c - 1][1] == 'VB': verb_tense = text_pos[c - 2][0] text = text.replace(text_pos[c - 2][0] + " ", "") #line = 'John has been playing quietly.' #line = 'John had been playing quietly.' if (tagg[1] == "VBZ" or tagg[1] == "VBD") and text_pos[ c + 1][1] == 'VBN' and text_pos[c + 2][1] == 'VBG': verb_tense = text_pos[c][0] text = text.replace(text_pos[c][0] + " ", "") #line = 'John had left quietly.' #line = 'We have eaten the meal quietly.' if tagg[1] == "VBN" and tagg[0] != 'been' and ( text_pos[c - 1][0] == 'had' or text_pos[c - 1][0] == 'have') and text_pos[c - 2][1] != 'MD': verb_tense = text_pos[c - 1][0] text = text.replace(text_pos[c - 1][0] + " ", "") #line = 'John will run quickly.' #line = 'John would have ran quickly.' if tagg[1] == "MD" and text_pos[c + 1][1] == 'VB': verb_tense = text_pos[c][0] text = text.replace(text_pos[c][0] + " ", "") c = c + 1 """ obj="" for i in line: classified_text = st.tag(word_tokenize(i)) if classified_text[0][1]!='PERSON': break obj = obj + classified_text[0][0]+ " " """ text = text.replace(".", " ?") text = text.replace(adverb, "") Q = 'How ' + verb_tense + ' ' + text return Q
class SelectCandidates: """ This program aims to select candidate words from reviews We picks up sentiment words and handles the negation problem The result will be stored in `src_folder/lexicon/candidates.json` """ def __init__(self, src_folder="../data/", freq_thre=100, corenlp_path="../stanford-corenlp/",\ ner_path="../stanford-ner/", verbose=False): # initialization self.src = os.path.join(src_folder, "reviews/") self.corenlp_path = os.path.normpath(corenlp_path) + "/" self.stanford_ner_path = os.path.normpath(ner_path) + "/" self.frequency_threshold = freq_thre self.dst = os.path.join(src_folder, "lexicon/candidates.json") self.dst_allReviews = os.path.join(src_folder, "allReviews/") self.dst_ner_tsv = os.path.join(src_folder, "ner_tsv/") self.dst_ne = os.path.join(src_folder, "ne/") self.verbose = verbose # pick up sentiment words self.pos_tags = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS"] self.pos_tagged_statistics = {} # it is based on CoreNLP, a new version of stanford pos tagger self.pos_tagger = CoreNLPPOSTagger() self.stemmer = SnowballStemmer("english") self.stopwords = set(stopwords.words("english")) # remove `not` because we need combine `not` and sentiment words self.stopwords.remove("not") def stanford_ner(self): """ call stanford java ner api """ self.merge_reviews() self.run_ner() self.find_named_entity() def merge_reviews(self): """ merge all reviews for named entity recognition """ if self.verbose: print "Merging all reviews for named entity recognition" + "\n" + "-" * 80 self.create_dir(self.dst_allReviews) for dirpath, dirs, files in os.walk(self.src): for f in files: filename = re.search( "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).json", f).group(1) data = json.load(open(os.path.join(dirpath, f))) with open(os.path.join(self.dst_allReviews, filename + ".txt"), "w+") as rf: for r in data["reviews"]: text = r["review"] # remove accents text = unicodedata.normalize("NFKD", text).encode( "ASCII", "ignore") # remove all website urls written in the review text = re.sub(r"https?:\/\/.*[\r\n]*", " ", text, flags=re.MULTILINE) # remove non english letters or words and numbers text = re.sub( r"[^a-zA-Z!@#$%^&*():;/\\<>\"+_\-.,?=\s\|\']", "", text) # remove extra nextline text = re.sub("(\\n)+", r" ", text) # I'm -> I am text = re.sub(r"'m ", " am ", text) text = re.sub(r"'re ", " are ", text) text = re.sub(r"'s ", " is ", text) text = re.sub(r"'ve ", " have ", text) text = re.sub(r"'d ", " would ", text) text = re.sub(r" won't ", " will not ", text) text = re.sub(r"n't ", " not ", text) text = re.sub(r"'ll ", " will ", text) # remove all punctuations except for , . ? ! ; : and - # -: composite adj. text = re.sub("[^\w\s,.?!;:\-]|\_", r" ", text) # Space out every sign & symbol & punctuation text = re.sub("([^\w\s])", r" \1 ", text) text = text.replace("\'", "") # remove ` - `, ` -`, `- ` text = re.sub(r"(\-)+", "-", text) text = re.sub( r"(\s)+\-(\s)+|(\s)+\-|\-(\s)+|(\A)\-|\-(\Z)", " ", text) # turn multiple spaces into one text = re.sub(r"(\s)+", " ", text) # remove extra space at both ends of the text text = text.strip() rf.write(text) rf.write("\n\n. 
CHANGE-REVIEW .\n\n") def run_ner(self): """ run shell to call NER """ if self.verbose: print "Running shell to call Stanford NER" + "\n" + "-" * 80 self.create_dir(self.dst_ner_tsv) comm = "java -mx1g -cp \"%s*:%slib/*\" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier %sclassifiers/english.all.3class.distsim.crf.ser.gz -outputFormat tabbedEntities -textFile %s > %s" for dirpath, dirs, files in os.walk(self.dst_allReviews): for f in files: filename = re.search( "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).txt", f).group(1) src_file = os.path.join(dirpath, f) dst_file = os.path.join(self.dst_ner_tsv, filename + ".tsv") command = comm % (self.stanford_ner_path, self.stanford_ner_path, self.stanford_ner_path, src_file, dst_file) subprocess.call(command, shell=True) def find_named_entity(self): """ find named entity from the ner tsv """ if self.verbose: print "Finding named entity from ner tsv files" + "\n" + "-" * 80 self.create_dir(self.dst_ne) for dirpath, dirs, files in os.walk(self.dst_ner_tsv): for f in files: filename = re.search( "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).tsv", f).group(1) src_file = os.path.join(dirpath, f) dst_file = os.path.join(self.dst_ne, filename + ".txt") rs = [set()] with open(src_file, "rb") as tsvin: data = csv.reader(tsvin, delimiter="\t") for r in data: if len(r) != 0 and r[0] != "": if r[1] == "ORGANIZATION" or r[1] == "PERSON" or r[ 1] == "LOCATION": l = r[0].split(" ") for i in l: if (i, r[1]) not in rs: rs[-1].add((i, r[1])) elif len(r) > 2 and "CHANGE-REVIEW" in r[2]: rs.append(set()) with open(dst_file, "w+") as rf: for rs_index in range(len(rs) - 1): rf.write(str(rs_index) + ",FILEINDEX\n") for i in rs[rs_index]: rf.write(i[0] + "," + i[1] + "\n") def get_sentiment_words(self): """ load all reviews in src folder: data/reviews/ and merge them """ # start Stanford CoreNLP server in a new process comm = "java -mx4g -cp \"%s*\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos -status_port 9000 -port 9000 -timeout 50000" command = comm % (self.corenlp_path) proc = subprocess.Popen(command, shell=True, preexec_fn=os.setsid) time.sleep(10) # wait for starting Stanford CoreNLP server for dirpath, dir_list, file_list in os.walk(self.src): if self.verbose: print "Walking into directory: " + str(dirpath) if len(file_list) > 0: for f in file_list: # in case there is a goddamn .DS_Store file if str(f) == ".DS_Store": if self.verbose: print "Removing " + dirpath + "/" + str(f) os.remove(os.path.join(dirpath, f)) else: with open(os.path.join(dirpath, f)) as fp: entity = json.load(fp) if self.verbose: print "Processing " + "\033[1m" + entity[ "entity"] + "\033[0m" + " in " + "\033[1m" + entity[ "category"] + "\033[0m" self.analyze_part_of_speech(entity["reviews"], f) else: if self.verbose: print "No file is found in " + str(dirpath) os.killpg(os.getpgid(proc.pid), signal.SIGTERM) if self.verbose: print "Part of Speech Analysis on Reviews are Done" print "-" * 80 def analyze_part_of_speech(self, reviews, filename): """ run nltk.pos_tag to analysis the part_of_speech of every word """ ner_set = self.load_ner_tags(filename) for review_index in range(len(reviews)): text = reviews[review_index]["review"] # remove accents text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore") # remove all website urls written in the review text = re.sub(r"https?:\/\/.*[\r\n]*", " ", text, flags=re.MULTILINE) # remove non english letters or words and numbers text = re.sub(r"[^a-zA-Z!@#$%^&*():;/\\<>\"+_\-.,?=\s\|\']", "", text) # 
remove extra nextline text = re.sub("(\\n)+", r" ", text) # I'm -> I am text = re.sub(r"'m ", " am ", text) text = re.sub(r"'re ", " are ", text) text = re.sub(r"'s ", " is ", text) text = re.sub(r"'ve ", " have ", text) text = re.sub(r"'d ", " would ", text) text = re.sub(r" won't ", " will not ", text) text = re.sub(r"n't ", " not ", text) text = re.sub(r"'ll ", " will ", text) # remove all punctuations except for , . ? ! ; : and - # -: composite adj. text = re.sub("[^\w\s,.?!;:\-]|\_", r" ", text) # space out every sign & symbol & punctuation text = re.sub("([^\w\s])", r" \1 ", text) text = text.replace("\'", "") # remove ` - `, ` -`, `- ` text = re.sub(r"(\-)+", "-", text) text = re.sub(r"(\s)+\-(\s)+|(\s)+\-|\-(\s)+|(\A)\-|\-(\Z)", " ", text) # turn multiple spaces into one text = re.sub(r"(\s)+", " ", text) # remove extra space at both ends of the text text = text.strip() # tokenize tokenized_text = text.split(" ") # remove empty string tokenized_text = [w for w in tokenized_text if w] # pos tag # a list of word tuples # [("great", "JJ"), ("tour", "NN") ...] if len(tokenized_text) == 0: continue word_tuple_list = self.pos_tagger.tag(tokenized_text) # remove stop_words word_tuple_list = [(w[0].lower(), w[1]) for w in word_tuple_list if w[0].lower() not in self.stopwords] # remove empty string word_tuple_list = [(w[0], w[1]) for w in word_tuple_list if w[0]] combine_or_not = False combination_front = "" for word_tuple in word_tuple_list: # putting them into dictionary # add 1 to value if exist # add key and value if not if word_tuple[1] not in self.pos_tags: if combine_or_not: if combination_front in self.pos_tagged_statistics: self.pos_tagged_statistics[combination_front] += 1 else: self.pos_tagged_statistics[combination_front] = 1 combine_or_not = False combination_front = "" elif word_tuple[0] not in ner_set[review_index]: if combine_or_not: if combination_front: combination_front += "_" + word_tuple[0] else: combination_front = word_tuple[0] else: combine_or_not = True combination_front = word_tuple[0] if combine_or_not: if combination_front in self.pos_tagged_statistics: self.pos_tagged_statistics[combination_front] += 1 else: self.pos_tagged_statistics[combination_front] = 1 def stem(self, candidate_lexicon): """ perform stemming on candidate lexicon | candidate lexicon should be a list """ stemmed_lexicon = [] for word in candidate_lexicon: stemmed_word = self.stemmer.stem(word) stemmed_lexicon.append({ "word": word, "stemmed_word": stemmed_word }) stemmed_lexicon = sorted(stemmed_lexicon, key=lambda k: k['word']) if self.verbose: print "\nMerging stemmed duplicates" processed_lexicon = {} length = len(stemmed_lexicon) cnt = 0 for word_dict in stemmed_lexicon: cnt += 1 if word_dict["stemmed_word"] not in processed_lexicon: processed_lexicon[word_dict["stemmed_word"]] = [ word_dict["word"] ] else: processed_lexicon[word_dict["stemmed_word"]].append( word_dict["word"]) if self.verbose: sys.stdout.write("\rStatus: %s / %s" % (cnt, length)) sys.stdout.flush() processed_lexicon = [{ "stemmed_word": key, "word": value } for key, value in processed_lexicon.iteritems()] # sorting dictionaries by word processed_lexicon = sorted(processed_lexicon, key=lambda k: k["stemmed_word"]) return processed_lexicon def load_ner_tags(self, filename): """ load named entity for files """ filename = re.search( "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).json", filename).group(1) ner_set = [] with open(os.path.join(self.dst_ne, filename + ".txt"), "rb") as ne_f: tags = csv.reader(ne_f, delimiter=",") 
for tag in tags: if tag[1] == "FILEINDEX": ner_set.append(set()) else: ner_set[-1].add(tag[0].lower()) return ner_set def render_candidate_lexicon(self): """ render the candidate words """ # filtered by self.frequency_threshold if self.verbose: print "Filtering out frequency lower than frequency_threshold" + "\n" + "-" * 80 self.create_dir(self.dst) pos_tagged_words = [] pos_tagged_words_under_thre = [] for key in self.pos_tagged_statistics: if self.pos_tagged_statistics[key] > self.frequency_threshold: pos_tagged_words.append(key) else: pos_tagged_words_under_thre.append(key) if self.verbose: print "Stemming candidate words" pos_tagged_words = self.stem(pos_tagged_words) pos_tagged_words_under_thre = self.stem(pos_tagged_words_under_thre) ordered_dict_list = [[], []] if self.verbose: print "\nOrganizing candidate words" length = len(pos_tagged_words) for index in range(len(pos_tagged_words)): ordered_dict = OrderedDict() ordered_dict["index"] = index + 1 ordered_dict["count"] = sum([ self.pos_tagged_statistics[w] for w in pos_tagged_words[index]["word"] ]) ordered_dict["stemmed_word"] = pos_tagged_words[index][ "stemmed_word"] ordered_dict["word"] = pos_tagged_words[index]["word"] ordered_dict_list[0].append(NoIndent(ordered_dict)) if self.verbose: sys.stdout.write("\rStatus: %s / %s" % (index + 1, length)) sys.stdout.flush() if self.verbose: print "\nOrganizing candidate words <= frequency threshold" length = len(pos_tagged_words_under_thre) for index in range(len(pos_tagged_words_under_thre)): ordered_dict = OrderedDict() ordered_dict["index"] = index + 1 ordered_dict["count"] = sum([ self.pos_tagged_statistics[w] for w in pos_tagged_words_under_thre[index]["word"] ]) ordered_dict["stemmed_word"] = pos_tagged_words_under_thre[index][ "stemmed_word"] ordered_dict["word"] = pos_tagged_words_under_thre[index]["word"] ordered_dict_list[1].append(NoIndent(ordered_dict)) if self.verbose: sys.stdout.write("\rStatus: %s / %s" % (index + 1, length)) sys.stdout.flush() if self.verbose: print "\n" + "-" * 80 print "Saving data to: \033[1m" + self.dst + "\033[0m" with open(self.dst, "w+") as f_out: f_out.write( json.dumps(ordered_dict_list, indent=4, cls=NoIndentEncoder)) def create_dir(self, new_path): """ create the directory if not exist""" dir1 = os.path.dirname(new_path) if not os.path.exists(dir1): if self.verbose: print "Creating directory: " + dir1 print "-" * 80 os.makedirs(dir1) def run(self): print "Selecting candidate words" + "\n" + "-" * 80 self.stanford_ner() self.get_sentiment_words() self.render_candidate_lexicon() def PrintException(self): exc_type, exc_obj, tb = sys.exc_info() f = tb.tb_frame lineno = tb.tb_lineno filename = f.f_code.co_filename linecache.checkcache(filename) line = linecache.getline(filename, lineno, f.f_globals) print ' Exception in ({}, LINE {} "{}"): {}'.format( filename, lineno, line.strip(), exc_obj)
def genQuestion(line):
    """ outputs question from the given text """
    bucket = {}  # Create an empty dictionary

    # POS tagging
    text = CoreNLPPOSTagger(url='http://localhost:9000').tag(line.split())

    for i, j in enumerate(text):  # text is the part-of-speech tags in English
        if j[1] not in bucket:
            bucket[j[1]] = i  # Add all tags to the dictionary or bucket variable

    if type(line) is str:  # If the passed variable is of type string.
        line = TextBlob(line)  # Create object of type textblob.blob.TextBlob

    question = ''
    l1 = ['NNP', 'VBG', 'VBZ', 'IN']
    l2 = ['NNP', 'VBG', 'VBZ']
    l3 = ['PRP', 'VBG', 'VBZ', 'IN']
    l4 = ['PRP', 'VBG', 'VBZ']
    l5 = ['PRP', 'VBG', 'VBD']
    l6 = ['NNP', 'VBG', 'VBD']
    l7 = ['NN', 'VBG', 'VBZ']
    l8 = ['NNP', 'VBZ', 'JJ']
    l9 = ['NNP', 'VBZ', 'NN']
    l10 = ['NNP', 'VBZ']
    l11 = ['PRP', 'VBZ']
    l12 = ['NNP', 'NN', 'IN']
    l13 = ['NN', 'VBZ']

    # With the use of conditional statements the dictionary is compared with the lists created above
    if all(key in bucket for key in l1):  # 'NNP', 'VBG', 'VBZ', 'IN' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + ' ' + line.words[bucket['VBG']] + '?'
    elif all(key in bucket for key in l2):  # 'NNP', 'VBG', 'VBZ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + ' ' + line.words[bucket['VBG']] + '?'
    elif all(key in bucket for key in l3):  # 'PRP', 'VBG', 'VBZ', 'IN' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['PRP']] + ' ' + line.words[bucket['VBG']] + '?'
    elif all(key in bucket for key in l4):  # 'PRP', 'VBG', 'VBZ' in sentence.
        question = 'What ' + line.words[
            bucket['PRP']] + ' ' + ' does ' + line.words[
            bucket['VBG']] + ' ' + line.words[bucket['VBG']] + '?'
    elif all(key in bucket for key in l7):  # 'NN', 'VBG', 'VBZ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NN']] + ' ' + line.words[bucket['VBG']] + '?'
    elif all(key in bucket for key in l8):  # 'NNP', 'VBZ', 'JJ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + '?'
    elif all(key in bucket for key in l9):  # 'NNP', 'VBZ', 'NN' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + '?'
    elif all(key in bucket for key in l11):  # 'PRP', 'VBZ' in sentence.
        if line.words[bucket['PRP']] in ['she', 'he']:
            question = 'What' + ' does ' + line.words[bucket['PRP']].lower(
            ) + ' ' + line.words[bucket['VBZ']].singularize() + '?'
    elif all(key in bucket for key in l10):  # 'NNP', 'VBZ' in sentence.
        question = 'What' + ' does ' + line.words[bucket[
            'NNP']] + ' ' + line.words[bucket['VBZ']].singularize() + '?'
    elif all(key in bucket for key in l13):  # 'NN', 'VBZ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NN']] + '?'

    # When the tags are generated, 's is split into ' and s. To overcome this issue:
    if 'VBZ' in bucket and line.words[bucket['VBZ']] == "’":
        question = question.replace(" ’ ", "'s ")

    # Print the generated questions as output.
    if question != '':
        print('\n', 'Question: ' + question)
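
# A hedged example of driving `genQuestion`, assuming TextBlob is installed and a CoreNLP server
# is running on localhost:9000; the exact question depends on the tags the server returns.
from textblob import TextBlob  # genQuestion wraps the input line in a TextBlob

genQuestion("Bansoori is an Indian classical instrument.")
# If the sentence is tagged with NNP/VBZ/JJ, this prints something like:
#   Question: What is Bansoori?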
    'ORTH': 'JJR'
}])

nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
add_patterns(matcher)
tech_pair = ["sortedlist", "sorteddictionary"]
tags = []
line = input(">>>")
while (line != "/"):
    flag = False
    tag_list = []
    words = line.split()
    tagged_words = CoreNLPPOSTagger(url='http://localhost:9000').tag(words)
    if len(words) != len(tagged_words):
        tagged_words = pos_tag(words)
    for (word, tag) in tagged_words:
        # for (word, tag) in CoreNLPPOSTagger(url='http://localhost:9000').tag(line.split()):
        if flag:
            word = "." + word
            flag = False
        if tag == "IN" and word in cin:
            tag_list.append("CIN")
        elif word in cv:
            tag_list.append("CV")
        elif word in tech_pair:
            tag_list.append("TECH")
        elif word == ".":
            flag = True
def main(): information = {} sentences = set() for items in relations[pair]: sentences.add(items[5]) information[items[5]] = (items[0], items[1], items[2], items[4]) sentences = list(sentences) l = len(sentences) corpus = [] topics = [] for sentence in sentences: if pos_flag: words = sentence.split() words[-1] = words[-1].strip() tagged_words = CoreNLPPOSTagger(url='http://localhost:9000').tag(words) if len(words) != len(tagged_words): tagged_words = pos_tag(words) # print(tagged_words) # print(sentence.strip()) for phrase in stop_phrases: n = len(phrase) for i in range(len(tagged_words) - n + 1): if phrase == words[i:i+n]: for j in range(i, i+n): tagged_words[j] = (None, tagged_words[j][1]) i = 0 indices = [] keywords = [] for (word, tag) in tagged_words: if word in pair: indices.append(i) keywords.append(word) i += 1 elif word not in stop_words and tag in pos_tag_set and word is not None: keywords.append(word) i += 1 # topics.append(" ".join(keywords)) # topics.append(sentence.strip()) if len(keywords) <= 10 and flag: ws = [w for w in keywords if w not in pair] else: ws = [] # if len(indices) == 2: # for j in range(len(keywords)): # # if j > indices[0] and j <= indices[0] + 4 and keywords[j] not in pair and j < indices[1]: # ws.append(keywords[j]) # elif j >= indices[1] - 2 and j <= indices[1] + 2 and keywords[j] not in pair: # ws.append(keywords[j]) # else: if True: for j in range(len(keywords)): for i in indices: if j >= i - 2 and j <= i + 2 and keywords[j] not in pair and keywords[j] not in ws: ws.append(keywords[j]) break # with open(keywords_path, "a") as keywords_file: # keywords_file.write(",".join(ws)+"\n") # keywords_file.write(sentence+"\n") corpus.append(ws) topics.append(" ".join(ws)) else: corpus.append([w for w in sentence.split() if w not in stop_words]) if query_flag: with open(os.path.join(os.pardir, "keywords", "corpus.pkl"), 'wb') as corpus_file: pickle.dump(corpus, corpus_file) with open(os.path.join(os.pardir, "keywords", "sentences.pkl"), 'wb') as sentences_file: pickle.dump(sentences, sentences_file) else: # Prepare word2vector model fname = os.path.join(os.pardir, "data", "mymodel") model = gensim.models.Word2Vec.load(fname) model.init_sims(replace=True) # Build weighted graph # dictionary = Dictionary(corpus) # bow_corpus = [dictionary.doc2bow(document) for document in corpus] index = WmdSimilarity(corpus, model) def set_shreshold(a, b): if ver_flag: if a == b: return 0.52 return 0.55 - 0.05 ** abs(a - b) else: if a == b: return 0.55 elif a > 3 or b > 3: return 0.55 - 0.1 ** abs(a - b) return 0.55 - 0.05 ** abs(a - b) G = nx.Graph() for i in range(l - 1): sims = index[corpus[i]] # print("query:") # print(corpus[i]) # print(sentences[i]) # print("sims:") for j in range(i + 1, l): # print(sims[j]) # print(corpus[j]) # print(sentences[j]) # print() shreshold = set_shreshold(len(corpus[i]), len(corpus[j])) if sims[j] >= shreshold: if i not in G: G.add_node(i) if j not in G: G.add_node(j) G.add_edge(i, j) # G.add_edge(i, j, weight=sims[j]) out_path = os.path.join(os.pardir, com_dir, "{}_{}_{}.txt".format("&".join(pair), G.number_of_nodes(), l)) image_path = os.path.join(os.pardir, com_dir, "{}_{}_{}.png".format("&".join(pair), G.number_of_nodes(), l)) # Draw graph pos = nx.spring_layout(G) plt.figure(figsize=(19,12)) plt.axis('off') nx.draw_networkx_nodes(G, pos, node_size=50) nx.draw_networkx_edges(G, pos, width=0.75) plt.savefig(image_path) # plt.show() nnodes = G.number_of_nodes() if nnodes < 4: communities = [] communities.append(G.nodes()) elif nnodes <= 15: 
communities_generator = community.girvan_newman(G) temp_communities = next(communities_generator) communities = sorted(map(sorted, temp_communities)) else: if nnodes < 50: part = 2 / 3 else: part = 1 / 3 # Detect communities communities_generator = community.girvan_newman(G) div_flag = True while div_flag: temp_communities = next(communities_generator) communities = sorted(map(sorted, temp_communities)) div_flag = False for com in communities: if len(com) > l * part: div_flag = True break num = 0 graph_indices = set() bloblist = [] clusters = [] for com in communities: if len(com) > 1: doc = "" for i in com: doc += topics[i] + " " bloblist.append(tb(doc)) clusters.append(com) with open(out_path, "a") as out_file: for i, blob in enumerate(bloblist): print("Top words in document {}".format(i + 1)) scores = {word: tfidf(word, blob, bloblist) for word in blob.words} sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True) word_num = 0 for word, score in sorted_words: if word_num == 3: break if tf(word, blob) >= 0.2: word_num += 1 out_file.write(word+", ") print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5))) out_file.write("---------------------------------------------------\n\n") for j in clusters[i]: out_file.write(",".join(corpus[j])+"\n") out_file.write(sentences[j]+"\n") graph_indices.add(j) num += 1 out_file.write("other---------------------------------------------------\n\n") for j in range(len(sentences)): if j not in graph_indices: out_file.write(",".join(corpus[j])+"\n") out_file.write(sentences[j]+"\n")
from nltk.tag.stanford import CoreNLPPOSTagger
from nltk.tag.stanford import CoreNLPNERTagger
from nltk.stem.wordnet import WordNetLemmatizer

url = "http://localhost:9000/tregex"
request_params = {"pattern": " NP=n1 !>> NP >> (VP > (S=n2 > ROOT)) "}
text = "John would have loved Anne."
print(text)
r = requests.post(url, data=text, params=request_params)
json_data = json.loads(r.text)
text1 = json_data['sentences'][0]['0']['match']
tree = nltk.Tree.fromstring(text1, read_leaf=lambda x: x.split("/")[0])
line = tree.leaves()
text_pos = CoreNLPPOSTagger(url='http://localhost:9000').tag(text.split())
c = 0
for tagg in text_pos:
    # line = 'John loved Anne.'
    if tagg[1] == "VBD" and text_pos[c][0] != 'had' and text_pos[
            c + 1][1] != 'VBG':
        verb_tense = "did"
        root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
        text = text.replace(tagg[0], root_verb)
    # line = 'John love Anne.'
    if tagg[1] == "VBP" and text_pos[c][0] != 'is' and text_pos[c][
            0] != 'are' and text_pos[c][0] != 'have':
        verb_tense = "do"
        root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
        text = text.replace(tagg[0], root_verb)
from nltk.tag.stanford import CoreNLPPOSTagger
from nltk.parse.corenlp import CoreNLPDependencyParser
import spacy
from spacy.matcher import Matcher

# dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
#
# while True:
#     parse, = dep_parser.raw_parse(input(">>>"))
#
#     for governor, dep, dependent in parse.triples():
#         print(governor, dep, dependent)

line = input(">>>")
print(line.split(" "))
for (word, tag) in CoreNLPPOSTagger(url='http://localhost:9000').tag(
        line.split(" ")):
    print(word)
    print(tag)
    print(tag)
def modify(self):
    url = "http://localhost:9000/tregex"
    request_params = {"pattern": " NP=n1 !>> NP >> (VP > (S=n2 > ROOT)) "}
    text = self.text
    r = requests.post(url, data=text, params=request_params)
    json_data = json.loads(r.text)
    text1 = json_data['sentences'][0]['0']['match']
    tree = nltk.Tree.fromstring(text1, read_leaf=lambda x: x.split("/")[0])
    line = tree.leaves()
    text_pos = CoreNLPPOSTagger(url='http://localhost:9000').tag(text.split())
    c = 0
    for tagg in text_pos:
        # line = 'John loved Anne.'
        if tagg[1] == "VBD" and text_pos[c][0] != 'had' and text_pos[c + 1][1] != 'VBG':
            verb_tense = "did"
            root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
            text = text.replace(tagg[0], root_verb)
        # line = 'John love Anne.'
        if tagg[1] == "VBP" and text_pos[c][0] != 'is' and text_pos[c][0] != 'are' and text_pos[c][0] != 'have':
            verb_tense = "do"
            root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
            text = text.replace(tagg[0], root_verb)
        # line = 'John loves Anne.'
        if tagg[1] == "VBZ" and text_pos[c + 1][1] != 'VBN' and text_pos[c + 1][1] != 'VBG':
            verb_tense = "does"
            root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
            text = text.replace(tagg[0], root_verb)
        # line = 'John is playing with Anne'         -> Who is John playing with?
        # line = 'John was playing with Anne.'       -> Who was John playing with?
        # line = 'John is going to play with Anne.'  -> Who is John going to play with?
        if tagg[1] == "VBG" and text_pos[c - 1][1] != 'VB' and text_pos[c - 1][1] != 'VBN':
            verb_tense = text_pos[c - 1][0]
            text = text.replace(text_pos[c - 1][0] + " ", "")
        # line = 'John has loved Anne.'
        if tagg[1] == "VBZ" and text_pos[c + 1][1] == 'VBN' and text_pos[c + 2][1] != "VBG":
            verb_tense = text_pos[c][0]
            text = text.replace(text_pos[c][0] + " ", "")
        # line = 'John will be playing with Anne.'
        if tagg[1] == "VBG" and text_pos[c - 1][1] == 'VB':
            verb_tense = text_pos[c - 2][0]
            text = text.replace(text_pos[c - 2][0] + " ", "")
        # line = 'John has been playing with Anne.'
        # line = 'John had been playing with Anne.'
        if (tagg[1] == "VBZ" or tagg[1] == "VBD") and text_pos[c + 1][1] == 'VBN' and text_pos[c + 2][1] == 'VBG':
            verb_tense = text_pos[c][0]
            text = text.replace(text_pos[c][0] + " ", "")
        # line = 'John had loved Anne.'
        # line = 'We have loved Anne.'
        if tagg[1] == "VBN" and tagg[0] != 'been' and (text_pos[c - 1][0] == 'had' or text_pos[c - 1][0] == 'have') and text_pos[c - 2][1] != 'MD':
            verb_tense = text_pos[c - 1][0]
            text = text.replace(text_pos[c - 1][0] + " ", "")
        # line = 'John will have played with Anne.'
        # line = 'John will love Anne.'
        # line = 'John would have loved Anne.'
        if tagg[1] == "MD" and text_pos[c + 1][1] == 'VB':
            verb_tense = text_pos[c][0]
            text = text.replace(text_pos[c][0] + " ", "")
        c = c + 1
    obj = ' '.join(line)
    classified_text = CoreNLPNERTagger(url='http://localhost:9000').tag(obj.split())
    f = 1
    for i in classified_text:
        if i[1] != 'PERSON':
            f = 0
            break
    text = text.replace(obj, '')
    text = text.replace(".", " ?")
    Q = 'Who ' + verb_tense + ' ' + text
    return Q