def tag_tokens_using_stanford_corenlp(token_list, corenlp_server_address='http://localhost:9000'):
    # print("tag_tokens_using_stanford_corenlp started")
    tagger = CoreNLPPOSTagger(url=corenlp_server_address)
    # The code below exists to deal with a limitation of Stanford's CoreNLP server, which only
    # supports 100000 characters per call. It breaks the text into smaller pieces, sends them to
    # the server, and then joins the results back into one list of tagged words ('tagged_text').
    tagged_text = []
    txt_size = len(token_list)
    i = 0
    while i < txt_size:
        if i + 6000 >= txt_size:
            tokens_to_tag = token_list[i:txt_size]
            i = txt_size
        else:
            tokens_to_tag = token_list[i:i + 6000]
            i += 6000
        tagged_text += tagger.tag(tokens_to_tag)
    # print("tag_tokens_using_stanford_corenlp ended")
    return tagged_text
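
# A minimal usage sketch for the chunked tagger above. It assumes `CoreNLPPOSTagger` from
# `nltk.tag.stanford` is importable and that a CoreNLP server was started separately, e.g.:
#   java -cp "stanford-corenlp/*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
# The sample sentence is only illustrative.
from nltk.tag.stanford import CoreNLPPOSTagger  # import assumed by the function above

tokens = "The quick brown fox jumps over the lazy dog .".split()
tagged = tag_tokens_using_stanford_corenlp(tokens)
print(tagged)  # e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ...]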
def __init__(self, config_path=DEFAULT_CONFIG_PATH):
    self.config = load_config(config_path)
    corenlp_config = self.config["data"]["stanford_corenlp"]
    self.tagger = CoreNLPPOSTagger(
        url="http://%s:%d" % (corenlp_config["host"], corenlp_config["port"]))
    self.pos_map = self.config["model"]["STANFORD_POS_MAP"]
def get_pos_tag(techs, words):
    """ Get POS tag of words. ([str], [str]) -> ([str], [str]) """
    tags = []
    flag = False
    tagged_words = CoreNLPPOSTagger(url='http://localhost:9000').tag(words)
    if len(words) != len(tagged_words):
        tagged_words = pos_tag(words)
    words = []
    for (word, tag) in tagged_words:
        if flag:
            word = "." + word
            flag = False
        if tag == "IN" and word in cin:
            tags.append("CIN")
        elif word in cv:
            tags.append("CV")
        elif word in techs:
            tags.append("TECH")
        elif word == ".":
            flag = True
            continue
        elif tag[:2] == "VB":
            tags.append("VB")
        else:
            tags.append(tag)
        words.append(word)
    return (words, tags)
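
# A hedged illustration of how `get_pos_tag` could be called. `cin`, `cv`, and the `pos_tag`
# fallback are defined elsewhere in the project, so the placeholder values below are assumptions,
# not the project's real word lists.
from nltk import pos_tag  # fallback tagger used when CoreNLP's token count differs

cin = {"than", "over", "versus"}        # placeholder comparative prepositions
cv = {"beats", "outperforms", "prefer"}  # placeholder comparative verbs

words, tags = get_pos_tag(["python", "java"], "python is faster than java".split())
print(list(zip(words, tags)))
# e.g. [('python', 'TECH'), ('is', 'VB'), ('faster', 'JJR'), ('than', 'CIN'), ('java', 'TECH')]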
def count(file_name):
    with open(os.path.join(os.path.pardir, "out", "tech_v6", file_name)) as data3_file:
        num = 0
        for line in data3_file:
            if num % 4 == 2:
                words = line.split(" ")
                words[-1] = words[-1].strip()
                for (word, tag) in CoreNLPPOSTagger(
                        url='http://localhost:9000').tag(words):
                    if word not in stopwords_en and word not in modal_verbs and word not in synonyms:
                        if word in wf:
                            wf[word] += 1
                        else:
                            wf[word] = 1
                        if tag[:2] == "JJ":
                            if word in jj:
                                jj[word] += 1
                            else:
                                jj[word] = 1
                        elif tag[:2] == "NN":
                            if word in nn:
                                nn[word] += 1
                            else:
                                nn[word] = 1
                        elif tag[:2] == "RB":
                            if word in rb:
                                rb[word] += 1
                            else:
                                rb[word] = 1
            num += 1
def classify(no):
    num = 0
    compa_sent_count = 0
    current_id = 0
    try:
        nlp = spacy.load('en')
        matcher = Matcher(nlp.vocab)
        add_patterns(matcher)
        # with io.open(os.path.join(os.pardir, "out", "tech_v5", "{}.txt".format(no)), "r", encoding="utf-8") as data_file:
        with open(os.path.join(os.pardir, "out", "tech_v6", "{}.txt".format(no))) as data_file:
            compa_sent_count = 0
            for line in data_file:
                if num % 4 == 0:
                    current_id = line
                elif num % 4 == 1:
                    tech_pair = line.split("\t")
                    tech_pair[-1] = tech_pair[-1].strip()
                elif num % 4 == 2:
                    tag_list = []
                    # for token in doc:
                    #     tag = token.tag_
                    #     word = token.text
                    # print(line)
                    flag = False
                    for (word, tag) in CoreNLPPOSTagger(url='http://localhost:9000').tag(line.split(" ")):
                        # for (word, tag) in nltk.pos_tag(line.split(" ")):
                        if flag:
                            word = "." + word
                            flag = False
                        if tag == "IN" and word in cin:
                            tag_list.append("CIN")
                        elif word in cv:
                            tag_list.append("CV")
                        elif word in tech_pair:
                            tag_list.append("TECH")
                        elif word == ".":
                            flag = True
                        else:
                            tag_list.append(tag)
                    pos_tag = " ".join(tag_list)
                    patterns = matcher(nlp(pos_tag))
                    if patterns != []:
                        compa_sent_count += 1
                        data_file = open(os.path.join(os.pardir, "out", "tech_v6", "sentences_1.txt"), "a")
                        data_file.write("{}".format(current_id))
                        data_file.write("{}\n".format("\t".join(tech_pair)))
                        for pattern in patterns:
                            data_file.write("pattern" + str(pattern[0]) + "\t")
                        data_file.write(str("\n{}\n".format(line)))
                        data_file.close()
                num += 1
    finally:
        print("Proc {}: {}/{} from - to {}".format(os.getpid(), compa_sent_count, num / 4, current_id))
def create_parse_trees(sentences):
    """
    Create a parse tree for each sentence in the sentences list and return all trees in a list.

    Takes a sentence as a string; before parsing, it will be automatically tokenized and tagged
    by the CoreNLP Parser.

    :param sentences: Input sentences for parsing.
    :type sentences: list(str)
    :return: list(Tree)
    """
    # Create Stanford parser.
    stanford_parser = CoreNLPPOSTagger()
    # Create a list to store all sentences' parse trees.
    parsed_sentences_trees = []
    # Create parse trees and store them in the list.
    for sentence in sentences:
        for line in stanford_parser.raw_parse(sentence):
            temp_tree = Tree.fromstring(str(line))
            parsed_sentences_trees.append(temp_tree)
    return parsed_sentences_trees
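
# A brief usage sketch for `create_parse_trees`, assuming `Tree` is `nltk.tree.Tree` and that the
# tagger's default CoreNLP server URL is reachable; the sentence is illustrative.
from nltk.tree import Tree  # the Tree class used above

trees = create_parse_trees(["The cat sat on the mat."])
for t in trees:
    t.pretty_print()  # render the constituency tree as ASCII art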
def __init__(self, src_folder="../data/", freq_thre=100, corenlp_path="../stanford-corenlp/",
             ner_path="../stanford-ner/", verbose=False):
    # initialization
    self.src = os.path.join(src_folder, "reviews/")
    self.corenlp_path = os.path.normpath(corenlp_path) + "/"
    self.stanford_ner_path = os.path.normpath(ner_path) + "/"
    self.frequency_threshold = freq_thre
    self.dst = os.path.join(src_folder, "lexicon/candidates.json")
    self.dst_allReviews = os.path.join(src_folder, "allReviews/")
    self.dst_ner_tsv = os.path.join(src_folder, "ner_tsv/")
    self.dst_ne = os.path.join(src_folder, "ne/")
    self.verbose = verbose
    # pick up sentiment words
    self.pos_tags = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS"]
    self.pos_tagged_statistics = {}
    # it is based on CoreNLP, a new version of stanford pos tagger
    self.pos_tagger = CoreNLPPOSTagger()
    self.stemmer = SnowballStemmer("english")
    self.stopwords = set(stopwords.words("english"))
    # remove `not` because we need combine `not` and sentiment words
    self.stopwords.remove("not")
def modify(self):
    url = "http://localhost:9000/tregex"
    request_params = {"pattern": " SBAR|VP|NP=app $, /,/ "}
    # text = "Mexico City, the biggest city in the world, has many interesting archaeological sites."
    text = self.text
    # print(text)
    r = requests.post(url, data=text, params=request_params)
    json_data = json.loads(r.text)
    text1 = json_data['sentences'][0]['0']['match']
    tree = nltk.Tree.fromstring(text1, read_leaf=lambda x: x.split("/")[0])
    line = tree.leaves()
    appos = ''
    begin_text = ''
    for tag in line:
        appos = appos + tag + ' '
    text = text.replace(',', '')
    result = text.index(appos)
    text = text.replace(appos, '')
    for x in range(0, result):
        begin_text = begin_text + text[x]
    doc = nlp(begin_text)
    for ent in doc.ents:
        sub_ent = ent.label_
        if sub_ent == 'GPE' or sub_ent == 'LOC':
            text = text.replace(begin_text, '')
    text1 = CoreNLPPOSTagger(url='http://localhost:9000').tag(text.split())
    for tagg in text1:
        # line = 'She ate the fruits.'
        if tagg[1] == "VBD":
            tense = "was"
        # line = 'We eat the fruits.'
        if tagg[1] == "VBP":
            tense = "is"
        # line = 'She eats the fruits.'
        if tagg[1] == "VBZ":
            tense = "is"
    qts = "Which/Where"
    qts = qts + ' ' + tense + ' ' + appos + '?'
    return qts
class StanfordPOSAnnotator(Annotator):

    def __init__(self, config_path=DEFAULT_CONFIG_PATH):
        self.config = load_config(config_path)
        corenlp_config = self.config["data"]["stanford_corenlp"]
        self.tagger = CoreNLPPOSTagger(
            url="http://%s:%d" % (corenlp_config["host"], corenlp_config["port"]))
        self.pos_map = self.config["model"]["STANFORD_POS_MAP"]

    def annotate(self, annotable):
        if (annotable.__class__.__name__ == "Document"):
            return self.annotate_document(annotable)
        elif (annotable.__class__.__name__ == "Sentence"):
            return self.annotate_sentence(annotable)
        else:
            raise AnnotationError(
                "This annotator only accepts Document or Sentence annotables.")

    def annotate_document(self, document):
        for sentence in document.sentences:
            self.annotate_sentence(sentence)

    def annotate_sentence(self, sentence):
        token_list = [token.surface for token in sentence.tokens]
        tagged_tokens = self.tagger.tag(token_list)
        for i in range(len(token_list)):
            sentence.tokens[i].annotations["STANFORD_POS"] = tagged_tokens[i][1]
            for pos_rgx in self.pos_map:
                if (re.match(pos_rgx, tagged_tokens[i][1])):
                    sentence.tokens[i].annotations["POS"] = self.pos_map[pos_rgx].split("|")[0]
            if ("POS" not in sentence.tokens[i].annotations):
                sentence.tokens[i].annotations["POS"] = "x"
from projectFiles import utils
from projectFiles.Utils import xlsxUtils
import pandas as pd
import numpy as np
from nltk.tag.stanford import CoreNLPPOSTagger
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')
tagger = CoreNLPPOSTagger(url='http://localhost:9000')


def calculate_semantic_sim(word1, word2, pos1='n', pos2='n'):
    list_of_synsets1 = wn.synsets(word1, pos=pos1)
    list_of_synsets2 = wn.synsets(word2, pos=pos2)
    if not list_of_synsets1:
        return 0.0001
    if not list_of_synsets2:
        return 0.0001
    s1 = list_of_synsets1[0]
    s2 = list_of_synsets2[0]
    total_value = 0
    value = s1.wup_similarity(s2)
    value = utils.limit_value(value, 0.0001, 1.0)
    total_value += value
from nltk.tag.stanford import CoreNLPNERTagger, CoreNLPPOSTagger
from nltk.tokenize.stanford import CoreNLPTokenizer

stpos, stner = CoreNLPPOSTagger('http://localhost:9001'), CoreNLPNERTagger(
    'http://localhost:9001')
sttok = CoreNLPTokenizer('http://localhost:9001')

sttok.tokenize(u'你好')
stpos.tag(u'basf')
stpos.tag(sttok.tokenize(u'text'))
stner.tag(u'你好')
stner.tag(sttok.tokenize(u'你好'))
def main(): information = {} sentences = set() for items in relations[pair]: sentences.add(items[5]) information[items[5]] = (items[0], items[1], items[2], items[4]) sentences = list(sentences) l = len(sentences) corpus = [] topics = [] for sentence in sentences: if pos_flag: words = sentence.split() words[-1] = words[-1].strip() tagged_words = CoreNLPPOSTagger( url='http://localhost:9000').tag(words) if len(words) != len(tagged_words): tagged_words = pos_tag(words) # print(tagged_words) # print(sentence.strip()) for phrase in stop_phrases: n = len(phrase) for i in range(len(tagged_words) - n + 1): if phrase == words[i:i + n]: for j in range(i, i + n): tagged_words[j] = (None, tagged_words[j][1]) i = 0 indices = [] keywords = [] for (word, tag) in tagged_words: if word in pair: indices.append(i) keywords.append(word) i += 1 elif word not in stop_words and tag in pos_tag_set and word is not None: keywords.append(word) i += 1 # topics.append(" ".join(keywords)) # topics.append(sentence.strip()) if len(keywords) <= 10 and flag: ws = [w for w in keywords if w not in pair] else: ws = [] # if len(indices) == 2: # for j in range(len(keywords)): # # if j > indices[0] and j <= indices[0] + 4 and keywords[j] not in pair and j < indices[1]: # ws.append(keywords[j]) # elif j >= indices[1] - 2 and j <= indices[1] + 2 and keywords[j] not in pair: # ws.append(keywords[j]) # else: if True: for j in range(len(keywords)): for i in indices: if j >= i - 2 and j <= i + 2 and keywords[ j] not in pair and keywords[j] not in ws: ws.append(keywords[j]) break # with open(keywords_path, "a") as keywords_file: # keywords_file.write(",".join(ws)+"\n") # keywords_file.write(sentence+"\n") corpus.append(ws) topics.append(" ".join(ws)) else: corpus.append([w for w in sentence.split() if w not in stop_words]) if query_flag: with open(os.path.join(os.pardir, "keywords", "corpus.pkl"), 'wb') as corpus_file: pickle.dump(corpus, corpus_file) with open(os.path.join(os.pardir, "keywords", "sentences.pkl"), 'wb') as sentences_file: pickle.dump(sentences, sentences_file) else: # Prepare word2vector model fname = os.path.join(os.pardir, "data", "mymodel") model = gensim.models.Word2Vec.load(fname) model.init_sims(replace=True) # Build weighted graph # dictionary = Dictionary(corpus) # bow_corpus = [dictionary.doc2bow(document) for document in corpus] index = WmdSimilarity(corpus, model) G = nx.Graph() for i in range(l - 1): sims = index[corpus[i]] # print("query:") # print(corpus[i]) # print(sentences[i]) # print("sims:") for j in range(i + 1, l): # print(sims[j]) # print(corpus[j]) # print(sentences[j]) # print() shreshold = set_shreshold(len(corpus[i]), len(corpus[j])) if sims[j] >= shreshold: if i not in G: G.add_node(i) if j not in G: G.add_node(j) G.add_edge(i, j) # G.add_edge(i, j, weight=sims[j]) out_path = os.path.join( os.pardir, "{}_{}_{}.txt".format("&".join(pair), G.number_of_nodes(), l)) # image_path = os.path.join(os.pardir, com_dir, "{}_{}_{}.png".format("&".join(pair), G.number_of_nodes(), l)) # Draw graph pos = nx.spring_layout(G) plt.figure(figsize=(19, 12)) plt.axis('off') nx.draw_networkx_nodes(G, pos, node_size=50) nx.draw_networkx_edges(G, pos, width=0.75) #first compute the best partition communities = [] partition = community.best_partition(G) for com in set(partition.values()): list_nodes = [ nodes for nodes in partition.keys() if partition[nodes] == com ] communities.append(list_nodes) num = 0 graph_indices = set() bloblist = [] clusters = [] for com in communities: if len(com) > 1: doc = "" for i in com: 
doc += topics[i] + " " bloblist.append(tb(doc)) clusters.append(com) aspects[pair] = set() new_aspects[pair] = {} # if True: with open(out_path, "a") as out_file: for i, blob in enumerate(bloblist): # print("Top words in document {}".format(i + 1)) scores = {word: tfidf(word, blob, bloblist) for word in blob.words} sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True) # word_num = 0 aspect_keywords = [] for word, score in sorted_words[:3]: out_file.write(word + ", ") aspect_keywords.append(word) new_aspects[pair][" ".join(aspect_keywords)] = set() # for word, score in sorted_words: # if word_num == 3: # break # if tf(word, blob) >= 0.2: # word_num += 1 # out_file.write(word+", ") # print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5))) out_file.write( "---------------------------------------------------\n\n") for j in clusters[i]: temp = information[sentences[j]] new_aspects[pair][" ".join(aspect_keywords)].add( (temp[0], temp[1], temp[2], temp[3], sentences[j])) aspects[pair].add( (temp[0], temp[1], temp[2], " ".join(aspect_keywords), temp[3], sentences[j])) out_file.write(",".join(corpus[j]) + "\n") out_file.write(sentences[j] + "\n") graph_indices.add(j) num += 1 out_file.write( "other---------------------------------------------------\n\n") new_aspects[pair]["other"] = set() for j in range(len(sentences)): if j not in graph_indices: temp = information[sentences[j]] new_aspects[pair]["other"].add( (temp[0], temp[1], temp[2], temp[3], sentences[j])) aspects[pair].add( (temp[0], temp[1], temp[2], "", temp[3], sentences[j])) out_file.write(",".join(corpus[j]) + "\n") out_file.write(sentences[j] + "\n") plt.close('all')
import nltk
from nltk.parse.corenlp import CoreNLPParser
from nltk.tag.stanford import CoreNLPPOSTagger
from pycorenlp import StanfordCoreNLP

from brain import memory
from brain.conjugator import conjugator

DEFAULT_TAGS = ['NNP', 'NNPS', 'NN', 'NNS']
KEYWORD_TAGS = DEFAULT_TAGS[:]
KEYWORD_TAGS.extend(['VBG'])
#KEYWORD_TAGS.extend(['VBG', 'PRP', 'PRP$', 'WP', 'WP$', 'WRB', 'WDT'])

STANFORD_TAGGER = CoreNLPPOSTagger('http://localhost:9000/')
STANFORD_SERVER = StanfordCoreNLP('http://localhost:9000/')
STANFORD_PARSER = CoreNLPParser('http://localhost:9000/')


def combine_similar(input, tags):
    output = []
    curr = []
    tag = ""
    for x in input:
        if x[1] not in tags:
            if len(curr) > 0:
                output.append((" ".join([x[0] for x in curr]), tag))
                curr[:] = []
                tag = ""
            output.append(x)
        elif x[1] == tag:
            curr.append(x)
def modify(self): url = "http://localhost:9000/tregex" request_params = { "pattern": " RB=n1 > (ADVP >> (S=n2 > ROOT)) | > (ADJP >> (S=n2 > ROOT))" } text = self.text # print(text) r = requests.post(url, data=text, params=request_params) json_data = json.loads(r.text) text1 = json_data['sentences'][0]['0']['match'] tree = nltk.Tree.fromstring(text1, read_leaf=lambda x: x.split("/")[0]) str1 = tree.leaves() adverb = ' '.join(str1) text_pos = CoreNLPPOSTagger(url='http://localhost:9000').tag( text.split()) #print(text_pos) c = 0 for tagg in text_pos: if (c == 0 and tagg[1] != "NNP" and tagg[0] != 'I'): s = tagg[0].lower() text = text.replace(tagg[0], s) #line = 'He ran quickly.' if tagg[1] == "VBD" and text_pos[c][0] != 'had' and text_pos[ c + 1][1] != 'VBG': verb_tense = "did" root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v') text = text.replace(tagg[0], root_verb) #line = 'I run quickly.' if tagg[1] == "VBP" and text_pos[c][0] != 'is' and text_pos[c][ 0] != 'are' and text_pos[c][0] != 'have': verb_tense = "do" root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v') text = text.replace(tagg[0], root_verb) #line = 'John runs quickly.' if tagg[1] == "VBZ" and text_pos[c + 1][1] != 'VBN' and text_pos[ c + 1][1] != 'VBG': verb_tense = "does" root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v') text = text.replace(tagg[0], root_verb) #line = 'John is playing quietly.' #line = 'John was playing quietly.' #line = 'John is going to play quietly.' if tagg[1] == "VBG" and text_pos[c - 1][1] != 'VB' and text_pos[ c - 1][1] != 'VBN': verb_tense = text_pos[c - 1][0] text = text.replace(text_pos[c - 1][0] + " ", "") #line = 'John has ran quickly.' if tagg[1] == "VBZ" and text_pos[ c + 1][1] == 'VBN' and text_pos[c + 2][1] != "VBG": verb_tense = text_pos[c][0] text = text.replace(text_pos[c][0] + " ", "") #line = 'John will be playing quietly.' if tagg[1] == "VBG" and text_pos[c - 1][1] == 'VB': verb_tense = text_pos[c - 2][0] text = text.replace(text_pos[c - 2][0] + " ", "") #line = 'John has been playing quietly.' #line = 'John had been playing quietly.' if (tagg[1] == "VBZ" or tagg[1] == "VBD") and text_pos[ c + 1][1] == 'VBN' and text_pos[c + 2][1] == 'VBG': verb_tense = text_pos[c][0] text = text.replace(text_pos[c][0] + " ", "") #line = 'John had left quietly.' #line = 'We have eaten the meal quietly.' if tagg[1] == "VBN" and tagg[0] != 'been' and ( text_pos[c - 1][0] == 'had' or text_pos[c - 1][0] == 'have') and text_pos[c - 2][1] != 'MD': verb_tense = text_pos[c - 1][0] text = text.replace(text_pos[c - 1][0] + " ", "") #line = 'John will run quickly.' #line = 'John would have ran quickly.' if tagg[1] == "MD" and text_pos[c + 1][1] == 'VB': verb_tense = text_pos[c][0] text = text.replace(text_pos[c][0] + " ", "") c = c + 1 """ obj="" for i in line: classified_text = st.tag(word_tokenize(i)) if classified_text[0][1]!='PERSON': break obj = obj + classified_text[0][0]+ " " """ text = text.replace(".", " ?") text = text.replace(adverb, "") Q = 'How ' + verb_tense + ' ' + text return Q
class SelectCandidates: """ This program aims to select candidate words from reviews We picks up sentiment words and handles the negation problem The result will be stored in `src_folder/lexicon/candidates.json` """ def __init__(self, src_folder="../data/", freq_thre=100, corenlp_path="../stanford-corenlp/",\ ner_path="../stanford-ner/", verbose=False): # initialization self.src = os.path.join(src_folder, "reviews/") self.corenlp_path = os.path.normpath(corenlp_path) + "/" self.stanford_ner_path = os.path.normpath(ner_path) + "/" self.frequency_threshold = freq_thre self.dst = os.path.join(src_folder, "lexicon/candidates.json") self.dst_allReviews = os.path.join(src_folder, "allReviews/") self.dst_ner_tsv = os.path.join(src_folder, "ner_tsv/") self.dst_ne = os.path.join(src_folder, "ne/") self.verbose = verbose # pick up sentiment words self.pos_tags = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS"] self.pos_tagged_statistics = {} # it is based on CoreNLP, a new version of stanford pos tagger self.pos_tagger = CoreNLPPOSTagger() self.stemmer = SnowballStemmer("english") self.stopwords = set(stopwords.words("english")) # remove `not` because we need combine `not` and sentiment words self.stopwords.remove("not") def stanford_ner(self): """ call stanford java ner api """ self.merge_reviews() self.run_ner() self.find_named_entity() def merge_reviews(self): """ merge all reviews for named entity recognition """ if self.verbose: print "Merging all reviews for named entity recognition" + "\n" + "-" * 80 self.create_dir(self.dst_allReviews) for dirpath, dirs, files in os.walk(self.src): for f in files: filename = re.search( "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).json", f).group(1) data = json.load(open(os.path.join(dirpath, f))) with open(os.path.join(self.dst_allReviews, filename + ".txt"), "w+") as rf: for r in data["reviews"]: text = r["review"] # remove accents text = unicodedata.normalize("NFKD", text).encode( "ASCII", "ignore") # remove all website urls written in the review text = re.sub(r"https?:\/\/.*[\r\n]*", " ", text, flags=re.MULTILINE) # remove non english letters or words and numbers text = re.sub( r"[^a-zA-Z!@#$%^&*():;/\\<>\"+_\-.,?=\s\|\']", "", text) # remove extra nextline text = re.sub("(\\n)+", r" ", text) # I'm -> I am text = re.sub(r"'m ", " am ", text) text = re.sub(r"'re ", " are ", text) text = re.sub(r"'s ", " is ", text) text = re.sub(r"'ve ", " have ", text) text = re.sub(r"'d ", " would ", text) text = re.sub(r" won't ", " will not ", text) text = re.sub(r"n't ", " not ", text) text = re.sub(r"'ll ", " will ", text) # remove all punctuations except for , . ? ! ; : and - # -: composite adj. text = re.sub("[^\w\s,.?!;:\-]|\_", r" ", text) # Space out every sign & symbol & punctuation text = re.sub("([^\w\s])", r" \1 ", text) text = text.replace("\'", "") # remove ` - `, ` -`, `- ` text = re.sub(r"(\-)+", "-", text) text = re.sub( r"(\s)+\-(\s)+|(\s)+\-|\-(\s)+|(\A)\-|\-(\Z)", " ", text) # turn multiple spaces into one text = re.sub(r"(\s)+", " ", text) # remove extra space at both ends of the text text = text.strip() rf.write(text) rf.write("\n\n. 
CHANGE-REVIEW .\n\n") def run_ner(self): """ run shell to call NER """ if self.verbose: print "Running shell to call Stanford NER" + "\n" + "-" * 80 self.create_dir(self.dst_ner_tsv) comm = "java -mx1g -cp \"%s*:%slib/*\" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier %sclassifiers/english.all.3class.distsim.crf.ser.gz -outputFormat tabbedEntities -textFile %s > %s" for dirpath, dirs, files in os.walk(self.dst_allReviews): for f in files: filename = re.search( "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).txt", f).group(1) src_file = os.path.join(dirpath, f) dst_file = os.path.join(self.dst_ner_tsv, filename + ".tsv") command = comm % (self.stanford_ner_path, self.stanford_ner_path, self.stanford_ner_path, src_file, dst_file) subprocess.call(command, shell=True) def find_named_entity(self): """ find named entity from the ner tsv """ if self.verbose: print "Finding named entity from ner tsv files" + "\n" + "-" * 80 self.create_dir(self.dst_ne) for dirpath, dirs, files in os.walk(self.dst_ner_tsv): for f in files: filename = re.search( "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).tsv", f).group(1) src_file = os.path.join(dirpath, f) dst_file = os.path.join(self.dst_ne, filename + ".txt") rs = [set()] with open(src_file, "rb") as tsvin: data = csv.reader(tsvin, delimiter="\t") for r in data: if len(r) != 0 and r[0] != "": if r[1] == "ORGANIZATION" or r[1] == "PERSON" or r[ 1] == "LOCATION": l = r[0].split(" ") for i in l: if (i, r[1]) not in rs: rs[-1].add((i, r[1])) elif len(r) > 2 and "CHANGE-REVIEW" in r[2]: rs.append(set()) with open(dst_file, "w+") as rf: for rs_index in range(len(rs) - 1): rf.write(str(rs_index) + ",FILEINDEX\n") for i in rs[rs_index]: rf.write(i[0] + "," + i[1] + "\n") def get_sentiment_words(self): """ load all reviews in src folder: data/reviews/ and merge them """ # start Stanford CoreNLP server in a new process comm = "java -mx4g -cp \"%s*\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos -status_port 9000 -port 9000 -timeout 50000" command = comm % (self.corenlp_path) proc = subprocess.Popen(command, shell=True, preexec_fn=os.setsid) time.sleep(10) # wait for starting Stanford CoreNLP server for dirpath, dir_list, file_list in os.walk(self.src): if self.verbose: print "Walking into directory: " + str(dirpath) if len(file_list) > 0: for f in file_list: # in case there is a goddamn .DS_Store file if str(f) == ".DS_Store": if self.verbose: print "Removing " + dirpath + "/" + str(f) os.remove(os.path.join(dirpath, f)) else: with open(os.path.join(dirpath, f)) as fp: entity = json.load(fp) if self.verbose: print "Processing " + "\033[1m" + entity[ "entity"] + "\033[0m" + " in " + "\033[1m" + entity[ "category"] + "\033[0m" self.analyze_part_of_speech(entity["reviews"], f) else: if self.verbose: print "No file is found in " + str(dirpath) os.killpg(os.getpgid(proc.pid), signal.SIGTERM) if self.verbose: print "Part of Speech Analysis on Reviews are Done" print "-" * 80 def analyze_part_of_speech(self, reviews, filename): """ run nltk.pos_tag to analysis the part_of_speech of every word """ ner_set = self.load_ner_tags(filename) for review_index in range(len(reviews)): text = reviews[review_index]["review"] # remove accents text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore") # remove all website urls written in the review text = re.sub(r"https?:\/\/.*[\r\n]*", " ", text, flags=re.MULTILINE) # remove non english letters or words and numbers text = re.sub(r"[^a-zA-Z!@#$%^&*():;/\\<>\"+_\-.,?=\s\|\']", "", text) # 
remove extra nextline text = re.sub("(\\n)+", r" ", text) # I'm -> I am text = re.sub(r"'m ", " am ", text) text = re.sub(r"'re ", " are ", text) text = re.sub(r"'s ", " is ", text) text = re.sub(r"'ve ", " have ", text) text = re.sub(r"'d ", " would ", text) text = re.sub(r" won't ", " will not ", text) text = re.sub(r"n't ", " not ", text) text = re.sub(r"'ll ", " will ", text) # remove all punctuations except for , . ? ! ; : and - # -: composite adj. text = re.sub("[^\w\s,.?!;:\-]|\_", r" ", text) # space out every sign & symbol & punctuation text = re.sub("([^\w\s])", r" \1 ", text) text = text.replace("\'", "") # remove ` - `, ` -`, `- ` text = re.sub(r"(\-)+", "-", text) text = re.sub(r"(\s)+\-(\s)+|(\s)+\-|\-(\s)+|(\A)\-|\-(\Z)", " ", text) # turn multiple spaces into one text = re.sub(r"(\s)+", " ", text) # remove extra space at both ends of the text text = text.strip() # tokenize tokenized_text = text.split(" ") # remove empty string tokenized_text = [w for w in tokenized_text if w] # pos tag # a list of word tuples # [("great", "JJ"), ("tour", "NN") ...] if len(tokenized_text) == 0: continue word_tuple_list = self.pos_tagger.tag(tokenized_text) # remove stop_words word_tuple_list = [(w[0].lower(), w[1]) for w in word_tuple_list if w[0].lower() not in self.stopwords] # remove empty string word_tuple_list = [(w[0], w[1]) for w in word_tuple_list if w[0]] combine_or_not = False combination_front = "" for word_tuple in word_tuple_list: # putting them into dictionary # add 1 to value if exist # add key and value if not if word_tuple[1] not in self.pos_tags: if combine_or_not: if combination_front in self.pos_tagged_statistics: self.pos_tagged_statistics[combination_front] += 1 else: self.pos_tagged_statistics[combination_front] = 1 combine_or_not = False combination_front = "" elif word_tuple[0] not in ner_set[review_index]: if combine_or_not: if combination_front: combination_front += "_" + word_tuple[0] else: combination_front = word_tuple[0] else: combine_or_not = True combination_front = word_tuple[0] if combine_or_not: if combination_front in self.pos_tagged_statistics: self.pos_tagged_statistics[combination_front] += 1 else: self.pos_tagged_statistics[combination_front] = 1 def stem(self, candidate_lexicon): """ perform stemming on candidate lexicon | candidate lexicon should be a list """ stemmed_lexicon = [] for word in candidate_lexicon: stemmed_word = self.stemmer.stem(word) stemmed_lexicon.append({ "word": word, "stemmed_word": stemmed_word }) stemmed_lexicon = sorted(stemmed_lexicon, key=lambda k: k['word']) if self.verbose: print "\nMerging stemmed duplicates" processed_lexicon = {} length = len(stemmed_lexicon) cnt = 0 for word_dict in stemmed_lexicon: cnt += 1 if word_dict["stemmed_word"] not in processed_lexicon: processed_lexicon[word_dict["stemmed_word"]] = [ word_dict["word"] ] else: processed_lexicon[word_dict["stemmed_word"]].append( word_dict["word"]) if self.verbose: sys.stdout.write("\rStatus: %s / %s" % (cnt, length)) sys.stdout.flush() processed_lexicon = [{ "stemmed_word": key, "word": value } for key, value in processed_lexicon.iteritems()] # sorting dictionaries by word processed_lexicon = sorted(processed_lexicon, key=lambda k: k["stemmed_word"]) return processed_lexicon def load_ner_tags(self, filename): """ load named entity for files """ filename = re.search( "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).json", filename).group(1) ner_set = [] with open(os.path.join(self.dst_ne, filename + ".txt"), "rb") as ne_f: tags = csv.reader(ne_f, delimiter=",") 
for tag in tags: if tag[1] == "FILEINDEX": ner_set.append(set()) else: ner_set[-1].add(tag[0].lower()) return ner_set def render_candidate_lexicon(self): """ render the candidate words """ # filtered by self.frequency_threshold if self.verbose: print "Filtering out frequency lower than frequency_threshold" + "\n" + "-" * 80 self.create_dir(self.dst) pos_tagged_words = [] pos_tagged_words_under_thre = [] for key in self.pos_tagged_statistics: if self.pos_tagged_statistics[key] > self.frequency_threshold: pos_tagged_words.append(key) else: pos_tagged_words_under_thre.append(key) if self.verbose: print "Stemming candidate words" pos_tagged_words = self.stem(pos_tagged_words) pos_tagged_words_under_thre = self.stem(pos_tagged_words_under_thre) ordered_dict_list = [[], []] if self.verbose: print "\nOrganizing candidate words" length = len(pos_tagged_words) for index in range(len(pos_tagged_words)): ordered_dict = OrderedDict() ordered_dict["index"] = index + 1 ordered_dict["count"] = sum([ self.pos_tagged_statistics[w] for w in pos_tagged_words[index]["word"] ]) ordered_dict["stemmed_word"] = pos_tagged_words[index][ "stemmed_word"] ordered_dict["word"] = pos_tagged_words[index]["word"] ordered_dict_list[0].append(NoIndent(ordered_dict)) if self.verbose: sys.stdout.write("\rStatus: %s / %s" % (index + 1, length)) sys.stdout.flush() if self.verbose: print "\nOrganizing candidate words <= frequency threshold" length = len(pos_tagged_words_under_thre) for index in range(len(pos_tagged_words_under_thre)): ordered_dict = OrderedDict() ordered_dict["index"] = index + 1 ordered_dict["count"] = sum([ self.pos_tagged_statistics[w] for w in pos_tagged_words_under_thre[index]["word"] ]) ordered_dict["stemmed_word"] = pos_tagged_words_under_thre[index][ "stemmed_word"] ordered_dict["word"] = pos_tagged_words_under_thre[index]["word"] ordered_dict_list[1].append(NoIndent(ordered_dict)) if self.verbose: sys.stdout.write("\rStatus: %s / %s" % (index + 1, length)) sys.stdout.flush() if self.verbose: print "\n" + "-" * 80 print "Saving data to: \033[1m" + self.dst + "\033[0m" with open(self.dst, "w+") as f_out: f_out.write( json.dumps(ordered_dict_list, indent=4, cls=NoIndentEncoder)) def create_dir(self, new_path): """ create the directory if not exist""" dir1 = os.path.dirname(new_path) if not os.path.exists(dir1): if self.verbose: print "Creating directory: " + dir1 print "-" * 80 os.makedirs(dir1) def run(self): print "Selecting candidate words" + "\n" + "-" * 80 self.stanford_ner() self.get_sentiment_words() self.render_candidate_lexicon() def PrintException(self): exc_type, exc_obj, tb = sys.exc_info() f = tb.tb_frame lineno = tb.tb_lineno filename = f.f_code.co_filename linecache.checkcache(filename) line = linecache.getline(filename, lineno, f.f_globals) print ' Exception in ({}, LINE {} "{}"): {}'.format( filename, lineno, line.strip(), exc_obj)
def genQuestion(line):
    """ outputs question from the given text """
    bucket = {}  # Create an empty dictionary

    # POS tagging
    text = CoreNLPPOSTagger(url='http://localhost:9000').tag(line.split())

    for i, j in enumerate(text):  # text is the part-of-speech tags in English
        if j[1] not in bucket:
            bucket[j[1]] = i  # Add all tags to the dictionary or bucket variable

    if type(line) is str:  # If the passed variable is of type string.
        line = TextBlob(line)  # Create object of type textblob.blob.TextBlob

    question = ''
    l1 = ['NNP', 'VBG', 'VBZ', 'IN']
    l2 = ['NNP', 'VBG', 'VBZ']
    l3 = ['PRP', 'VBG', 'VBZ', 'IN']
    l4 = ['PRP', 'VBG', 'VBZ']
    l5 = ['PRP', 'VBG', 'VBD']
    l6 = ['NNP', 'VBG', 'VBD']
    l7 = ['NN', 'VBG', 'VBZ']
    l8 = ['NNP', 'VBZ', 'JJ']
    l9 = ['NNP', 'VBZ', 'NN']
    l10 = ['NNP', 'VBZ']
    l11 = ['PRP', 'VBZ']
    l12 = ['NNP', 'NN', 'IN']
    l13 = ['NN', 'VBZ']

    # With the use of conditional statements the dictionary is compared with the lists created above
    if all(key in bucket for key in l1):  # 'NNP', 'VBG', 'VBZ', 'IN' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + ' ' + line.words[bucket['VBG']] + '?'
    elif all(key in bucket for key in l2):  # 'NNP', 'VBG', 'VBZ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + ' ' + line.words[bucket['VBG']] + '?'
    elif all(key in bucket for key in l3):  # 'PRP', 'VBG', 'VBZ', 'IN' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['PRP']] + ' ' + line.words[bucket['VBG']] + '?'
    elif all(key in bucket for key in l4):  # 'PRP', 'VBG', 'VBZ' in sentence.
        question = 'What ' + line.words[
            bucket['PRP']] + ' ' + ' does ' + line.words[
            bucket['VBG']] + ' ' + line.words[bucket['VBG']] + '?'
    elif all(key in bucket for key in l7):  # 'NN', 'VBG', 'VBZ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NN']] + ' ' + line.words[bucket['VBG']] + '?'
    elif all(key in bucket for key in l8):  # 'NNP', 'VBZ', 'JJ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + '?'
    elif all(key in bucket for key in l9):  # 'NNP', 'VBZ', 'NN' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + '?'
    elif all(key in bucket for key in l11):  # 'PRP', 'VBZ' in sentence.
        if line.words[bucket['PRP']] in ['she', 'he']:
            question = 'What' + ' does ' + line.words[bucket['PRP']].lower(
            ) + ' ' + line.words[bucket['VBZ']].singularize() + '?'
    elif all(key in bucket for key in l10):  # 'NNP', 'VBZ' in sentence.
        question = 'What' + ' does ' + line.words[bucket[
            'NNP']] + ' ' + line.words[bucket['VBZ']].singularize() + '?'
    elif all(key in bucket for key in l13):  # 'NN', 'VBZ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NN']] + '?'

    # When the tags are generated, 's is split into ' and s. To overcome this issue:
    if 'VBZ' in bucket and line.words[bucket['VBZ']] == "’":
        question = question.replace(" ’ ", "'s ")

    # Print the generated questions as output.
    if question != '':
        print('\n', 'Question: ' + question)
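
# A hedged example of driving `genQuestion`, assuming TextBlob is installed and a CoreNLP server
# is running on localhost:9000; the exact question depends on the tags the server returns.
from textblob import TextBlob  # genQuestion wraps the input line in a TextBlob

genQuestion("Bansoori is an Indian classical instrument.")
# If the sentence is tagged with NNP/VBZ/JJ, this prints something like:
#   Question: What is Bansoori?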
    'ORTH': 'JJR'
}])

nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
add_patterns(matcher)
tech_pair = ["sortedlist", "sorteddictionary"]
tags = []
line = input(">>>")
while (line != "/"):
    flag = False
    tag_list = []
    words = line.split()
    tagged_words = CoreNLPPOSTagger(url='http://localhost:9000').tag(words)
    if len(words) != len(tagged_words):
        tagged_words = pos_tag(words)
    for (word, tag) in tagged_words:
        # for (word, tag) in CoreNLPPOSTagger(url='http://localhost:9000').tag(line.split()):
        if flag:
            word = "." + word
            flag = False
        if tag == "IN" and word in cin:
            tag_list.append("CIN")
        elif word in cv:
            tag_list.append("CV")
        elif word in tech_pair:
            tag_list.append("TECH")
        elif word == ".":
            flag = True
def main(): information = {} sentences = set() for items in relations[pair]: sentences.add(items[5]) information[items[5]] = (items[0], items[1], items[2], items[4]) sentences = list(sentences) l = len(sentences) corpus = [] topics = [] for sentence in sentences: if pos_flag: words = sentence.split() words[-1] = words[-1].strip() tagged_words = CoreNLPPOSTagger(url='http://localhost:9000').tag(words) if len(words) != len(tagged_words): tagged_words = pos_tag(words) # print(tagged_words) # print(sentence.strip()) for phrase in stop_phrases: n = len(phrase) for i in range(len(tagged_words) - n + 1): if phrase == words[i:i+n]: for j in range(i, i+n): tagged_words[j] = (None, tagged_words[j][1]) i = 0 indices = [] keywords = [] for (word, tag) in tagged_words: if word in pair: indices.append(i) keywords.append(word) i += 1 elif word not in stop_words and tag in pos_tag_set and word is not None: keywords.append(word) i += 1 # topics.append(" ".join(keywords)) # topics.append(sentence.strip()) if len(keywords) <= 10 and flag: ws = [w for w in keywords if w not in pair] else: ws = [] # if len(indices) == 2: # for j in range(len(keywords)): # # if j > indices[0] and j <= indices[0] + 4 and keywords[j] not in pair and j < indices[1]: # ws.append(keywords[j]) # elif j >= indices[1] - 2 and j <= indices[1] + 2 and keywords[j] not in pair: # ws.append(keywords[j]) # else: if True: for j in range(len(keywords)): for i in indices: if j >= i - 2 and j <= i + 2 and keywords[j] not in pair and keywords[j] not in ws: ws.append(keywords[j]) break # with open(keywords_path, "a") as keywords_file: # keywords_file.write(",".join(ws)+"\n") # keywords_file.write(sentence+"\n") corpus.append(ws) topics.append(" ".join(ws)) else: corpus.append([w for w in sentence.split() if w not in stop_words]) if query_flag: with open(os.path.join(os.pardir, "keywords", "corpus.pkl"), 'wb') as corpus_file: pickle.dump(corpus, corpus_file) with open(os.path.join(os.pardir, "keywords", "sentences.pkl"), 'wb') as sentences_file: pickle.dump(sentences, sentences_file) else: # Prepare word2vector model fname = os.path.join(os.pardir, "data", "mymodel") model = gensim.models.Word2Vec.load(fname) model.init_sims(replace=True) # Build weighted graph # dictionary = Dictionary(corpus) # bow_corpus = [dictionary.doc2bow(document) for document in corpus] index = WmdSimilarity(corpus, model) def set_shreshold(a, b): if ver_flag: if a == b: return 0.52 return 0.55 - 0.05 ** abs(a - b) else: if a == b: return 0.55 elif a > 3 or b > 3: return 0.55 - 0.1 ** abs(a - b) return 0.55 - 0.05 ** abs(a - b) G = nx.Graph() for i in range(l - 1): sims = index[corpus[i]] # print("query:") # print(corpus[i]) # print(sentences[i]) # print("sims:") for j in range(i + 1, l): # print(sims[j]) # print(corpus[j]) # print(sentences[j]) # print() shreshold = set_shreshold(len(corpus[i]), len(corpus[j])) if sims[j] >= shreshold: if i not in G: G.add_node(i) if j not in G: G.add_node(j) G.add_edge(i, j) # G.add_edge(i, j, weight=sims[j]) out_path = os.path.join(os.pardir, com_dir, "{}_{}_{}.txt".format("&".join(pair), G.number_of_nodes(), l)) image_path = os.path.join(os.pardir, com_dir, "{}_{}_{}.png".format("&".join(pair), G.number_of_nodes(), l)) # Draw graph pos = nx.spring_layout(G) plt.figure(figsize=(19,12)) plt.axis('off') nx.draw_networkx_nodes(G, pos, node_size=50) nx.draw_networkx_edges(G, pos, width=0.75) plt.savefig(image_path) # plt.show() nnodes = G.number_of_nodes() if nnodes < 4: communities = [] communities.append(G.nodes()) elif nnodes <= 15: 
communities_generator = community.girvan_newman(G) temp_communities = next(communities_generator) communities = sorted(map(sorted, temp_communities)) else: if nnodes < 50: part = 2 / 3 else: part = 1 / 3 # Detect communities communities_generator = community.girvan_newman(G) div_flag = True while div_flag: temp_communities = next(communities_generator) communities = sorted(map(sorted, temp_communities)) div_flag = False for com in communities: if len(com) > l * part: div_flag = True break num = 0 graph_indices = set() bloblist = [] clusters = [] for com in communities: if len(com) > 1: doc = "" for i in com: doc += topics[i] + " " bloblist.append(tb(doc)) clusters.append(com) with open(out_path, "a") as out_file: for i, blob in enumerate(bloblist): print("Top words in document {}".format(i + 1)) scores = {word: tfidf(word, blob, bloblist) for word in blob.words} sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True) word_num = 0 for word, score in sorted_words: if word_num == 3: break if tf(word, blob) >= 0.2: word_num += 1 out_file.write(word+", ") print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5))) out_file.write("---------------------------------------------------\n\n") for j in clusters[i]: out_file.write(",".join(corpus[j])+"\n") out_file.write(sentences[j]+"\n") graph_indices.add(j) num += 1 out_file.write("other---------------------------------------------------\n\n") for j in range(len(sentences)): if j not in graph_indices: out_file.write(",".join(corpus[j])+"\n") out_file.write(sentences[j]+"\n")
from nltk.tag.stanford import CoreNLPPOSTagger
from nltk.tag.stanford import CoreNLPNERTagger
from nltk.stem.wordnet import WordNetLemmatizer

url = "http://localhost:9000/tregex"
request_params = {"pattern": " NP=n1 !>> NP >> (VP > (S=n2 > ROOT)) "}
text = "John would have loved Anne."
print(text)
r = requests.post(url, data=text, params=request_params)
json_data = json.loads(r.text)
text1 = json_data['sentences'][0]['0']['match']
tree = nltk.Tree.fromstring(text1, read_leaf=lambda x: x.split("/")[0])
line = tree.leaves()
text_pos = CoreNLPPOSTagger(url='http://localhost:9000').tag(text.split())
c = 0
for tagg in text_pos:
    # line = 'John loved Anne.'
    if tagg[1] == "VBD" and text_pos[c][0] != 'had' and text_pos[
            c + 1][1] != 'VBG':
        verb_tense = "did"
        root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
        text = text.replace(tagg[0], root_verb)
    # line = 'John love Anne.'
    if tagg[1] == "VBP" and text_pos[c][0] != 'is' and text_pos[c][
            0] != 'are' and text_pos[c][0] != 'have':
        verb_tense = "do"
        root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
        text = text.replace(tagg[0], root_verb)
from nltk.tag.stanford import CoreNLPPOSTagger
from nltk.parse.corenlp import CoreNLPDependencyParser
import spacy
from spacy.matcher import Matcher

# dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
#
# while True:
#     parse, = dep_parser.raw_parse(input(">>>"))
#
#     for governor, dep, dependent in parse.triples():
#         print(governor, dep, dependent)

line = input(">>>")
print(line.split(" "))
for (word, tag) in CoreNLPPOSTagger(url='http://localhost:9000').tag(
        line.split(" ")):
    print(word)
    print(tag)
    print(tag)
def modify(self):
    url = "http://localhost:9000/tregex"
    request_params = {"pattern": " NP=n1 !>> NP >> (VP > (S=n2 > ROOT)) "}
    text = self.text
    r = requests.post(url, data=text, params=request_params)
    json_data = json.loads(r.text)
    text1 = json_data['sentences'][0]['0']['match']
    tree = nltk.Tree.fromstring(text1, read_leaf=lambda x: x.split("/")[0])
    line = tree.leaves()
    text_pos = CoreNLPPOSTagger(url='http://localhost:9000').tag(text.split())
    c = 0
    for tagg in text_pos:
        # line = 'John loved Anne.'
        if tagg[1] == "VBD" and text_pos[c][0] != 'had' and text_pos[c + 1][1] != 'VBG':
            verb_tense = "did"
            root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
            text = text.replace(tagg[0], root_verb)
        # line = 'John love Anne.'
        if tagg[1] == "VBP" and text_pos[c][0] != 'is' and text_pos[c][0] != 'are' and text_pos[c][0] != 'have':
            verb_tense = "do"
            root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
            text = text.replace(tagg[0], root_verb)
        # line = 'John loves Anne.'
        if tagg[1] == "VBZ" and text_pos[c + 1][1] != 'VBN' and text_pos[c + 1][1] != 'VBG':
            verb_tense = "does"
            root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
            text = text.replace(tagg[0], root_verb)
        # line = 'John is playing with Anne'         -> Who is John playing with?
        # line = 'John was playing with Anne.'       -> Who was John playing with?
        # line = 'John is going to play with Anne.'  -> Who is John going to play with?
        if tagg[1] == "VBG" and text_pos[c - 1][1] != 'VB' and text_pos[c - 1][1] != 'VBN':
            verb_tense = text_pos[c - 1][0]
            text = text.replace(text_pos[c - 1][0] + " ", "")
        # line = 'John has loved Anne.'
        if tagg[1] == "VBZ" and text_pos[c + 1][1] == 'VBN' and text_pos[c + 2][1] != "VBG":
            verb_tense = text_pos[c][0]
            text = text.replace(text_pos[c][0] + " ", "")
        # line = 'John will be playing with Anne.'
        if tagg[1] == "VBG" and text_pos[c - 1][1] == 'VB':
            verb_tense = text_pos[c - 2][0]
            text = text.replace(text_pos[c - 2][0] + " ", "")
        # line = 'John has been playing with Anne.'
        # line = 'John had been playing with Anne.'
        if (tagg[1] == "VBZ" or tagg[1] == "VBD") and text_pos[c + 1][1] == 'VBN' and text_pos[c + 2][1] == 'VBG':
            verb_tense = text_pos[c][0]
            text = text.replace(text_pos[c][0] + " ", "")
        # line = 'John had loved Anne.'
        # line = 'We have loved Anne.'
        if tagg[1] == "VBN" and tagg[0] != 'been' and (text_pos[c - 1][0] == 'had' or text_pos[c - 1][0] == 'have') and text_pos[c - 2][1] != 'MD':
            verb_tense = text_pos[c - 1][0]
            text = text.replace(text_pos[c - 1][0] + " ", "")
        # line = 'John will have played with Anne.'
        # line = 'John will love Anne.'
        # line = 'John would have loved Anne.'
        if tagg[1] == "MD" and text_pos[c + 1][1] == 'VB':
            verb_tense = text_pos[c][0]
            text = text.replace(text_pos[c][0] + " ", "")
        c = c + 1
    obj = ' '.join(line)
    classified_text = CoreNLPNERTagger(url='http://localhost:9000').tag(obj.split())
    f = 1
    for i in classified_text:
        if i[1] != 'PERSON':
            f = 0
            break
    text = text.replace(obj, '')
    text = text.replace(".", " ?")
    Q = 'Who ' + verb_tense + ' ' + text
    return Q