class Parser():
    """Thin wrapper around a StanfordCoreNLP instance (Galician models)."""

    def __init__(self):
        # Path to the local CoreNLP distribution; previous deployment kept for reference:
        # corenlp_dir = "/export/data/ghpaetzold/simpatico/server_simplifiers/core_nlp/stanford-corenlp-full-2016-10-31/"
        corenlp_dir = "/export/data/cscarton/simpatico/stanford-corenlp-full-2016-10-31/"
        self.corenlp = StanfordCoreNLP(corenlp_dir, memory="4g", properties='galician.myproperties.properties')

    def process(self, sentence):
        """Parse one sentence and return CoreNLP's record for the first sentence.

        :param sentence: raw sentence string
        :return: the first entry of the parser's 'sentences' list
        """
        return self.corenlp.raw_parse(sentence)['sentences'][0]

    def transform(self, parsed):
        """Index a sentence's dependencies by head token.

        :param parsed: CoreNLP sentence dict whose 'dependencies' entries are
            (relation, _, head_index, word, dependent_index) tuples; the index
            fields may be strings.
        :return: {head_index: {relation: [dependent_index, ...]}}
        """
        dict_dep = {}
        for rel, _, head, word, n in parsed['dependencies']:
            # setdefault replaces the original's manual `.keys()` membership
            # checks and two-step dict/list initialisation.
            head_map = dict_dep.setdefault(int(head), {})
            head_map.setdefault(rel, []).append(int(n))
        return dict_dep
class NLPParser(object):
    """NLP parse, including Part-Of-Speech tagging and dependency parse.

    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford Core NLP parser
    """

    def __init__(self, corenlp_dir):
        self.parser = StanfordCoreNLP(corenlp_dir)

    def parse(self, sent):
        """Part-Of-Speech tagging and dependency parse.

        :param sent: string
        :return: a list of tuple (word, pos, dependency)
        """
        parsed = self.parser.raw_parse(sent)
        results = []
        for sentence in parsed['sentences']:
            # Dependency indices are 1-based in the parser output; shift to 0-based.
            deps = [
                {'type': entry[0],
                 'dep': int(entry[2]) - 1,
                 'gov': int(entry[4]) - 1}
                for entry in sentence['dependencies']
            ]
            tokens = [item[0] for item in sentence['words']]
            tags = [item[1]['PartOfSpeech'] for item in sentence['words']]
            results.append((tokens, tags, deps))
        return results
class NLPParser(object):
    """NLP parse, including Part-Of-Speech tagging and dependency parse.

    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford Core NLP parser
    """

    def __init__(self, corenlp_dir):
        self.parser = StanfordCoreNLP(corenlp_dir)

    def parse(self, sent):
        """Part-Of-Speech tagging and dependency parse.

        :param sent: string
        :return: a list of tuple (word, pos, dependency)
        """
        sentences = self.parser.raw_parse(sent)['sentences']
        output = []
        for s in sentences:
            # 1-based parser indices are converted to 0-based token offsets.
            deps = [dict(type=r[0], dep=int(r[2]) - 1, gov=int(r[4]) - 1)
                    for r in s['dependencies']]
            words = [t[0] for t in s['words']]
            tags = [t[1]['PartOfSpeech'] for t in s['words']]
            output.append((words, tags, deps))
        return output
class NLPParser(object):
    """NLP parse, including Part-Of-Speech tagging.

    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford Core NLP parser
    """

    def __init__(self, corenlp_dir):
        self.parser = StanfordCoreNLP(corenlp_dir)
        #self.parser = POSTagger(corenlp_dir+'/models/english-bidirectional-distsim.tagger', corenlp_dir+'/stanford-postagger.jar')

    def parse(self, sent):
        """Part-Of-Speech tagging

        :param sent: string
        :return: a list of tuple (tokens, pos)

        NOTE(review): tokens/POS are collected twice per sentence -- once from
        the 'words' entries and once re-scanned out of the 'parsetree' string --
        and the two accumulator lists are shared across all sentences (every
        tuple aliases the same lists). Preserved exactly as the original behaved.
        """
        result = self.parser.raw_parse(sent)
        tuples = []
        tokens, tags = [], []
        # Patterns over CoreNLP's bracketed token dump, e.g. "[Text=dog ... PartOfSpeech=NN ...]"
        token_start = re.compile(r'\[Text=')
        text_re = re.compile(r'\[Text=[^\s]+\s')
        pos_re = re.compile(r'PartOfSpeech=[^\s]+\s')
        for s in result['sentences']:
            for entry in s['words']:
                tokens.append(entry[0])
                tags.append(entry[1]['PartOfSpeech'])
            starts = [m.start() for m in token_start.finditer(s['parsetree'])]
            for i, begin in enumerate(starts):
                # Last token runs to -1 (drops the final character), as before.
                finish = starts[i + 1] if i + 1 < len(starts) else -1
                segment = s['parsetree'][begin:finish]
                # Strip the "[Text=" / "PartOfSpeech=" prefixes and trailing whitespace.
                tokens.append(text_re.findall(segment)[0][6:-1])
                tags.append(pos_re.findall(segment)[0][13:-1])
            tuples.append((tokens, tags))
        #print tuples
        return tuples
def sentToParse(Res, num_sents):
    """Parse the first ``num_sents`` entries of ``Res`` with Stanford CoreNLP.

    :param Res: list of dicts with 'ref_id' and 'sent' keys; each processed
        entry gets a 'parse' key added in place (first sentence's parse).
    :param num_sents: number of entries to parse; a negative value means
        "parse all of Res".
    """
    # load corenlp (deferred import: the parser lives inside the project tree)
    sys.path.insert(0, osp.join(ROOT_DIR, 'pyutils', 'corenlp'))
    from corenlp import StanfordCoreNLP
    parser_path = osp.join(ROOT_DIR, 'pyutils', 'corenlp', 'stanford-corenlp-full-2015-01-30')
    stanfordParser = StanfordCoreNLP(parser_path)
    # Negative num_sents means "all"; the original computed this twice.
    num_sents = len(Res) if num_sents < 0 else num_sents
    print('stanford parser loaded.')
    # start parsing
    for i in range(num_sents):
        ref_id, sent = Res[i]['ref_id'], Res[i]['sent']
        parse = stanfordParser.raw_parse(sent)['sentences'][0]
        Res[i]['parse'] = parse
        print('%s/%s sent is parsed.' % (i + 1, num_sents))
class StringProcessor(object):
    """Tokenize or parse a string.

    Wraps a StanfordCoreNLP instance and translates its raw output into the
    project's ``Sentence``/``Word``/``Dependency`` records.

    NOTE(review): Python 2 code (``unicode`` built-in, ``except(...)`` style).
    """

    def __init__(self, project):
        """Instantiate and ready the parser. Note that readying the parser takes some time.

        :param project: the project this processor logs against (also stored
            on every dependency it creates).
        """
        self.parser = StanfordCoreNLP(app.config["CORE_NLP_DIR"])
        self.project = project

        # Module-level logger so free functions in this module share it.
        logger = logging.getLogger(__name__)
        global project_logger
        project_logger = ProjectLogger(logger, project)

    def tokenize(self, txt):
        """Turn a string of one or more ``Sentence``\s into a list of
        ``Sentence`` objects. This method will also tokenize each word in txt,
        find its PoS, lemma, and space_before.

        :param str txt: One or more sentences, in a string format.
        :return list: A list of document.Sentence objects.
        """
        sentences = []
        for sentence_text in split_sentences(txt):
            sentence = self.parse_with_error_handling(sentence_text)
            sentences.extend(tokenize_from_raw(sentence, sentence_text,
                self.project))
        return sentences

    def parse(self, sentence, relationships=None, dependencies=None,
        max_length=30):
        """Parse a ``Sentence`` and extract dependencies, parse trees, etc.

        Note that for max_length, a "word" is defined as something with a space
        on at least one side. This is not the typical definition of "word".
        This is done so that length can be checked before resources are
        committed to processing a very long sentence.

        :param Sentence sentence: The ``Sentence`` object.
        :param dict relationships: cache of GrammaticalRelationship rows keyed
            by relation name (deduplication across calls).
        :param dict dependencies: cache of Dependency rows keyed by
            (relation name, governor id, dependent id).
        :param int max_length: The most amount of words to process.
        """
        parsed = self.parse_with_error_handling(sentence.text)

        # If the parse was unsuccessful, exit
        if parsed == None:
            return

        parsed_sentence = parsed["sentences"][0]

        if len(parsed["sentences"]) > 1:
            project_logger.warning("More than one sentence passed in to"
                " StringProcessor.parse().")
            # Merge the overflow text back so nothing is silently dropped.
            parsed_sentence["text"] += parsed["sentences"][1]["text"]

        for dependency in parsed_sentence["dependencies"]:
            # We don't want to make a dependency involving ROOT
            # (index 0 is CoreNLP's virtual ROOT node).
            if int(dependency[2]) > 0 and int(dependency[4]) > 0:
                governor = dependency[1]
                dependent = dependency[3]
                governor_index = int(dependency[2]) - 1
                dependent_index = int(dependency[4]) - 1
                governor_pos = parsed_sentence["words"][governor_index][1]\
                    ["PartOfSpeech"]
                governor_lemma = parsed_sentence["words"][governor_index][1]\
                    ["Lemma"]
                dependent_pos = parsed_sentence["words"][dependent_index][1]\
                    ["PartOfSpeech"]
                dependent_lemma = parsed_sentence["words"][dependent_index][1]\
                    ["Lemma"]
                grammatical_relationship = dependency[0]

                # If dictionaries are present, run with duplication handling
                if relationships != None and dependencies != None:
                    key = grammatical_relationship

                    if key in relationships.keys():
                        relationship = relationships[key]
                    else:
                        try:
                            relationship = GrammaticalRelationship.query.\
                                filter_by(name = grammatical_relationship).\
                                one()
                        except(MultipleResultsFound):
                            project_logger.error("duplicate records found "
                                "for: %s", str(key))
                        except(NoResultFound):
                            relationship = GrammaticalRelationship(
                                name = grammatical_relationship)
                        relationships[key] = relationship

                    # Read the data for the governor, and find the
                    # corresponding word
                    governor = Word.query.filter_by(
                        word = governor,
                        lemma = governor_lemma,
                        part_of_speech = governor_pos
                    ).first()

                    # Same as above for the dependent in the relationship
                    dependent = Word.query.filter_by(
                        word = dependent,
                        lemma = dependent_lemma,
                        part_of_speech = dependent_pos
                    ).first()

                    try:
                        # .first() may return None; attribute access raises then.
                        governor.id
                        dependent.id
                    except:
                        project_logger.error("Governor or dependent not "
                            "found; giving up on parse. This likely indicates"
                            " an error in the preprocessing; rerunning the "
                            "preprocessor is recommended.")
                        project_logger.info(sentence)
                        return sentence

                    key = (relationship.name, governor.id, dependent.id)

                    if key in dependencies.keys():
                        dependency = dependencies[key]
                    else:
                        try:
                            dependency = Dependency.query.filter_by(
                                grammatical_relationship = relationship,
                                governor = governor,
                                dependent = dependent
                            ).one()
                        except(MultipleResultsFound):
                            # NOTE(review): self.logg_error looks like a typo
                            # (project_logger.error elsewhere) — confirm.
                            self.logg_error(("duplicate records found for: %s",
                                str(key)))
                        except(NoResultFound):
                            dependency = Dependency(
                                grammatical_relationship = relationship,
                                governor = governor,
                                dependent = dependent
                            )
                        dependencies[key] = dependency

                    # Add the dependency to the sentence
                    sentence.add_dependency(
                        dependency = dependency,
                        governor_index = governor_index,
                        dependent_index = dependent_index,
                        project = self.project,
                        force = False
                    )

                    dependency.save(False)

                else:
                    # TODO: fill
                    pass

        return sentence

    def parse_with_error_handling(self, text):
        """Run the parser and handle errors properly.

        Also checks the sentence text for irregularities that may break the
        parser and handles it before proceeding.

        Any failure will cause this method to return None

        :param str text: The text of the sentence to check
        """
        # Check for non-string
        if not isinstance(text, str) and not isinstance(text, unicode):
            project_logger.warning("Parser got a non-string argument: %s",
                text)
            return None

        # Check for non-unicode
        if not isinstance(text, unicode):
            # Try to convert the string to unicode if possible
            # Unit test: should fail with this example:
            # http://stackoverflow.com/questions/6257647/convert-string-to-unicode
            try:
                text = unicode(text)
            except(UnicodeDecodeError):
                project_logger.warning("The following sentence text is "
                    "not unicode; convertion failed.")
                project_logger.info(text)

                # Skip sentence if flag is True
                if app.config["SKIP_SENTENCE_ON_ERROR"]:
                    return None
                else:
                    # Try to parse the sentence anyway
                    project_logger.warning("Attempting to parse "
                        "non-unicode sentence.")

        # Check for empty or nonexistent text
        if text == "" or text == None:
            return None

        # Check for irregular characters
        # TODO: what are considered irregular characters?

        # Try to parse, catch errors
        parsed_text = None
        try:
            parsed_text = self.parser.raw_parse(text)
        # TODO: handle all errors properly
        # ProcessError, TimeoutError, OutOfMemoryError
        except TimeoutError as e:
            project_logger.error("Got a TimeoutError: %s", str(e))
            return None
        except ProcessError as e:
            project_logger.error("Got a ProcessError: %s", str(e))
            return None
        except:
            project_logger.error("Unknown error")
            return None

        # Parse successful, return parsed text
        return parsed_text
# Stand-alone preprocessing fragment: run CoreNLP over pickled aligned items,
# attaching a constituency tree and the dependency list to each item.
corenlp_dir = "/NLP_TOOLS/tool_sets/stanford-corenlp/stanford-corenlp-full-2015-04-20/"
parser = StanfordCoreNLP(corenlp_dir)
print("Stanford loaded")
# Captures the "(ROOT..." line out of CoreNLP's 'parsetree' string.
tree_re = re.compile(r"\(ROOT.*")
cachedAligned = []
for aligned in pickleFile:
    if aligned is None:
        continue
    # Force into ASCII, dropping undecodable bytes (Python 2 unicode()).
    text = unicode(str(aligned), errors='replace').encode('ascii', 'ignore')
    # NOTE(review): the matching `try:` below is commented out, which leaves
    # the `except:` block orphaned — this fragment cannot run as-is; the
    # `try:` should be restored (it guarded the raw_parse call).
    # try:
    results = parser.raw_parse(text)
    aligned.tree = []
    aligned.dependencies = []
    for s in results['sentences']:
        aligned.tree.append(tree_re.search(s['parsetree']).group(0))
        aligned.dependencies += s['dependencies']
    except:
        print(text)
        print( "Unexpected error:", sys.exc_info()[0])
    cachedAligned.append(aligned)
    # NOTE(review): fragment is truncated here — the body of this `if`
    # (presumably a periodic checkpoint every 10 items) is not visible.
    if len(cachedAligned) % 10 == 0:
class MyExtract(object):
    '''
    Corpus/feature extraction helper: reads BioC-style XML, builds word and
    document embedding spaces (gensim), and collects positive/negative
    entity-relation instances for the CDR and DDI corpora.

    NOTE(review): Python 2 code throughout (print statements, backtick repr,
    indexable dict.keys()).
    '''

    def __init__(self):
        '''
        constructor
        '''
        self.rawcorpus = None     # raw concatenated text (utf-8 bytes)
        self.corpus = []          # tokenized, stopword-filtered sentences
        self.pars = []            # doc2vec LabeledSentence paragraphs
        self.wordspace = None     # gensim Word2Vec model
        self.docspace = None      # gensim Doc2Vec model
        self.stop = set(stopwords.words('english'))
        self.parser = None        # StanfordCoreNLP, set by loadParser()
        self.prelations = []      # positive relation instances
        self.nrelations = []      # negative relation instances

    def buildRawCorpus(self, myfile):
        ''' extract text from xml files '''
        corpus = ""
        for txtfile in glob.glob(devdata + myfile):
            print "reading " + txtfile
            xmldoc = minidom.parse(txtfile)
            itemlist = xmldoc.getElementsByTagName('text')
            for s in itemlist:
                text = s.firstChild.data
                # Only keep nodes that look like sentences (contain a period).
                if "." in text:
                    corpus = corpus + " " + text
        self.rawcorpus = corpus.encode("utf-8")

    def buildCorpus(self):
        ''' preprocess raw text (tokenize, remove stopwords) '''
        sents = self.rawcorpus.split(".")
        for sent in sents:
            toks = [
                w.lower() for w in nltk.word_tokenize(sent.decode('utf-8'))
                if w.lower() not in self.stop
            ]
            self.corpus.append(toks)

    def tokenizeAbs(self, parag):
        ''' preprocess raw text (tokenize, remove stopwords) '''
        toks = [
            w.lower() for w in nltk.word_tokenize(parag)
            if w.lower() not in self.stop
        ]
        return toks

    def buildRawSents(self, myfile):
        ''' build one LabeledSentence per <document>, tagged "<file>_<index>" '''
        for txtfile in glob.glob(devdata + myfile):
            xmldoc = minidom.parse(txtfile)
            itemlist0 = xmldoc.getElementsByTagName('document')
            count = 0
            for it0 in itemlist0:
                parag = ""
                itemlist = it0.getElementsByTagName('text')
                for item in itemlist:
                    if '.' in item.firstChild.data:
                        parag = parag + " " + item.firstChild.data
                toks = self.tokenizeAbs(parag.encode("utf-8").decode('utf-8'))
                lab = [txtfile + '_' + `count`]
                self.pars.append(doc2vec.LabeledSentence(words=toks, tags=lab))
                count = count + 1

    def exploreCDRCorpus(self, myfile, maxsize):
        ''' extract entities + relations from xml '''
        diseases = {}   # MeSH code -> MyEntity
        chemicals = {}  # MeSH code -> MyEntity
        relations = []  # "key1_key2" strings of known positive pairs
        xmldoc = minidom.parse(myfile)
        itemlist0 = xmldoc.getElementsByTagName('document')
        count = 0
        for it0 in itemlist0:
            print "\t- processing abstract " + `count`
            # Paragraph vector for this abstract (requires loadDocSpace first).
            parsed = self.docspace.docvecs[myfile + "_" + `count`]
            itemlist1 = it0.getElementsByTagName('annotation')
            print "\t\t+ " + `len(itemlist1)` + " entities"
            for it1 in itemlist1:
                itemlist2 = it1.getElementsByTagName('infon')
                typ = itemlist2[0].firstChild.data
                mesh = itemlist2[len(itemlist2) - 1].firstChild.data
                text = it1.getElementsByTagName(
                    'text')[0].firstChild.data.lower()
                # A composite annotation may carry several MeSH codes.
                codes = mesh.split('|')
                for code in codes:
                    ent = MyEntity(text, code, typ)
                    if (typ == 'Chemical'):
                        chemicals[code] = ent
                    if (typ == 'Disease'):
                        diseases[code] = ent
            itemlist3 = it0.getElementsByTagName('relation')
            print "\t\t+ " + `2 * len(
                itemlist3)` + " positive and negative relations"
            print "\t\t\t* extracting features for positive relations"
            print "\t\t\t* extracting features for negative relations"
            for it3 in itemlist3:
                itemlist4 = it3.getElementsByTagName('infon')
                key1 = itemlist4[1].firstChild.data
                key2 = itemlist4[2].firstChild.data
                e1 = chemicals[key1]
                e2 = diseases[key2]
                e1.bow = self.avgBOW(e1.text)
                e2.bow = self.avgBOW(e2.text)
                rel = MyRelation(e1, e2, '1')
                rel.abs = parsed
                self.prelations.append(rel)
                relations.append(key1 + "_" + key2)
            # Sample negatives from non-annotated chemical/disease pairs,
            # capped at the number of positives for this abstract.
            num = 0
            for key1 in chemicals.keys():
                for key2 in diseases.keys():
                    if key1 + "_" + key2 not in relations:
                        if num < len(itemlist3):
                            e1 = chemicals[key1]
                            e2 = diseases[key2]
                            e1.bow = self.avgBOW(e1.text)
                            e2.bow = self.avgBOW(e2.text)
                            rel = MyRelation(e1, e2, '-1')
                            rel.abs = parsed
                            self.nrelations.append(rel)
                            num = num + 1
            count = count + 1
            if (count == maxsize):
                break

    def exploreDDICorpus(self, myfile, maxsize, ftyp):
        ''' extract entities + relations from xml '''
        #print(myfile)
        xmldoc = minidom.parse(myfile)
        itemlist0 = xmldoc.getElementsByTagName('document')
        count = 0
        for it0 in itemlist0:
            # abstract with annotations
            print "\t- processing abstract " + `count`
            drugs = {}  # surface text -> MyEntity
            # entities
            itemlist1 = it0.getElementsByTagName('annotation')
            print "\t\t+ " + `len(itemlist1)` + " entities"
            for it1 in itemlist1:
                itemlist2a = it1.getElementsByTagName('infon')
                typ = itemlist2a[0].firstChild.data
                print typ
                itemlist2b = it1.getElementsByTagName('text')
                text = itemlist2b[0].firstChild.data.lower()
                print text
                ent = MyEntity(text, "", typ)
                ent.bow = self.avgBOW(ent.text)
                drugs[text] = ent
            # abstract: keep only text nodes longer than 3 tokens
            itemlist3 = it0.getElementsByTagName('text')
            abstract = ""
            for it3 in itemlist3:
                if (len(it3.firstChild.data.split()) > 3):
                    abstract = abstract + it3.firstChild.data
            # parse abstract
            parsed = self.parseSentence(abstract)  #stanford
            docvec = self.docspace.docvecs[myfile + "_" + `count`]  #doc2vec
            #print len(drugs.keys())
            # NOTE(review): only the first two drugs (py2 dict order) form a
            # pair; additional entities in the abstract are ignored.
            if (len(drugs.keys()) > 1):
                e1 = drugs[drugs.keys()[0]]
                e2 = drugs[drugs.keys()[1]]
                e1.bow = self.avgBOW(e1.text)
                e2.bow = self.avgBOW(e2.text)
                #print(ftyp)
                if (ftyp == "positive"):
                    #print(parsed)
                    rel = MyRelation(e1, e2, '1')
                    rel.abs = docvec
                    rel.parse = parsed.encode("utf-8")
                    self.prelations.append(rel)
                if (ftyp == "negative"):
                    #print(docvec)
                    rel = MyRelation(e1, e2, '-1')
                    rel.abs = docvec
                    rel.parse = parsed.encode("utf-8")
                    self.nrelations.append(rel)
            # increment counter
            count = count + 1
            if (count == maxsize):
                break

    def avgBOW(self, entity):
        ''' average word vector over the (space-split) tokens of an entity;
        out-of-vocabulary tokens contribute zero. '''
        bow = []
        ents = entity.split(" ")
        i = 0
        while i < self.wordspace.layer1_size:
            v = 0
            for ent in ents:
                if ent in self.wordspace.vocab:
                    v = v + self.wordspace[ent][i]
            bow.append(v / len(ents))
            i = i + 1
        return np.array(bow)

    def buildWordSpace(self, modelfile):
        ''' compute distributional model '''
        model = Word2Vec(self.corpus, min_count=1, size=20, iter=100,
                         workers=4)
        model.save(modelfile)
        self.wordspace = model

    def buildDocSpace(self, modelfile):
        ''' compute distributional model '''
        model = doc2vec.Doc2Vec(self.pars, min_count=5, size=20, iter=100,
                                workers=4)
        model.save(modelfile)
        self.docspace = model

    def loadWordSpace(self, modelfile):
        ''' load a saved Word2Vec model '''
        self.wordspace = Word2Vec.load(devdata + modelfile)

    def loadDocSpace(self, modelfile):
        ''' load a saved Doc2Vec model '''
        self.docspace = doc2vec.Doc2Vec.load(devdata + modelfile)

    def loadParser(self):
        ''' start the CoreNLP server pointed at by $STANFORD '''
        corenlp_dir = os.environ['STANFORD']
        self.parser = StanfordCoreNLP(corenlp_dir + "/")  # wait a few minutes...

    def parseSentence(self, sentence):
        ''' return the constituency parse tree string for one sentence '''
        parsed = self.parser.raw_parse(sentence)['sentences'][0]['parsetree']
        return parsed
# Script fragment: dump episode lines to a file, parse them with CoreNLP,
# and render per-character word clouds.
# NOTE(review): `searcher`, `e`, `outfile`, `episodeNum`, `mainChars`,
# `lines`, `uniqueSpeakers` are defined earlier in the script (not visible here).
paragraph = ""
for d in searcher.documents(episode=e):
    outfile.writelines(d['line'].encode('utf-8')+' ')
#     outfile.writelines((d['speaker']+': '+d['line']).encode('utf-8')+' ')
#     paragraph += d['speaker']+': '+d['line']+' '
# #     paragraph += re.sub(r'\([^)]*\)', '',d['line'])+' '
#     paragraph = paragraph.replace('\n','').replace('  ',' ')
# outfile.writelines(paragraph.encode('utf-8'))
outfile.close()

parsed = []
corenlp_dir = "stanford-corenlp-full-2014-08-27"
corenlp = StanfordCoreNLP(corenlp_dir)
for e in episodeNum:
    for d in searcher.documents(episode=e):
        # NOTE(review): raw_parse is handed the whole document record `d`;
        # other call sites pass a string — presumably d['line'] was intended.
        parsed.append(corenlp.raw_parse(d))

# sentClient = StanfordNLPSentimentClient('http://localhost:8080')
# sentiment = []
# for t in text:
#     sentiment.append(sentClient.classify(t))

# mask = imread("friends.gif")
# Word cloud per main character over that character's lines.
wc = WordCloud(max_words=30,stopwords=STOPWORDS|{'s','t','m','re','oh','right','don','know','well','hey','gonna','okay','yeah','go','really','think','hi','uh','look','god','mean','one','ye','guy','y','got','come','now'},font_path='/Users/elaine/Library/Fonts/Berlin.ttf')
for c in mainChars:
    wc.generate(lines[uniqueSpeakers.index(c)])
    wc.to_file(c+".png")
# wc = WordCloud(background_color="white",max_words=50,mask=mask,stopwords=STOPWORDS|{'s','t','m','re','oh','right','don','know','well','hey','gonna','okay','yeah','go','really','think','hi','uh','look','god','mean','one','ye','guy','y','got','come','now'},font_path='/Users/elaine/Library/Fonts/Berlin.ttf')
# for c in mainChars:
#     wc.generate(lines[uniqueSpeakers.index(c)])
def stanfordParse(text, corenlpDir='corenlp/stanford-corenlp-full-2014-01-04'):
    """Parse *text* with a lazily-created, module-level CoreNLP instance.

    The parser is cached in the module global ``stanford`` so its (slow)
    startup cost is paid at most once per process.

    :param text: sentence(s) to parse
    :param corenlpDir: CoreNLP distribution path, used only on first call
    :return: the parser's raw_parse result
    """
    global stanford
    parser = stanford
    if parser is None:
        parser = StanfordCoreNLP(corenlpDir)
        stanford = parser
    return parser.raw_parse(text)
class Nlp_persistence(object):
    """Persistence layer for having fast access to information produced by the StanfordCoreNLP tool.

    Caches raw_parse() results in a pickle file keyed by sentence object and
    can fall back to a live CoreNLP instance when ``fallback=True``.

    NOTE(review): Python 2 code (print statements, ``map`` returning a list).
    Several methods rely on Python's for/else: the ``else`` runs only when the
    loop finished without ``return``/``break``.
    """

    def __init__(self, fallback=False):
        # Pickle file holding {sentence_obj: raw_parse_tree}.
        self.FILE = "nlp_infos.p"
        self.data = None
        self.data_length = None
        self.corenlp_dir = "helper/stanfordnlp/corenlp-python/stanford-corenlp-full-2013-11-12/"

        if fallback:
            try:
                self.corenlp = StanfordCoreNLP(self.corenlp_dir)
            # NOTE(review): TIMEOUT presumably comes from pexpect — confirm import.
            except TIMEOUT:
                print "Stanford CoreNLP Timeout"

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def close(self):
        # When exiting, update pickle file with new sentences and kill StanfordCoreNLP before so we definitely have enough memory for that
        try:
            del(self.corenlp)
        except AttributeError:
            # There was a timeout
            pass

        # Write only if we added something to self.data
        if self.data_length < len(self.data):
            self._write()

    def create_persistence(self, relations):
        """Parse every source/target sentence of *relations* and pickle the results."""
        try:
            # Trying to load data
            data = pickle.load(open(self.FILE, "rb"))
        except (IOError, EOFError):
            # No data so far
            print "Could not open cache. Create new."
            logging.info("Could not find %s. Create new data.", self.FILE)
            data = {}

        # Create nlp information for all relevant sentences
        for relation in relations:
            if not relation.source.sentence in data:
                self._update_data(relation.source, data)
            else:
                print "Sentence is already in data"

            if not relation.target.sentence in data:
                self._update_data(relation.target, data)
            else:
                print "Sentence is already in data"
        print "Done!"
        logging.info("Successfully loaded all nlp information to persistence file.")

        # Save data to a file
        pickle.dump(data, open(self.FILE, "wb"), protocol=-1)

    def _update_data(self, entity, data):
        """Parse entity.sentence and add it to *data* (skipped on RPC failure)."""
        sentence_obj = entity.sentence

        try:
            tree = self._get_tree(sentence_obj)
        except RPCInternalError:
            logging.error("Could not process the following sentence from text %s: %s", sentence_obj.filename, sentence_obj.text)
            # Return without updating data
            return

        print "--- " + sentence_obj.filename
        print sentence_obj.text
        data.update({sentence_obj: tree})

    def load(self):
        """Load the pickle cache into memory (no-op if already loaded)."""
        data = {}
        if self.data is None:
            try:
                data = pickle.load(open(self.FILE, "rb"))
            except (IOError, EOFError):
                logging.warning("No cached nlp data.")
            finally:
                # data_length remembers the on-disk size so close() knows
                # whether anything new must be written back.
                self.data = data
                self.data_length = len(data)
        else:
            # Data is already there - there is nothing to do
            pass

    def get_info_for_sentence(self, sentence):
        """Return the cached raw_parse tree for *sentence* (live-parse fallback)."""
        if type(self.data) is dict:
            try:
                return self.data[sentence]
            except KeyError:
                logging.error("Nlp_persistence: This sentence is not a key/Is not available in the Nlp persistence layer.")
                logging.info("Nlp_persistence fallback to CoreNLP server")

                # Fallback: Try to get tree from CoreNLP server
                tree = self._get_tree(sentence)

                # Drive by caching
                self.data.update({sentence: tree})

                return tree
        else:
            logging.error("You have to use Nlp_persistence.load() before you can get the information of a sentence")
            return None

    def get_collapsed_dependencies(self, sentence):
        info = self.get_info_for_sentence(sentence)
        return info['sentences'][0]['dependencies']

    def get_parse_tree(self, sentence):
        info = self.get_info_for_sentence(sentence)
        return info['sentences'][0]['parsetree']

    def _write(self):
        # Save data to a file
        pickle.dump(self.data, open(self.FILE, "wb"))

    def _get_tree(self, sentence):
        tree = self.corenlp.raw_parse(sentence.text)
        return tree

    def get_pos_tag_for_word(self, sentence, word):
        """Returns the POS tag for a word in a sentence.

        If the word is not in the sentence raise WordNotInSentence error."""
        info_sentence = self.get_info_for_sentence(sentence)
        words = info_sentence['sentences'][0]['words']
        # NOTE(review): `w[0] in word` is a substring test, not equality —
        # looks intentional for tokenization mismatches, but confirm.
        for w in words:
            if w[0] in word:
                return w[1]["PartOfSpeech"]
        else:
            # for/else: no token matched
            raise PosTagNotFound(sentence, word)

    def get_lemma_for_word(self, sentence, word):
        """Returns the lemma for a word in sentence."""
        info_sentence = self.get_info_for_sentence(sentence)
        words = info_sentence['sentences'][0]['words']
        for w in words:
            if w[0] in word:
                return w[1]["Lemma"]
        else:
            # for/else: no token matched
            raise LemmaNotFound(sentence, word)

    def is_main_verb(self, sentence, word):
        """Returns true if word is a main verb of sentence and not an aux."""
        info_sentence = self.get_info_for_sentence(sentence)
        dependencies = info_sentence['sentences'][0]['dependencies']
        for dependency in dependencies:
            if dependency[0] == "aux" and dependency[2] == word:
                return False
        else:
            return True

    def get_all_aux_for_verb(self, sentence, verb):
        """Returns all distinct aux for verb as strings in order of the sentence."""
        info_sentence = self.get_info_for_sentence(sentence)
        dependencies = info_sentence['sentences'][0]['dependencies']

        aux = []
        for dependency in dependencies:
            if (dependency[0] == "aux" or dependency[0] == "auxpass") and dependency[1] == verb:
                aux.append(dependency[2])
        return aux

    def get_verb_for_aux(self, sentence, aux):
        """Returns the governing verb for the aux as string."""
        info_sentence = self.get_info_for_sentence(sentence)
        dependencies = info_sentence['sentences'][0]['dependencies']

        for dependency in dependencies:
            if dependency[0] == "aux" and dependency[2] == aux:
                return dependency[1]
        else:
            # for/else: aux not found in any dependency
            raise AuxNotFound(aux)

    def find_all_verb_pos_tags(self, sentence, verb):
        """Returns all pos tags for all verbs based on the dependencies relation of the sentence."""
        if self.is_main_verb(sentence, verb):
            # verb is not an aux
            main_verb = verb
        else:
            # verb is aux (this should normally not happen due to the data)
            main_verb = self.get_verb_for_aux(sentence, verb)

        auxes = self.get_all_aux_for_verb(sentence, main_verb)
        verb_pos = self.get_pos_tag_for_word(sentence, main_verb)
        # py2 map(): returns a list here.
        aux_pos = map(lambda aux: self.get_pos_tag_for_word(sentence, aux), auxes)
        return aux_pos + [verb_pos]

    def get_governing_verb(self, event):
        sentence = event.sentence
        # info = [verb, aux, pos verb, pos aux, index_of_verb]
        info = self.get_info_on_governing_verb(event.text, event.index, sentence)

        if info is None:
            raise CouldNotFindGoverningVerb
        else:
            if info[0] is None:
                raise CouldNotFindGoverningVerb
            else:
                return (info[0], info[4])

    def is_root(self, event):
        """True iff *event*'s text is the dependent of the 'root' relation."""
        sentence = event.sentence
        info_sentence = self.get_info_for_sentence(sentence)
        collapsed_dependencies = info_sentence['sentences'][0]['dependencies']

        for dependency in collapsed_dependencies:
            dependency_type = dependency[0]
            dependent = dependency[2]
            if dependency_type == "root" and dependent == event.text:
                return True
        else:
            return False

    def get_info_on_governing_verb(self, non_verb, index, sentence):
        """This method returns information about the governing verb of a non-verb.

        It returns an array with the following format:
        [verb, aux, POS of verb, POS of aux, index_of_verb]
        """
        info = self.get_info_for_sentence(sentence)

        if info:
            # Search for non_verb
            governing_verb, index = self._get_governing_verb(non_verb, index, info)

            info_on_governing_verb = [governing_verb, None, None, None, index]

            # Set POS of main verb
            pos_verb = self._get_pos_of_verb(governing_verb, info)
            info_on_governing_verb[2] = pos_verb

            # Searching for an Aux for the governing verb
            aux = self._get_aux_of_verb(governing_verb, info)
            info_on_governing_verb[1] = aux

            # If there is an aux, get it's POS
            if aux:
                pos_aux = self._get_pos_of_verb(aux, info)
                info_on_governing_verb[3] = pos_aux
            return info_on_governing_verb
        else:
            return None

    def _get_aux_of_verb(self, verb, info):
        dependencies = info['sentences'][0]['dependencies']
        sources = [x[1] for x in dependencies]

        # Find index of verb in targets
        index = None
        for i, source in enumerate(sources):
            if source == verb and dependencies[i][0] == "aux":
                index = i

        # Get aux
        if index is None:
            # Not every verb has an aux
            return None
        else:
            aux = dependencies[index][2]
            return aux

    def _get_pos_of_verb(self, verb, info):
        # Returns None implicitly when the verb is not among the tokens.
        info_on_words = info['sentences'][0]['words']
        for word in info_on_words:
            if word[0] == verb:
                return word[1]['PartOfSpeech']

    def _find_governing_word(self, word, dependencies):
        for dependency in dependencies:
            if dependency[2] == word:
                return dependency[1]
        else:
            return None

    def _find_governing_word_index(self, word, index, index_dependencies):
        # indexeddependencies name tokens "word-index".
        word = word + "-" + str(index)
        for dependency in index_dependencies:
            if dependency[2] == word:
                # Remove governor with index appended
                return dependency[1]
        else:
            return None

    def _remove_index_from_token(self, token):
        if token:
            token = token.split("-")[:-1]
            return "-".join(token)
        else:
            return None

    def _get_index_from_token(self, token):
        if token:
            index = token.split("-")[-1]
            return index
        else:
            return None

    def _get_governing_verb(self, non_verb, index, info):
        index_dependencies = info['sentences'][0]['indexeddependencies']

        # Try to find a governor for non_verb
        governor = self._find_governing_word_index(non_verb, index, index_dependencies)

        # Search through tree as long we find a verb and until we can go further up
        while not self._is_verb(self._remove_index_from_token(governor), info) and governor is not None:
            old_governor = governor
            governor = self._find_governing_word_index(self._remove_index_from_token(governor), self._get_index_from_token(governor), index_dependencies)

            if governor == old_governor:
                # Detected circle (does not happen often, but happens. Not sure why.)
                governor = None
                break

        if governor:
            # Remove index from governor string
            return (self._remove_index_from_token(governor), int(self._get_index_from_token(governor)))
        else:
            # Examples when this is allowed to happen:
            # Example for when it happens: "And in Hong Kong, a three percent drop." <- no verb
            # Other example: "One exception was the swine flu pandemic of 2009-2010, when 348 children died." and "pandemic". "pandemic" is the root of the sentence and is not governed by anything
            # Other corner case: "And the dominant flu strain early in the season was one that tends to cause more severe illness." for "season"
            raise CouldNotFindGoverningVerb(non_verb, index)

    def _is_verb(self, text, info):
        """Checks if text has the POS tag of a verb."""
        if not text:
            return False

        words = info['sentences'][0]['words']
        for word in words:
            if word[0] == text:
                if word[1]['PartOfSpeech'] in ['VBG', 'VBD', 'VB', 'VBN', 'VBP', 'VBZ']:
                    return True
        return False
class StanforExtractor(object):
    """Extracts relations, dates/times and person names from raw text using a
    Stanford CoreNLP server wrapper.

    NOTE(review): relies on the old corenlp-python wrapper's dict output
    (``raw_parse`` returning {'sentences': [{'dependencies': ..., 'words': ...}]})
    and on collapsed dependency labels such as ``prep_*`` — confirm against the
    wrapper version in use.
    """

    def __init__(self):
        corenlp_dir = "corenlp-python/stanford-corenlp-full-2014-08-27/"
        self.corenlp = StanfordCoreNLP(corenlp_dir)  # wait a few minutes...
        print("corenlp object initiated")

    def tag_text(self, text):
        """Run the full CoreNLP pipeline on *text*.

        :param text: raw text (must be ``str``)
        :return: CoreNLP result dict with a 'sentences' key
        """
        assert type(text) == str
        sents = self.corenlp.raw_parse(text)
        return sents

    def expand_rels_double(self, rel_words, sent):
        """Expand a two-word relation with context words linked to either member.

        :param rel_words: [wrd1,wrd2]
        :param sent: in tagged_text['sentences'], ['dependencies'] for each sent
        :return: list of strings; linked words are prepended or appended to
            [wrd1, wrd2] depending on which member they attach to
        """
        assert type(rel_words) == list
        assert type(sent) == list
        assert len(rel_words) == 2
        rel_tmp = [rel_words[0], rel_words[1]]
        for rel_1 in sent:
            # Skip the dependency that *is* the relation being expanded.
            if rel_1[1] == rel_words[0] and rel_1[2] == rel_words[1]:
                continue
            rel_1 = list(rel_1)
            # print(rel_1)
            # if prep_ or prepc_ is the tag
            # appos_tag = 1
            neg_tag = 0
            if rel_1[0].startswith(u"prep_") or rel_1[0].startswith(u"prepc_"):
                # Collapsed preposition: keep the preposition itself as a middle word.
                middle_word = rel_1[0][rel_1[0].find("_") + 1:]
                rel_1 = [rel_1[1], middle_word, rel_1[2]]
            elif rel_1[0] == u"appos":
                rel_1 = [rel_1[1], rel_1[2]]
                # appos_tag = -1
            elif rel_1[0] == u"neg":
                # neg_tag = 1
                rel_1 = [rel_1[1], rel_1[2]]
            else:
                continue
            # rel_1 = [rel_1[1],rel_1[2]]
            if rel_words[0] in rel_1:
                append_start = 1
                rel_1.remove(rel_words[0])
            elif rel_words[1] in rel_1:
                append_start = -1
                rel_1.remove(rel_words[1])
            else:
                continue
            # append_start = append_start*appos_tag
            # if neg_tag == 1:
            if append_start == 1:
                rel_tmp = [" ".join(rel_1)] + rel_tmp
            else:
                rel_tmp = rel_tmp + [" ".join(rel_1)]
        return rel_tmp

    def expand_rels_wordlist(self, rel_words, sent):
        """Collect context words attached (via prep_*/appos/neg) to any word in *rel_words*.

        :param rel_words: [wrd1,wrd2,..]
        :param sent: in tagged_text['sentences'], ['dependencies'] for each sent
        :return: single space/“ ”-joined string of related fragments
        """
        assert type(rel_words) == list
        assert type(sent) == list
        rel_tmp = []
        for rel_1 in sent:
            # for each word in sentence, rel_1 is the relation mapper from stanford tagger dependencies
            # if rel_1[1] in rel_words and rel_1[2] in rel_words:
            #     continue
            rel_1 = list(rel_1)
            # print(rel_1)
            # if prep_ or prepc_ is the tag
            # appos_tag = 1
            neg_tag = 0
            if rel_1[0].startswith(u"prep_") or rel_1[0].startswith(u"prepc_"):
                middle_word = rel_1[0][rel_1[0].find("_") + 1:]
                rel_1 = [rel_1[1], middle_word, rel_1[2]]
            elif rel_1[0] == u"appos":
                rel_1 = [rel_1[1], rel_1[2]]
                # appos_tag = -1
            elif rel_1[0] == u"neg":
                # what to do here?
                # neg_tag = 1
                rel_1 = [rel_1[1], rel_1[2]]
            else:
                continue
            # NOTE(review): removing from rel_1 while iterating it skips the
            # element after each removal — verify this is intended.
            wrd_present = False
            for wrd in rel_1:
                if wrd in rel_words:
                    rel_1.remove(wrd)
                    wrd_present = True
            if wrd_present:
                # pdb.set_trace()
                if len(rel_1) > 0:
                    rel_tmp.append(" ".join(rel_1))
        return " ".join(rel_tmp)

    def expand_rels(self, tmp_rels, sent):
        """add relevant sents to start or end of tmp_rels

        :param tmp_rels: list of dependency triples to expand
        :param sent: dependency list for one sentence
        :return: list of expanded word lists
        """
        # pdb.set_trace()
        print("sent", sent)
        final_rels = []
        for rel_full in tmp_rels:
            rel_words = [rel_full[1], rel_full[2]]
            rel_tmp = self.expand_rels_double(rel_words, sent)
            final_rels.append(rel_tmp)
        # print('final_res:',final_rels)
        return final_rels

    def identify_rels(self, tagged_text):
        """Find nn/dobj relations per sentence and expand them with context.

        :param tagged_text: CoreNLP result dict
        :return: list (one entry per sentence that had relations) of expanded relations
        """
        assert "sentences" in tagged_text.keys()
        assert "dependencies" in tagged_text["sentences"][0].keys()
        all_rels = []
        for sent in tagged_text["sentences"]:
            tmp_rels = []
            for rel in sent["dependencies"]:
                if rel[0] in [u"nn", u"dobj"]:
                    tmp_rels.append(rel)
            if len(tmp_rels) > 0:
                final_rels = self.expand_rels(tmp_rels, sent["dependencies"])
                all_rels.append(final_rels)
        return all_rels

    def identify_word_rels(self, all_words, tagged_text):
        """Map each word/phrase to a comma-joined string of its related fragments.

        :param all_words: list of words/phrases
        :param tagged_text: CoreNLP result dict
        :return: dict {word_or_phrase: "frag1,frag2,..."}
        """
        assert "sentences" in tagged_text.keys()
        assert "dependencies" in tagged_text["sentences"][0].keys()
        words_rels = {}
        # pdb.set_trace()
        for wrd in all_words:
            wrd_rels = []
            for sent in tagged_text["sentences"]:
                rel_frm_sent = self.expand_rels_wordlist(wrd.split(), sent["dependencies"])
                if len(rel_frm_sent) > 0:
                    wrd_rels.append(rel_frm_sent)
            words_rels[wrd] = ",".join(wrd_rels)
        return words_rels

    def identify_time(self, text):
        """Return unique DATE/TIME named-entity spans found in *text*.

        Consecutive DATE/TIME-tagged tokens are merged into one space-joined span.
        """
        time_strs = []
        text_tag = self.tag_text(text)
        for sent in text_tag["sentences"]:
            words = sent["words"]
            prev_wrd_tag = False  # was the previous token part of a DATE/TIME span?
            for wrd in words:
                wrd_tag = wrd[1]
                assert type(wrd_tag) == dict
                # if u'Timex' in wrd_tag:
                #     timex_string = wrd_tag['Timex']
                #     new_end = timex_string.rfind('</TIMEX3>')
                #     timex_string = timex_string[:new_end]
                #     new_start = timex_string.rfind('>')
                #     time_word = timex_string[new_start+1:]
                #     time_strs.append(time_word)
                if u"NamedEntityTag" in wrd_tag:
                    if wrd_tag[u"NamedEntityTag"] in [u"DATE", u"TIME"]:
                        if not prev_wrd_tag:
                            time_strs.append(wrd[0])
                        else:
                            # Continue the current span: pop it and re-append extended.
                            prev_wrd = time_strs.pop()
                            new_wrd = prev_wrd + " " + wrd[0]
                            time_strs.append(new_wrd)
                        prev_wrd_tag = True
                    else:
                        prev_wrd_tag = False
                else:
                    prev_wrd_tag = False
        # Deduplicate while preserving first-seen order.
        time_final = []
        for wrd in time_strs:
            if wrd not in time_final:
                time_final.append(wrd)
        return time_final

    def ret_time_rels(self, text):
        """Return relations attached to each date/time expression found in *text*."""
        tagged_text = self.tag_text(text)
        all_times = self.identify_time(text)
        time_rels = self.identify_word_rels(all_times, tagged_text)
        return time_rels

    def return_rels(self, text):
        """Tag *text* and return all expanded nn/dobj relations."""
        text_tag = self.tag_text(text)
        rels_all = self.identify_rels(text_tag)
        return rels_all

    def identify_name(self, text):
        """Return unique PERSON named-entity spans found in *text*.

        NOTE(review): near-duplicate of identify_time except for the NE label —
        candidate for a shared helper.
        """
        name_strs = []
        text_tag = self.tag_text(text)
        for sent in text_tag["sentences"]:
            words = sent["words"]
            prev_wrd_tag = False
            for wrd in words:
                wrd_tag = wrd[1]
                assert type(wrd_tag) == dict
                # if u'Timex' in wrd_tag:
                #     timex_string = wrd_tag['Timex']
                #     new_end = timex_string.rfind('</TIMEX3>')
                #     timex_string = timex_string[:new_end]
                #     new_start = timex_string.rfind('>')
                #     time_word = timex_string[new_start+1:]
                #     time_strs.append(time_word)
                if u"NamedEntityTag" in wrd_tag:
                    if wrd_tag[u"NamedEntityTag"] in [u"PERSON"]:
                        if not prev_wrd_tag:
                            name_strs.append(wrd[0])
                        else:
                            prev_wrd = name_strs.pop()
                            new_wrd = prev_wrd + " " + wrd[0]
                            name_strs.append(new_wrd)
                        prev_wrd_tag = True
                    else:
                        prev_wrd_tag = False
                else:
                    prev_wrd_tag = False
        names_final = []
        for wrd in name_strs:
            if wrd not in names_final:
                names_final.append(wrd)
        return names_final
from corenlp import StanfordCoreNLP
import book_utils

# Boot a CoreNLP wrapper instance (takes a while to load the models).
corenlp_dir = "../tools/corenlp-python/corenlp/stanford-corenlp-full-2014-01-04"
corenlp = StanfordCoreNLP(corenlp_dir)

# BUG FIX: raw_parse() requires the text to parse; calling it with no
# argument raised TypeError. Run a small smoke-test sentence instead.
print(corenlp.raw_parse("The quick brown fox jumps over the lazy dog."))

#raw_text_directory = "../dataset/books_txt/small_sample"
#parsed = batch_parse(raw_text_directory, corenlp_dir,raw_output=True)
#for books in parsed:
#    print books
from corenlp import StanfordCoreNLP

# Location of the CoreNLP distribution; booting the JVM takes a few minutes.
corenlp_dir = "../../Scripts/stanford-corenlp-full-2014-08-27/"
corenlp = StanfordCoreNLP(corenlp_dir)

# Parse one example question and show the dependency triples of its first sentence.
result = corenlp.raw_parse("What is birth date of the wife of the first black president of the United States?")
first_sentence = result['sentences'][0]
print(first_sentence['dependencies'])
#!/usr/bin/env python
import sys, bz2
sys.path.insert(0, '/Users/timpalpant/Documents/Workspace/corenlp-python')
import nltk
from nltk.tree import Tree
from corenlp import StanfordCoreNLP
from remove_random_word import remove_random_word


def _first_sentence_tree(parser, text):
    """Parse *text* and return the constituency tree of its first sentence."""
    parsed = parser.raw_parse(text)
    return Tree.fromstring(parsed['sentences'][0]['parsetree'])


print("Booting StanfordCoreNLP")
nlp = StanfordCoreNLP()

print("Initializing train file")
train = bz2.BZ2File('../data/train_v2.txt.bz2')

# For each training line, compare the parse of the original line against the
# parse of the same line with one random word removed.
for line in train:
    rline = remove_random_word(line)
    ltree = _first_sentence_tree(nlp, line)
    rtree = _first_sentence_tree(nlp, rline)
    print(ltree)
    print(rtree)
def scrape_func(address, website):
    """
    Function to scrape various RSS feeds. Uses the 'keep' and 'ignore'
    iterables to define which words should be used in the text search.

    Inputs
    ------
    address : address for the RSS feed to scrape. String.

    website : name of the website to scrape to be used in the filepath
    for the output. String.
    """
    # Known feeds; xinhua needs its URL cleaned up before scraping.
    known_sites = ('nyt', 'bbc', 'reuters', 'ap', 'upi', 'xinhua', 'google')

    connection = MongoClient()
    db = connection.atrocities_data
    collection = db[website]
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    corenlp_dir = 'stanford-corenlp/'
    corenlp_parse = StanfordCoreNLP(corenlp_dir)

    # 'with' guarantees the log file is closed even if scraping/parsing raises
    # (the original leaked the handle on any exception).
    with open('log_file.txt', 'a') as log:
        results = pattern.web.Newsfeed().search(address, count=100, cached=False)
        log1 = 'There are %d results from %s \n' % (len(results), website)
        log.write(log1)

        for result in results:
            # The per-site bodies were seven identical copy-pastes; only the
            # xinhua URL normalisation differed, so handle that here.
            if website not in known_sites:
                continue
            if website == 'xinhua':
                page_url = result.url.encode('ascii')
                page_url = page_url.replace('"', '')
            else:
                page_url = result.url

            text = pages_scrape.scrape(page_url, result.title)
            # First four sentences are enough context for the parse.
            head_sentences = sent_detector.tokenize(text.strip())[:4]
            joined_sentences = ' '.join(head_sentences)
            parsed = corenlp_parse.raw_parse(joined_sentences)
            entry_id = mongo_connection.add_entry(collection, text, parsed,
                                                  result.title, result.url,
                                                  result.date, website)
            if entry_id:
                log2 = 'Added entry from %s with id %s \n' % (result.url, str(entry_id))
                log.write(log2)
            else:
                log2 = 'Result from %s already in database \n' % (result.url)
                log.write(log2)

        interupt = '+' * 70
        log3 = '%s\nScrape %s once at %s!\n%s\n' % (interupt, website,
                                                    datetime.datetime.now(),
                                                    interupt)
        log.write(log3)
import os
from nltk.tokenize import sent_tokenize
from corenlp import StanfordCoreNLP

# The directory in which the stanford core NLP .jar is located -- you have to
# download this from their website.
CORE_NLP_DIR = "stanford-corenlp-dir/"
PARSER = StanfordCoreNLP(CORE_NLP_DIR)

in_file = "sentences.txt"
text = open(in_file, 'r').read()
sentences = sent_tokenize(text)  # Break the text into sentences.

# Parse every sentence, reporting progress every 50 sentences.
# NOTE(review): write_parse_products is defined *below* this loop and takes
# (self, parse) but is called with a single argument — the resulting
# NameError/TypeError is silently swallowed by the broad `except Exception`.
for i, sentence in enumerate(sentences):
    try:
        parse = PARSER.raw_parse(sentence)
        if i % 50 == 0:
            print " Entered sentence " + str(i) + " of " + str(len(sentences))
        write_parse_products(parse['sentences'][0])
    except Exception:
        print "Error on sentence:\n\t " + sentence + " \n "
        pass

# NOTE(review): this function appears truncated here; `self` suggests it was
# lifted from a class method.
def write_parse_products(self, parse):
    words = parse['words']  # list of (token, attribute-dict) pairs
    word_objects = []
    text = ""
    for i, word_info in enumerate(words):
        properties = word_info[1]
        token = word_info[0].lower().strip()
class BasicStanfordCoreNLP(UtteranceProcessor):
    '''
    Basic version doesn't do anything with coref, const. and depend. parses produced
    by analysis. For now, words from all sentences found in the utterance are put at
    the top level of the utterance -- sentences are throw away, but could be used
    later for e.g. paragraph-level utterances.

    If merge_clitics, merge e.g. I 'll -> single word I'll

    Add spaces back in where there is no punctuation as points at which silence
    can be inserted during alignment

    Add reduced POS as well as Stanford POS
    '''
    def load(self):
        # Configuration: which XML nodes to process and which attribute holds text.
        self.target_nodes = self.config.get('target_nodes', '//utt')
        self.input_attribute = self.config.get('input_attribute', 'norm_text')
        self.merge_clitics = self.config.get('merge_clitics', 'True')  ## string, not bool

        ## check tools exist:
        corenlp_location = os.path.join(self.voice_resources.path[c.BIN], '..', \
                                        'corenlp-python', 'corenlp')
        assert os.path.isdir(corenlp_location)
        sys.path.append(corenlp_location)
        from corenlp import StanfordCoreNLP
        corenlp_dir = os.path.join(corenlp_location, '..', 'stanford-corenlp-full-2014-06-16')

        ## Each document is to be treated as one sentence, no sentence splitting at all.
        ## Write config for this if necessary:
        corenlp_conf_name = 'no_sentence_split.properties'
        corenlp_conf_file = os.path.join(corenlp_location, corenlp_conf_name)
        if not os.path.isfile(corenlp_conf_file):
            data = ['annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref', \
                    'ssplit.isOneSentence = true']
            writelist(data, corenlp_conf_file)

        print 'Loading stanford corenlp modules from %s ...'%(corenlp_dir)
        print 'Takes a while (~20-30 seconds)...'
        self.models = StanfordCoreNLP(corenlp_dir, properties=corenlp_conf_name)

    def process_utterance(self, utt):
        """Analyse each target node's text with CoreNLP and append one 'token'
        child element per word (plus whitespace and _END_ sentinels) to utt."""
        ## _END_ node
        end_node = Element('token')
        end_node.set(self.input_attribute, '_END_')
        utt.append(end_node)

        for node in utt.xpath(self.target_nodes):
            # NOTE(review): lxml elements expose .get()/.attrib, not
            # has_attribute() — confirm node type provides this method.
            assert node.has_attribute(self.input_attribute)
            input = node.get(self.input_attribute)
            analysis = self.models.raw_parse(input)
            ## analysis looks like this:
            # {'coref': ...
            #  'sentences': [{'parsetree': ... }
            #                 'text':
            #                 'dependencies':
            #                 'indexeddependencies':
            #                 'words': [('and', {'NamedEntityTag': 'O', \
            #                       'CharacterOffsetEnd': '3', 'Lemma': 'and', \
            #                       'PartOfSpeech': 'CC', 'CharacterOffsetBegin': '0'}), ... ]
            #               }
            #              ]
            # }

            ## preprocess the analysis: add spaces back between words where there is no
            ## punc (to use as potential silence insertion points for alignment), and
            ## possibly merge clitics (he 's -> he's, i ll' -> i'll)

            ## MERGE SUCCESSIVE PUNCTUATION TOKENS
            new_analysis = {}
            new_analysis['sentences'] = []
            for sentence in analysis['sentences']:
                #new_sentence = copy.deepcopy(sentence)
                #new_sentence['words'] = []
                new_words = []
                for word in sentence['words']:
                    # is there a previous word?
                    if len(new_words) > 0:
                        # if both space / punct:
                        if self.all_space_or_punc(new_words[-1][0]) and self.all_space_or_punc(word[0]):
                            prev_word = new_words.pop(-1)
                            combined = self.merge_words(prev_word, word)
                            new_words.append(combined)
                        else:
                            new_words.append(word)
                    else:
                        new_words.append(word)
                sentence['words'] = new_words
                new_analysis['sentences'].append(sentence)
            analysis = new_analysis

            ## MERGE CLITICS
            ## This also merges e.g. . '' --> .'' (given by norm scripts from ." ) at sentence ends.
            if self.merge_clitics == 'True':  ## string not bool
                new_analysis = {}
                new_analysis['sentences'] = []
                for sentence in analysis['sentences']:
                    #print sentence
                    new_sentence = copy.deepcopy(sentence)
                    new_sentence['words'] = []
                    i = 0
                    # Pairwise scan: attach 'll / 's / n't etc. to the preceding word.
                    while i < (len(sentence['words'])-1):
                        this_word = sentence['words'][i]
                        next_word = sentence['words'][i+1]
                        if next_word[0].startswith("'") or next_word[0] == "n't":
                            merged = self.merge_words(this_word, next_word)
                            new_sentence['words'].append(merged)
                            i += 2
                        else:
                            new_sentence['words'].append(this_word)
                            i += 1
                    # Keep the final word unless it was already merged above.
                    last_word = sentence['words'][-1]
                    if not(last_word[0].startswith("'") or last_word[0] == "n't"):
                        new_sentence['words'].append(last_word)
                    new_analysis['sentences'].append(new_sentence)
                analysis = new_analysis

            ## ADD SPACES:
            new_analysis = {}
            new_analysis['sentences'] = []
            for sentence in analysis['sentences']:
                new_sentence = copy.deepcopy(sentence)
                new_sentence['words'] = []
                ## For now, ignore parsetree, dependencies, indexeddependencies (sentence level)
                previous_lemma = '_NONE_'
                for word in sentence['words']:
                    (text, word_attributes) = word
                    this_lemma = word_attributes['Lemma']
                    ## Add whitespace back in to tokens to use for silence insertion in alignment later.
                    ## Don't add it where either neighbour is punctuation, or at start of
                    ## utt (where previous_lemma is '_NONE_':
                    if not (self.all_space_or_punc(previous_lemma) or \
                            self.all_space_or_punc(this_lemma)):
                        if previous_lemma != '_NONE_':
                            new_sentence['words'].append((' ', {'NamedEntityTag': ' ', \
                                        'PartOfSpeech': ' ', 'Lemma': ' '}))
                    previous_lemma = this_lemma
                    new_sentence['words'].append(word)
                new_analysis['sentences'].append(new_sentence)
            analysis = new_analysis

            ## combine all sentences to one for now:
            all_words = []
            for sentence in analysis['sentences']:
                all_words.extend(sentence['words'])

            ## Add stuff into the target node (probably utt):
            for word in all_words:
                (text, word_attributes) = word
                word_node = Element('token')  ## also includes punctuation etc.
                word_node.set(self.input_attribute, text)  ## see above at sentence level about 'text'
                ## For now, ignore CharacterOffsetBegin, CharacterOffsetEnd (word level)
                word_node.set('ne', word_attributes['NamedEntityTag'])
                word_node.set('pos', word_attributes['PartOfSpeech'])
                word_node = self.add_reduced_POS(word_node)
                word_node.set('lemma', word_attributes['Lemma'])
                utt.append(word_node)

        ## _END_ node
        end_node = Element('token')
        end_node.set(self.input_attribute, '_END_')
        utt.append(end_node)

    def add_reduced_POS(self, node):
        """Set a coarse 'coarse_pos' attribute (function/noun/verb/adj/adv/...)
        derived from the node's Stanford 'pos' attribute; returns the node."""
        full_POS = node.attrib['pos']
        # Merged tokens carry compound tags like "NN|POS" — use the first part.
        if '|' in full_POS:
            full_POS = full_POS.split('|')[0]
        ## add coarse POS (content/function) and reduced (adj,noun,adv,etc.)
        map = dict([('IN', 'function'), ('TO', 'function'), ('DT', 'function'), \
                    ('PDT', 'function'), ('MD', 'function'), ('CC', 'function'), \
                    ('WP', 'function'), ('PP$', 'function'), ('EX', 'function'), \
                    ('POS', 'function'), ('PP', 'function'), ('WDT', 'function'), \
                    ('PRP', 'function'), ('PRP$', 'function'), ('RP', 'function'), \
                    ('WP$', 'function'), ('WRB', 'function'), ('LS', 'function'),\
                    ('NN', 'noun'), ('NNS', 'noun'), \
                    ('NP', 'noun'), ('NNP', 'noun'), ('NPS', 'noun'), ('NNPS', 'noun'), ('FW', 'noun'), \
                    ('VBG', 'verb'), ('VBN', 'verb'), \
                    ('VB', 'verb'), ('VBD', 'verb'), ('VBP', 'verb'), ('VBZ', 'verb'), \
                    ('JJ', 'adj'), ('JJR', 'adj'), ('JJS', 'adj'), ('CD', 'adj'), \
                    ('RB', 'adv'), ('RBR', 'adv'), ('RBS', 'adv'), ('UH', 'interj')])
        ## NOTE:
        # FW -- foreign word -> noun
        # LS -- list item -> function
        if full_POS not in map:
            if full_POS == ' ':
                red_pos = 'space'
            elif self.all_space_or_punc(full_POS):
                red_pos = 'punc'
            else:
                print 'MISSING POS: %s'%(full_POS)
                red_pos = 'other'
        else:
            red_pos = map[full_POS]
        node.set('coarse_pos', red_pos)
        return node

    def all_space_or_punc(self, token):
        '''Use regex to match unicode properties to see if token is all punctuation
        or space This duplicates later work by e.g. token classifier.'''
        space_or_punc = '[\p{Z}||\p{C}||\p{P}||\p{S}]'
        return regex.match('\A' + space_or_punc + '+\Z', token)

    def merge_words(self, word1, word2):
        """Merge two (text, attributes) word tuples: concatenated form, '|'-joined
        POS, and the first word's Lemma and NamedEntityTag."""
        merged_form = word1[0] + word2[0]
        merged_POS = word1[1]['PartOfSpeech'] + '|' + word2[1]['PartOfSpeech']
        merged_lemma = word1[1]['Lemma']  ## first word's lemma
        merged_NER = word1[1]['NamedEntityTag']  ## first words NE tag
        merged = (merged_form, \
                  {'PartOfSpeech': merged_POS, \
                   'Lemma': merged_lemma, \
                   'NamedEntityTag': merged_NER})
        return merged
from corenlp import StanfordCoreNLP import simplejson as json corenlp_dir = "/home/clai/lubbock/repos-3rd/stanford-corenlp-python/stanford-corenlp-full-2015-04-20/" print "loading..." corenlp = StanfordCoreNLP(corenlp_dir) results = corenlp.raw_parse("Hello world. It's a wonderful day.") print results print json.dumps(results, indent=4)
def stanfordParse(text, corenlpDir='stanford-corenlp-full-2013-11-12/'):
    """Parse *text* with a lazily-initialised, module-cached CoreNLP instance.

    The parser is expensive to boot, so it is created once on first use and
    stored in the module-level ``stanford`` variable for all later calls.
    """
    global stanford
    parser = stanford
    if parser is None:
        parser = StanfordCoreNLP(corenlpDir)
        stanford = parser
    return parser.raw_parse(text)
def compress(sentence):
    """Sentence compression: build per-word features, label each word with a
    pre-trained classifier, and keep only the words not labelled 'O'.

    Mutates and returns *sentence*, writing the compressed text to
    ``sentence.simple``.

    NOTE(review): depends on many module-level names not visible here
    (parser, corenlp_dir, negation, punct, stopWords, model, stemmer, termFreq,
    idf, tree_re, getPathsToLeaves, getDepths, vec, selector, classifier) —
    confirm their definitions before modifying.
    """
    global parser
    if not parser:
        # Lazily boot the CoreNLP parser on first call.
        parser = StanfordCoreNLP(corenlp_dir)
    text = sentence.simple
    words = word_tokenize(text)
    w_features = [dict() for w in words]   # one feature dict per token
    stemmed = [None for w in words]        # (stem, suffix) per token
    labels = list()
    # add basic features
    # first/last words
    for i in range(1,6):
        if i < len(words):
            for x in range(i):
                w_features[x]["infirst"+str(i)] = True
                w_features[-1-x]["inlast"+str(i)] = True
    #pos = [ x[1] for x in nltk.pos_tag(a.o_words) ]
    for i in range(len(words)):
        w = words[i]
        features = w_features[i]
        #capitalization
        if w.isupper():
            features["isupper"] = True
        elif w[0].isupper():
            features["firstupper"] = True
        w = w.lower()
        #word class
        if w in negation:
            features["negation"] = True
        elif w in punct:
            features["punct"] = True
        elif w in stopWords:
            features["stopWords"] = True
        #pos
        #a.posfeatures[i]["pos_"+pos[i]] = True
        # compute the basic term frequencies of all words in paragraphs
        # for use in building corpus-wide quarry term frequency
        # NOTE(review): termFreq appears to persist across calls (module-level),
        # so frequencies accumulate over every sentence ever compressed.
        if w not in model.idf.stopWords:
            termFreq[w] += 1
        stem = stemmer.stem(w)
        suffix = ""
        if len(stem) < len(w) and w.startswith(stem):
            suffix = w[len(stem):]
        stemmed[i] = (stem, suffix)
        features["stem_"+stemmed[i][0]] = True
        features["affix_"+stemmed[i][1]] = True
    #Stanford tree features
    text = text.encode('ascii', 'ignore')
    tree = None
    dependencies = None
    try:
        results = parser.raw_parse(text)
        tree = []
        dependencies = []
        for s in results['sentences']:
            tree.append(tree_re.search(s['parsetree']).group(0))
            dependencies += s['dependencies']
    except:
        # NOTE(review): bare except silently downgrades any parser failure to
        # "no tree features" — consider narrowing.
        print(text)
        print( "Unexpected error:", sys.exc_info()[0])
    #print(a.tree)
    if tree:
        # Constituency-tree features from the first sentence's parse tree.
        tree = Tree.fromstring(tree[0].encode('ascii', 'ignore'))
        #print(str(tree))
        paths = list(getPathsToLeaves(tree))
        #print(paths)
        for i in range(min(len(paths), len(words))):
            #print(paths[i][1])
            w_features[i]["tree_depth_"+str(len(paths[i][1]))] = True
            for x in range(0,2):
                w_features[i][str(x)+"_up_"+paths[i][1][-1-x]] = True
            for n in paths[i][1]:
                w_features[i]["tree_"+n] = True
            w_features[i][str(paths[i][2])+"_from_left"] = True
    #print(a.treefeatures[0])
    if dependencies:
        #make a tree out of it
        d_tree = defaultdict(list)
        mother_relations = defaultdict(list)
        daughter_relations = defaultdict(list)
        for dep in dependencies:
            d_tree[dep[1]].append((dep[0], dep[2]))
            mother_relations[dep[1]].append(dep[0])
            daughter_relations[dep[2]].append(dep[0])
        #now we can check depth and such
        #print(d_tree)
        depths = getDepths(d_tree, u'ROOT', dict(), 0)
        #print(depths)
        for i in range(len(words)):
            w = words[i]
            treefeatures = w_features[i]
            if w in depths:
                w_depth = depths[w]
                treefeatures["dep_depth_"+str(w_depth)] = True
                if w_depth > 3:
                    treefeatures["dep_depth_over_3"] = True
                if w_depth > 5:
                    treefeatures["dep_depth_over_5"] = True
            if w in mother_relations:
                for rel in mother_relations[w]:
                    treefeatures["dep_mother_"+rel] = True
            if w in daughter_relations:
                for rel in daughter_relations[w]:
                    treefeatures["dep_daughter_"+rel] = True
    # get max tfidf for scaling
    maxtfidf = max( tf*idf.idf[w] for w, tf in termFreq.items() )
    partitions = 5
    # now add tfidf threshold features
    for i in range(len(words)):
        w = words[i].lower()
        if w not in stopWords and w not in punct:
            features = w_features[i]
            tfidf = termFreq[w] * idf.idf[w]
            scaled = tfidf / maxtfidf * partitions
            # NOTE(review): thresholds compare raw tfidf, not `scaled`, which
            # is computed but never used — looks like `scaled > x` was intended.
            for x in range(1,partitions):
                if tfidf > x:
                    features[str(x*100/partitions)+"percenttfidf"] = True
    #for f in w_features:
    #    print(f)
    # add previous features and classify
    for i in range(len(words)):
        f = w_features[i].copy()
        # Copy in features (and predicted labels) of up to two preceding words.
        for prev in range(2):
            if i > prev:
                prevstring = "prev"+str(prev)+"_"
                f[prevstring+labels[-1-prev]] = True
                prevfeatures = w_features[i-1-prev]
                for k,v in prevfeatures.items():
                    if not k.startswith("in"):
                        f[prevstring+k] = v
        #print("with prev:")
        #print(f)
        # classify
        vector = vec.transform(f)
        vector = selector.transform(vector)
        result = classifier.predict(vector)
        l = result[0]
        #print(l)
        labels.append(l)
    # use labels to clear out
    print(labels)
    retained_words = list()
    for i in range(len(labels)):
        if labels[i] != 'O':
            retained_words.append(words[i])
    # Re-join kept words, omitting the space before punctuation and after ``.
    newsentence = ""
    for i in range(len(retained_words)):
        if i != 0 and retained_words[i] not in punct and retained_words[i-1] not in ["``"]:
            newsentence += " "
        newsentence += retained_words[i]
    sentence.simple = newsentence
    return sentence
import os
from nltk.tokenize import sent_tokenize
from corenlp import StanfordCoreNLP

# The directory in which the stanford core NLP .jar is located -- you have to
# download this from their website.
CORE_NLP_DIR = "stanford-corenlp-dir/"
PARSER = StanfordCoreNLP(CORE_NLP_DIR)

in_file = "sentences.txt"
text = open(in_file, 'r').read()
sentences = sent_tokenize(text)  # Break the text into sentences.

# Parse every sentence, reporting progress every 50 sentences.
# NOTE(review): write_parse_products is defined *below* this loop and takes
# (self, parse) but is called with a single argument — the resulting
# NameError/TypeError is silently swallowed by the broad `except Exception`.
for i, sentence in enumerate(sentences):
    try:
        parse = PARSER.raw_parse(sentence)
        if i%50 == 0:
            print " Entered sentence " + str(i) + " of " + str(len(sentences))
        write_parse_products(parse['sentences'][0])
    except Exception:
        print "Error on sentence:\n\t " + sentence + " \n "
        pass

# NOTE(review): this function appears truncated here; `self` suggests it was
# lifted from a class method.
def write_parse_products(self, parse):
    words = parse['words']  # list of (token, attribute-dict) pairs
    word_objects = []
    text = ""
    for i, word_info in enumerate(words):
        properties = word_info[1]
        token = word_info[0].lower().strip()
        surface = word_info[0].strip()
from corenlp import StanfordCoreNLP

# Boot the wrapper around a local CoreNLP distribution (slow: loads all models).
corenlp_dir = "../../Scripts/stanford-corenlp-full-2014-08-27/"
corenlp = StanfordCoreNLP(corenlp_dir)

# Parse one example question and print the first sentence's dependency triples.
question = "What is birth date of the wife of the first black president of the United States?"
result = corenlp.raw_parse(question)
print(result['sentences'][0]['dependencies'])
#!/usr/bin/env python import sys, bz2 sys.path.insert(0, '/Users/timpalpant/Documents/Workspace/corenlp-python') import nltk from nltk.tree import Tree from corenlp import StanfordCoreNLP from remove_random_word import remove_random_word print "Booting StanfordCoreNLP" nlp = StanfordCoreNLP() print "Initializing train file" train = bz2.BZ2File('../data/train_v2.txt.bz2') for line in train: rline = remove_random_word(line) lparse = nlp.raw_parse(line) ltree = Tree.fromstring(lparse['sentences'][0]['parsetree']) rparse = nlp.raw_parse(rline) rtree = Tree.fromstring(rparse['sentences'][0]['parsetree']) print ltree print rtree