# NERparser: tags sentences with Stanford NER and replaces each named entity
# with a numbered placeholder such as PERSON_0 or LOCATION_1.
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize


class NERparser():

    def __init__(self):
        self.st = StanfordNERTagger(
            '/home/joe32140/stanford/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz',
            '/home/joe32140/stanford/stanford-ner-2018-02-27/stanford-ner.jar',
            encoding='utf-8')

    def getNER_sents(self, sents):
        tokenized_sents = [word_tokenize(sent) for sent in sents]
        classified_sents = self.st.tag_sents(tokenized_sents)
        return classified_sents

    def count_entity(self, entity, table):
        # assign each distinct surface form a running index per entity type
        if entity[0] not in table[entity[1]]:
            table[entity[1]][entity[0]] = str(len(table[entity[1]].keys()))
        return table[entity[1]][entity[0]]

    def replace(self, sents):
        classified_sents = self.getNER_sents(sents)
        new_sentences = []
        for i, sent in enumerate(classified_sents):
            if i % 5 == 0:
                # reset the entity numbering every five sentences
                check_repeat = {'PERSON': {}, 'LOCATION': {}, 'ORGANIZATION': {}}
            tmp = []
            for w in sent:
                if w[1] != 'O':
                    count = self.count_entity(w, check_repeat)
                    tmp.append(w[1] + '_' + str(count))
                else:
                    tmp.append(w[0])
            new_sentences.append(' '.join(tmp))
        return new_sentences
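# Minimal usage sketch for NERparser (assumes the hard-coded model/jar paths in
# __init__ exist locally and that NLTK's punkt tokenizer data is installed):
if __name__ == '__main__':
    ner_parser = NERparser()
    demo = ["Barack Obama visited Berlin.", "He met Angela Merkel there."]
    for anonymised in ner_parser.replace(demo):
        print(anonymised)   # e.g. "PERSON_0 visited LOCATION_0 ." for the first sentence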
def extract_ne(sents):
    from nltk.tag import StanfordNERTagger
    import nltk
    st = StanfordNERTagger('ner/english.all.3class.distsim.crf.ser.gz',
                           'ner/stanford-ner.jar')
    sents_tk = []
    for sent in sents:
        sent_tk = nltk.word_tokenize(sent)
        sents_tk.append(sent_tk)
    ne = st.tag_sents(sents_tk)
    res = []
    for sent in ne:
        last_tag = "O"
        en = ""
        sent.append(("", "O"))
        for (word, tag) in sent:
            if tag == 'O':
                if en != "":
                    res.append(en)
                    en = ""
            elif last_tag == tag:
                en += " " + word
            else:
                if en != "":
                    res.append(en)
                    en = ""
                en = word
            last_tag = tag
    return (ne, res)
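# Hedged usage sketch for extract_ne (assumes the relative "ner/" model and jar
# paths above resolve against the current working directory):
tagged, grouped = extract_ne(["Tim Cook runs Apple in Cupertino."])
print(grouped)   # expected to contain grouped entities such as "Tim Cook"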
def nonlocal_ner_tag_tokens(self):
    home = expanduser("~")
    os.environ['CLASSPATH'] = home + '/stanford-ner-2015-12-09'
    os.environ['STANFORD_MODELS'] = home + '/stanford-ner-2015-12-09/classifiers'
    st = StanfordNERTagger("english.all.3class.distsim.crf.ser.gz",
                           java_options='-mx4000m')
    stanford_dir = st._stanford_jar[0].rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    # do not tokenise text
    nltk.internals.config_java(
        options='-tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer '
                '-tokenizerOptions "tokenizeNLs=true"')
    self.nonlocal_ner_doc_tokens = []
    temp_nonlocal_bulk_process = []
    length_of_docs = [len(doc) for doc in self.tokenized_docs_by_lines]
    for doc_idx, doc in enumerate(self.tokenized_docs_by_lines):
        for line_idx, line in enumerate(doc):
            temp_nonlocal_bulk_process.append(line)
    temp_nonlocal_bulk_process = st.tag_sents(temp_nonlocal_bulk_process)
    current_idx = 0
    for doc_len_idx, doc_len in enumerate(length_of_docs):
        self.nonlocal_ner_doc_tokens.append(
            temp_nonlocal_bulk_process[current_idx:current_idx + doc_len])
        current_idx += doc_len
    print("NER nonlocal tagged tokens")
def get_named_entities_sents(self, sents):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # print("ner: current working directory is ", dir_path)
    ner_tagger_path = dir_path + r"/resources/stanford-ner.jar"
    german_model = dir_path + r"/resources/german.conll.hgc_175m_600.crf.ser.gz"
    # print(ner_tagger_path)
    tagger = StanfordNERTagger(german_model, ner_tagger_path, encoding="UTF-8")  # iso-8859-15
    tagger.java_options = '-mx2048 -Xmx2048m -Xms2048m'
    nltk.internals.config_java(options='-xmx2G')
    print("Running named entity recognition on sentences")
    t0 = time()
    self.named_entities = tagger.tag_sents(sents)
    print(len(self.named_entities), "sentences tagged for named entities")
    print("done in %0.3fs" % (time() - t0))
    return self.sort_named_entities()
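# Standalone sketch of the same call pattern (the German CoNLL model ships
# separately from Stanford NER; the relative paths below are placeholders, not
# part of the original class):
from nltk.tag import StanfordNERTagger
de_tagger = StanfordNERTagger("resources/german.conll.hgc_175m_600.crf.ser.gz",
                              "resources/stanford-ner.jar", encoding="UTF-8")
print(de_tagger.tag_sents([["Angela", "Merkel", "besucht", "Berlin", "."]]))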
def create_video_roles_timeline(self, subtitle_path):
    if subtitle_path is None:
        raise SubtitleNotFound(f"Could not find video's subtitle in path: {subtitle_path}")
    subs = pysrt.open(subtitle_path)
    subs_entities_timeline_dict = {}
    re_brackets_split = re.compile(r"(\[.*?\]|.*?:|^\(.*?\)$)")
    # (\[(.* ?)\] | (.* ?)\: | ^ \((.* ?)\)$)
    cc = RemoveControlChars()
    subs_clean = [
        cc.remove_control_chars(s.text.strip('-\\\/').replace("\n", " "))
        for s in subs
    ]
    subs_clean = [re.sub(r'<[^<]+?>', '', s) for s in subs_clean]
    brackets = [re_brackets_split.findall(s) for s in subs_clean]
    subs_text = [word_tokenize(s) for s in subs_clean]
    st = StanfordNERTagger(STANFORD_NLP_MODEL, encoding='utf-8',
                           path_to_jar=STANFORD_NLP_JAR)
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'textcat'])
    entities_spacy = [[(ent.text, ent.label_) for ent in nlp(s).ents] for s in subs_clean]
    entities_nltk = st.tag_sents(subs_text)
    for s, e_n, e_s, b in zip(subs, entities_nltk, entities_spacy, brackets):
        roles = self._video_role_analyzer.find_roles_names_in_text_ner(e_n, e_s)
        for item in b:
            roles.update(self._video_role_analyzer.find_roles_names_in_text(item))
        # role_counter.update(roles)
        if len(roles) > 0:
            t = s.start.seconds + s.start.minutes * 60
            subs_entities_timeline_dict[t] = roles
    logging.debug(str(subs_entities_timeline_dict))
    return subs_entities_timeline_dict
import re

from nltk.tag import StanfordNERTagger


class NERTagger():

    def __init__(self):
        stanford_ner_dir = '/Users/Rena/StandfordParserData/stanford-ner-2018-02-27/'
        eng_model_filename = stanford_ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
        my_path_to_jar = stanford_ner_dir + 'stanford-ner.jar'
        self.tagger = StanfordNERTagger(model_filename=eng_model_filename,
                                        path_to_jar=my_path_to_jar)
        self.ner_cache = {}
        self.time_list = [
            'january', 'february', 'march', 'april', 'may', 'june', 'july',
            'august', 'september', 'october', 'november', 'december'
        ]
        self.ordinal_list = [
            'first', 'largest', 'highest', 'second', 'third', 'fourth', 'fifth',
            'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
            'nine', 'ten'
        ]

    def cache_sents(self, sents):
        ###cache the documents###
        # use a list (not a lazy map) so it can be both tagged and indexed below
        tokenised_sent = [x.split() for x in sents]
        tagged = self.tagger.tag_sents(tokenised_sent)
        for i in range(len(tokenised_sent)):
            self.ner_cache[sents[i]] = tagged[i]
        return True

    def tag(self, sents):
        pattern = r'([^A-Z]\.\s[A-Z])'
        if re.search(pattern, sents):
            sentences = self.split_para(sents)
            entity_list = []
            for s in sentences:
                try:
                    tagged = self.ner_cache[s]
                except KeyError:
                    sent = map(self.strip_word, s.split())
                    tagged = self.tagger.tag(sent)
                    self.ner_cache[s] = tagged
                entity = self.entity_parse(tagged)
                entity_list.append(entity)
            result = sum(entity_list, [])
        else:
            try:
                tagged_sents = self.ner_cache[sents]
            except KeyError:
                sen = map(self.strip_word, sents.split())
                tagged_sents = self.tagger.tag(sen)
                self.ner_cache[sents] = tagged_sents
            result = self.entity_parse(tagged_sents)
        return result

    def strip_word(self, word):
        pattern = '"",:.?!;'
        return word.strip(pattern)

    def split_para(self, para):
        pattern = r'([^A-Z]\.\s[A-Z])'
        splitted = re.split(pattern, para)
        matching_pattern = r'^[^A-Z]\.\s[A-Z]$'
        for i in range(len(splitted)):
            if re.match(matching_pattern, splitted[i]):
                symbols = splitted[i].split()
                try:
                    splitted[i - 1] += symbols[0]
                    splitted[i + 1] = symbols[1] + splitted[i + 1]
                except:
                    continue
        proper_splitted = []
        for i in range(len(splitted)):
            if re.match(matching_pattern, splitted[i]):
                continue
            else:
                proper_splitted.append(splitted[i])
        return proper_splitted

    def entity_parse_detail(self, tagged_sent):
        ###entity parsing method for detailed tagset###
        start = True
        retagged_entity = []
        for item in tagged_sent:
            token, tag = item
            if token.lower() in self.time_list:
                tag = 'MONTH'
            elif re.match(r'^[1|2][0-9]{3,3}$', token):
                tag = 'YEAR'
            elif token.lower() in self.ordinal_list:
                tag = 'NUMBER'
            elif tag == 'ORGANIZATION':
                tag = 'OTHER'
            elif tag == 'O':
                if not start and len(token) > 0 and token[0].isupper():
                    tag = 'OTHER'
                elif any(char == '%' for char in token):
                    tag = 'NUMBER'  # 'PERCENT'
                elif any(char == '$' for char in token):
                    tag = 'NUMBER'  # 'MONEY'
                elif any(char.isdigit() for char in token):
                    tag = 'NUMBER'
            if start:
                start = False
            retagged_entity.append((token, tag))
        retagged_entity = self.retag_date(retagged_entity)
        return self.gather_entity(retagged_entity)

    def retag_date(self, tagged_entity):
        ###gather NUMBER MONTH YEAR pattern into DATE###
        result_entity = []
        i = 0
        while i < len(tagged_entity) - 2:
            (token1, tag1) = tagged_entity[i]
            if tag1 == 'NUMBER':
                (token2, tag2) = tagged_entity[i + 1]
                if tag2 == 'MONTH':
                    (token3, tag3) = tagged_entity[i + 2]
                    if tag3 == 'YEAR':
                        result_entity.append((token1, 'DATE'))
                        result_entity.append((token2, 'DATE'))
                        result_entity.append((token3, 'DATE'))
                        i = i + 3
                        continue
            elif tag1 == 'MONTH':
                (token2, tag2) = tagged_entity[i + 1]
                if tag2 == 'NUMBER':
                    (token3, tag3) = tagged_entity[i + 2]
                    if tag3 == 'YEAR':
                        result_entity.append((token1, 'DATE'))
                        result_entity.append((token2, 'DATE'))
                        result_entity.append((token3, 'DATE'))
                        i = i + 3
                        continue
            result_entity.append((token1, tag1))
            i += 1
        for counter in range(len(tagged_entity) - i):
            result_entity.append(tagged_entity[i + counter])
        return result_entity

    def entity_parse(self, tagged_sent):
        ###entity parsing for general tagset###
        start = True
        retagged_entity = []
        for item in tagged_sent:
            token, tag = item
            if token.lower() in self.time_list:
                tag = 'NUMBER'
            if token.lower() in self.ordinal_list:
                tag = 'NUMBER'
            if tag == 'ORGANIZATION':
                tag = 'OTHER'
            if tag == 'O':
                if not start and len(token) > 0 and token[0].isupper():
                    tag = 'OTHER'
                elif any(char.isdigit() for char in token):
                    tag = 'NUMBER'
            if start:
                start = False
            retagged_entity.append((token, tag))
        return self.gather_entity(retagged_entity)

    def gather_entity(self, retagged_entity):
        ###gather continuous entities###
        gathered_entity = []
        tag = 'O'
        token = ''
        for (new_token, new_tag) in retagged_entity:
            if tag == new_tag:
                token = token + ' ' + new_token
            else:
                if tag != 'O':
                    gathered_entity.append((token, tag))
                tag = new_tag
                token = new_token
        if tag != 'O':
            gathered_entity.append((token, tag))
        return gathered_entity
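# Usage sketch for NERTagger (assumes the hard-coded stanford_ner_dir in
# __init__ exists locally; the output shown is only indicative):
ner_tagger = NERTagger()
print(ner_tagger.tag("Barack Obama was born in Hawaii in 1961."))
# e.g. [('Barack Obama', 'PERSON'), ('Hawaii', 'LOCATION'), ('1961', 'NUMBER')]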
class Novel: def __init__(self, txt_file): CLASSIFIER = 'english.muc.7class.distsim.crf.ser.gz' root = os.path.join(os.getcwd(), '..', 'libraries', 'stanford-ner-2018-10-16') ner_jar_file = os.path.join(root, 'stanford-ner.jar') ner_classifier = os.path.join(root, 'classifiers/' + CLASSIFIER) self.tagger = StanfordNERTagger(ner_classifier, ner_jar_file, encoding='utf-8') np.set_printoptions(threshold=sys.maxsize) logging.getLogger().setLevel(logging.INFO) STOP = stopwords.words('english') + list(string.punctuation) self.file = txt_file self.text = '' self.persons = [] self.sentences = [] self.aliases = [] def read(self, path=''): if os.path.isfile(path + self.file): for encode in ENCODING: try: file = open(path + self.file, 'r', encoding=encode) text = file.read() self.original_text = text self.text = re.sub(pattern='\s+', repl=' ', string=self.text).strip() # text = text.replace('\n', ' ') text = re.sub(' +', ' ', text) text = text.strip() self.text = text self.sentences = sent_tokenize(text) break except IOError: logging.error('\t Cannot open ' + self.file) exit(-1) except UnicodeDecodeError: logging.warning('\t Cannot open file using encoding ' + encode + ' trying a new encoding!') def custom_coref_resolved(self, doc): ''' Use this method instead of doc._.coref_resolved, here we clean the character's name before to replace it. That because sometimes the coref method identifies commas, quotes,... as part of the name''' clusters = doc._.coref_clusters resolved = list(tok.text_with_ws for tok in doc) for cluster in clusters: for coref in cluster: if coref != cluster.main: new_name = cluster.main.text.translate(str.maketrans('', '', string.punctuation)).strip() resolved[coref.start] = new_name + doc[coref.end - 1].whitespace_ for i in range(coref.start + 1, coref.end): resolved[i] = "" return ''.join(resolved) def coreference(self): nlp = spacy.load("en_core_web_sm") coref = neuralcoref.NeuralCoref(nlp.vocab) nlp.add_pipe(coref, name='neuralcoref') words = self.dealiased_text.split(' ') words_number = len(words) badge_size = 100000 if words_number > badge_size: if words_number % badge_size == 0: iterations = int(words_number / badge_size) else: iterations = int(words_number / badge_size) iterations += 1 new_text = "" for i in range(0, iterations): logging.info('Coreferencing part ' + str(i + 1) + ' of ' + str(iterations)) from_index = i * badge_size to_index = (i+1) * badge_size sub_text = ' '.join(words[from_index:to_index]) text_coreference = nlp(sub_text) # text = text_coreference._.coref_resolved new_text += self.custom_coref_resolved(text_coreference) else: new_text = self.dealiased_text self.dealiased_text = new_text def create_cluster_repetitions_df(self): self.cluster_repetitions_df = pd.DataFrame( data=[['CCHARACTER' + str(key), val[0], val[1]] for key, val in self.cluster_repetitions.items()], columns=['Alias', 'Names', 'Occurrences']) def parse_persons(self): people = {} name = "" # contains_punctuations = False tokenized_sentences = [wtk(sentence) for sentence in self.sentences] tagged_sentences = self.tagger.tag_sents(tokenized_sentences) for sentence in tagged_sentences: for word, tag in sentence: # a name is made of 1 or more names, read all if tag == 'PERSON': if len(word) == 1: # print(word) continue # all strange symbols: '!"”“#$%"&\'’()*+,./:;<=>?@[]^_`{|}~ʹ' # if word start or end with special characters, drop them # if word[0] in '!"”"“#$%"&\'’()*+,/:;<=>?@[]^_`{|}~ʹ': # word = word[1:] # contains_punctuations = True # if word[-1] in 
'!"”"“#$%"&\'’()*+,/:;<=>?@[]^_`{|}~ʹ': # word = word[:-1] # contains_punctuations = True if name == "": name += word else: name += " " + word else: # name is not empty if name: name = name.strip() current_name = name.split(" ") # if len(current_name) >= 2 and contains_punctuations: # print(name) # Usually and/ed/or are identified as name, e.g. Tom and Jerry if len(current_name) == 3 and (current_name[1] == 'and' or current_name[1] == 'to' or \ current_name[1][-2:] == 'ed' or current_name[1] == 'or' or \ current_name[1] == 'nor'): people[current_name[0]] = people.get(current_name[0], 0) + 1 people[current_name[2]] = people.get(current_name[2], 0) + 1 # Usually 2 words name contains adverbs or adjectives (...ly) verb (...ed), remove them elif len(current_name) == 2 and ((current_name[1] in string.punctuation) or \ (current_name[1][-2:] == 'ed') or \ (current_name[1][-2:]) == 'ly' or \ (current_name[1].lower() in CONJUNCTIONS)): people[current_name[0]] = people.get(current_name[0], 0) + 1 elif len(current_name) == 1 and current_name[0] in FALSE_POSITIVES: name = "" else: people[name] = people.get(name, 0) + 1 name = "" # contains_punctuations = False self.persons = collections.OrderedDict(sorted(people.items())) return def cluster_aliases(self): complete_alphabet_names = collections.defaultdict(list) simplified_alphabet_names = collections.defaultdict(list) for name in self.persons: split_name = name.lower().split() new_name = "" if len(split_name) == 1: # single names do not have pre-names new_name = split_name[0] else: for name_part in split_name: is_prename = False for pre_name in PRE_NAMES: if name_part == pre_name: is_prename = True if not is_prename: new_name += " " + name_part new_name = new_name.strip() if len(new_name) == 0: new_name = name complete_alphabet_names[new_name[0].upper()].append(name) simplified_alphabet_names[new_name[0].upper()].append(new_name) clusters_number = 0 db_names = defaultdict(list) db_simplified_names = defaultdict(list) for letter, names in simplified_alphabet_names.items(): n_persons = len(names) similarities = np.empty((n_persons, n_persons)) if len(names) == 1: db_names[clusters_number].append(complete_alphabet_names[letter][0]) db_simplified_names[clusters_number].append(simplified_alphabet_names[letter][0]) clusters_number += 1 continue for i, person1 in enumerate(names): for j, person2 in enumerate(names): # differ = difflib.SequenceMatcher(None, person1, person2) similarities[i][j] = differ.ratio() # similarities[i][j] = fuzz.ratio(person1, person2)/100. # similarities[i][j] = fuzz.token_sort_ratio(person1, person2) / 100. # similarities[i][j] = fuzz.token_set_ratio(person1, person2) / 100. # take the shortest word and find the # similarity between this name and each subslice of the longer name (with the same length). It # returns the higher value. similarities[i][j] = fuzz.partial_ratio(person1, person2) / 100. 
# eps = find_best_eps(similarities) # print(letter, ': ', eps) eps = 0.3 db = DBSCAN(metric='precomputed', min_samples=1, algorithm='brute', eps=eps).fit(1 - similarities) labels = db.labels_ if -1 in labels: logging.info('Some names are not clustered') for i, name in enumerate(complete_alphabet_names[letter]): db_names[labels[i] + clusters_number].append(name) simplified_name = simplified_alphabet_names[letter][i] db_simplified_names[labels[i] + clusters_number].append(simplified_name) unique = np.unique(labels, return_counts=False) clusters_number += len(unique) cluster_rep = {} simple_cluster_rep = {} for id, some_names in db_names.items(): repetitions = [] for name in some_names: repetitions.append(self.persons[name]) cluster_rep[id] = (some_names, repetitions) simple_cluster_rep[id] = (db_simplified_names[id], repetitions) # Debug here to discover which names are correctly clustered self.cluster_repetitions = cluster_rep self.simple_cluster_repetitions = simple_cluster_rep def find_persons_title(self): text = self.text.replace('\n', ' ') new_names = {} for name, occurrence in self.persons.items(): pre_names = re.findall(r'([^ \r\n]+)( ' + name + ')([\r\n]| |$|.)', text, re.IGNORECASE) if len(pre_names) == 0: continue pre_names_occurrences = collections.defaultdict(int) for pre_name in pre_names: # skip prename which end with punctuations, it is not in the same phrase as the subject if pre_name[0][-1] in '!"”“#$%"&\'’()*+,./:;<=>?@[]^_`{|}~ʹ': continue pre_names_occurrences[pre_name[0]] += 1 if len(pre_names_occurrences) == 0: continue max_index = np.argmax(pre_names_occurrences.values()) max_occurrence = list(pre_names_occurrences.values())[max_index] new_prename = list(pre_names_occurrences.keys())[max_index] if float(max_occurrence) / float(occurrence) > 0.5 and max_occurrence > 1: # skip special starting character in the pre-name if new_prename[0] in '!"”"“#$%"&\'’()*+,./:;<=>?@[]^_`{|}~ʹ': new_prename = new_prename[1:] + ' ' + name if new_prename.lower() not in CONJUNCTIONS: new_name = new_prename + ' ' + name logging.info('Adding new name: %s', new_name) new_names[new_name] = max_occurrence PRE_NAMES.add(new_prename.lower()) logging.info('Adding new pre-name: %s', new_prename.lower()) persons = self.persons for new_name, occurrence in new_names.items(): if new_name not in persons: persons[new_name] = occurrence if new_name in persons: persons[new_name] += occurrence self.persons = collections.OrderedDict(sorted(persons.items())) def filter_similar_names(self, similarity): # Winsley is contained in many cluster, insert it into the cluster with more repetitions old_similarity = similarity for key_a, value_a in old_similarity.items(): if len(value_a) > 1: id_best = -1 best = -1 for id in value_a: repetitions = self.cluster_repetitions[id][1] sum_repetitions = sum(repetitions) if sum_repetitions > best: best = sum_repetitions id_best = id similarity[key_a] = [id_best] # add similar names to a cluster new_cluster = self.cluster_repetitions new_simple_cluster = self.simple_cluster_repetitions to_remove = set() to_delete_at_end = set() for key_a, value_a in similarity.items(): if key_a in to_remove: continue # find other names with the same preference same_preferences = set() for key_b, value_b in similarity.items(): if value_b[0] == value_a[0]: same_preferences.add(key_b) # more key with the same preference selected_cluster = -1 if len(same_preferences) > 1: # take the max max = -1 best_key = -1 for preference in same_preferences: occurrences = 
sum(self.cluster_repetitions[preference][1]) + sum( self.cluster_repetitions[similarity[preference][0]][1]) if occurrences > max: max = occurrences best_key = preference for preference in same_preferences: if preference != key_a: to_remove.add(preference) selected_cluster = best_key else: selected_cluster = list(same_preferences)[0] # check if the value of the best is also a key value = similarity[selected_cluster][0] if value in similarity: # the similarity is symmetric? A wants B and B wants A? if similarity[value] != selected_cluster: # take the max and remove the other, a=AB and b=BC occurrences_a = sum(self.cluster_repetitions[selected_cluster][1]) + sum( self.cluster_repetitions[value][1]) occurrences_b = sum(self.cluster_repetitions[value][1]) + sum( self.cluster_repetitions[similarity[value][0]][1]) non_selected_cluster = selected_cluster if np.argmin([occurrences_a, occurrences_b]) == 0 else value selected_cluster = selected_cluster if np.argmax([occurrences_a, occurrences_b]) == 0 else value to_remove.add(selected_cluster) to_remove.add(non_selected_cluster) else: to_remove.add(selected_cluster) to_remove.add(value) else: to_remove.add(selected_cluster) # Update both the list with original names and the one with simplified names add_user = new_cluster[selected_cluster][0] add_repetition = new_cluster[selected_cluster][1] add_simple_user = new_simple_cluster[selected_cluster][0] cluster_repetitions = new_cluster[similarity[selected_cluster][0]] cluster_repetitions[0].extend(add_user) # next operation will update both the original and the simple names list cluster_repetitions[1].extend(add_repetition) cluster_repetitions = new_simple_cluster[similarity[selected_cluster][0]] cluster_repetitions[0].extend(add_simple_user) to_delete_at_end.add(selected_cluster) return to_delete_at_end, new_cluster, new_simple_cluster def associate_simple_single_names(self): single_names = [] single_ids = [] multiple_names = [] multiple_ids = [] # find clusters composed by only 1 name and clusters with more names for id, names_repetitions in self.simple_cluster_repetitions.items(): names = names_repetitions[0] if len(names) == 1 or all(name == names[0] for name in names): single_names.append(names_repetitions) single_ids.append(id) else: multiple_names.append(names_repetitions) multiple_ids.append(id) # compute the similarity between the single names and all other clusters (also other single names) similarity = {} for key_a, single_name_repetitions in zip(single_ids, single_names): single_name = single_name_repetitions[0][0] # single_repetition = single_name_repetitions[1][0] for id, names_repetitions in self.simple_cluster_repetitions.items(): if key_a != id: names = names_repetitions[0] for name in names: if single_name in name or name in single_name: # print(single_name, ' - ', names) if key_a not in similarity: similarity[key_a] = [] similarity[key_a].append(id) break to_delete_at_end, new_cluster, new_simple_cluster = self.filter_similar_names(similarity) fix_indexes_cluster, fix_simple_indexes_cluster = self.delete_names_bottom_up(to_delete_at_end, new_cluster, new_simple_cluster) self.cluster_repetitions = fix_indexes_cluster self.simple_cluster_repetitions = fix_simple_indexes_cluster def delete_names_bottom_up(self, to_delete_at_end, new_cluster, new_simple_cluster): # delete bottom up, to eliminate problem with indexes to_delete_at_end = sorted(list(to_delete_at_end), key=lambda x: x, reverse=True) for key_a in to_delete_at_end: del new_cluster[key_a] del new_simple_cluster[key_a] 
fix_indexes_cluster = {} fix_simple_indexes_cluster = {} i = 0 for cluster_idx, values in new_cluster.items(): fix_indexes_cluster[i] = values fix_simple_indexes_cluster[i] = new_simple_cluster[cluster_idx] i += 1 return fix_indexes_cluster, fix_simple_indexes_cluster def associate_single_names(self): similarity = {} for id1, value1 in self.cluster_repetitions.items(): if len(value1[0]) == 1: for id2, value2 in self.cluster_repetitions.items(): if id1 != id2: single_name = value1[0][0] if any(single_name in name for name in value2[0]): if id1 not in similarity: similarity[id1] = [] similarity[id1].append(id2) to_delete_at_end, new_cluster, new_simple_cluster = self.filter_similar_names(similarity) fix_indexes_cluster, fix_simple_indexes_cluster = self.delete_names_bottom_up(to_delete_at_end, new_cluster, new_simple_cluster) self.cluster_repetitions = fix_indexes_cluster self.simple_cluster_repetitions = fix_simple_indexes_cluster def dealiases(self): replacements = {} for id, names_rep in self.cluster_repetitions.items(): character = 'CCHARACTER' + str(id) names = names_rep[0] for name in names: replacements[name] = character ordered_replacements = {} for k in sorted(replacements, key=len, reverse=True): ordered_replacements[k] = replacements[k] self.dealiased_text = replace_words(self.text, ordered_replacements) return def store(self, filename, data, type='csv'): if type == 'csv': try: with open(filename, 'w', newline='', encoding="utf-8") as csvfile: writer = csv.writer(csvfile) for key, value in data.items(): writer.writerow([key, value]) except IOError: logging.info("I/O error") else: with open(filename, "w", encoding="utf-8") as f: f.write(data) def remove_less_than(self, occurrences): new_persons = {} for name, occurrence in self.persons.items(): if occurrence <= occurrences: continue else: new_persons[name] = occurrence self.persons = new_persons return
from nltk.tag import StanfordNERTagger
import pandas as pd
from sklearn.metrics import f1_score, confusion_matrix
from loader import Load

train, test = Load('c')

ner = StanfordNERTagger('./stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
                        './stanford-ner-2018-10-16/stanford-ner.jar')

data = train
data['tweet'] = ner.tag_sents(data['tweet'].str.split(' '))

pred = []
for i, d in data.iterrows():
    tweet = d['tweet']
    tag = 'IND'
    for w in tweet:
        if w[1] == 'ORGANIZATION':
            tag = 'GRP'
        # elif w[1] == 'PEOPLE':
        #     tag = 'IND'
    pred.append(tag)

print(confusion_matrix(data['label'], pred))
print(f1_score(data['label'], pred, average='macro'))
import os

from nltk.tag import StanfordNERTagger


class NER:
    # Any one-time initialization code can go here. The entire nested question-and-answer
    # dataset is passed as a parameter, in case the initialization requires any of that data.
    def init(self, allQuestions):
        os.environ["STANFORD_MODELS"] = "./Features/stanford-ner-2014-06-16/classifiers/"
        os.environ["CLASSPATH"] = "./Features/stanford-ner-2014-06-16/stanford-ner.jar"
        self.nerMachine = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
        sentences = []
        ids = []
        for q in allQuestions:
            ids.append("Q" + allQuestions[q]['id'])
            sentences.append(allQuestions[q]['question_words'])
            for r in allQuestions[q]['related']:
                ids.append("R" + allQuestions[q]['related'][r]['id'])
                sentences.append(allQuestions[q]['related'][r]['question_words'])
        tagged = self.nerMachine.tag_sents(sentences)
        for i in range(0, len(ids)):
            id = ids[i]
            if id[0] == 'Q':
                qid = id[1:]
                allQuestions[qid]['ner'] = tagged[i]
            else:
                rid = id[1:]
                allQuestions[qid]['related'][rid]['ner'] = tagged[i]
        return

    # Given a specific question, return a feature vector (a one-dimensional array of one
    # or more features).
    def createFeatureVector(self, question, parentQuestion):
        # This is just placeholder code - insert code that actually generates a feature vector here
        # for the given question, and then return that feature vector instead of [0].
        # question['ner'] = self.nerMachine.tag(question['question_words'])
        # parentQuestion['ner'] = self.nerMachine.tag(parentQuestion['question_words'])
        # pprint(question['question_words'])
        # pprint(question['ner'])
        # print(question['ner'])
        # print(parentQuestion['ner'])
        qNer = []
        pNer = []
        for i in question['ner']:
            for j in parentQuestion['ner']:
                if not i[1] == 'O':
                    qNer.append(str.lower(i[0]))
                if not j[1] == 'O':
                    pNer.append(str.lower(j[0]))
        feature = len(list(set(qNer).intersection(pNer)))
        # for i in parentQuestion['ner']:
        #     if not i[1] == 'O':
        #         pNer.append(str.lower(i[0]))
        # print(qNer)
        # print(pNer)
        # feature = len(list(set(qNer).intersection(pNer)))
        # print(feature)
        # Calculate named entity overlap
        return [feature]

    # Returns a list of names for the features generated by this module. Each entry in the
    # list should correspond to a feature in the createFeatureVector() response.
    def getFeatureNames(self):
        return ['NER']
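# Hedged usage sketch; the nested dictionary shape below is inferred from how
# init() walks allQuestions and is an assumption, as is the presence of the
# ./Features/stanford-ner-2014-06-16/ install referenced above:
allQuestions = {
    '1': {'id': '1', 'question_words': ['Who', 'founded', 'Microsoft', '?'],
          'related': {'7': {'id': '7',
                            'question_words': ['Bill', 'Gates', 'founded', 'Microsoft']}}},
}
ner_feature = NER()
ner_feature.init(allQuestions)
print(ner_feature.createFeatureVector(allQuestions['1']['related']['7'],
                                      allQuestions['1']))   # e.g. [1]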
import jieba

from nltk.tag import StanfordPOSTagger, StanfordNERTagger
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.parse.stanford import StanfordDependencyParser

# text_pair (the project-local TextPair wrapper) and the DEBUG flag are assumed
# to be defined elsewhere in this project.


class NLPCore:
    """
    nlp processing including Stanford Word Segmenter, Stanford POS Tagger,
    Stanford Named Entity Recognizer and Stanford Parser
    """

    def __init__(self):
        self.root_path = '../Models/stanfordNLP/'
        # word segmenter
        self.segmenter = StanfordSegmenter(
            path_to_jar=self.root_path + "stanford-segmenter.jar",
            path_to_slf4j=self.root_path + "log4j-over-slf4j.jar",
            path_to_sihan_corpora_dict=self.root_path + "segmenter/",
            path_to_model=self.root_path + "segmenter/pku.gz",
            path_to_dict=self.root_path + "segmenter/dict-chris6.ser.gz")
        # pos tagger
        self.posTagger = StanfordPOSTagger(
            self.root_path + 'pos-tagger/chinese-distsim.tagger',
            path_to_jar=self.root_path + "stanford-postagger.jar")
        # named entity recognizer
        self.nerTagger = StanfordNERTagger(
            self.root_path + 'ner/chinese.misc.distsim.crf.ser.gz',
            path_to_jar=self.root_path + 'stanford-ner.jar')
        # dependency parser
        self.parser = StanfordDependencyParser(
            model_path=self.root_path + 'lexparser/chinesePCFG.ser.gz',
            path_to_jar=self.root_path + 'stanford-parser.jar',
            path_to_models_jar=self.root_path + 'stanford-parser-3.7.0-models.jar',
            encoding='gbk')

    def split_sent_stanford(self, textPair):
        """
        Stanford Word Segmenter, input should be raw text
        :return: also TextPair with raw string of results
        """
        t1 = self.segmenter.segment(textPair.t1)
        t2 = self.segmenter.segment(textPair.t2)  # was textPair.t1, which segmented the first text twice
        if DEBUG:
            print(t1, t2)
        return text_pair.TextPair(t1, t2, textPair.label)

    def split_sents_stanford(self, textPairs):
        """
        Stanford Word Segmenter, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1 for textPair in textPairs]
        sents2 = [textPair.t2 for textPair in textPairs]
        split1 = self.segmenter.segment_sents(sents1).split('\n')
        split2 = self.segmenter.segment_sents(sents2).split('\n')
        rlist = []
        for i in range(len(textPairs)):
            rlist.append(
                text_pair.TextPair(split1[i], split2[i], textPairs[i].label))
            if DEBUG:
                print(split1[i], split2[i])
        return rlist

    def split_sent_jieba(self, textPair):
        jieba.setLogLevel('INFO')
        ger1 = jieba.cut(textPair.t1)
        ger2 = jieba.cut(textPair.t2)
        t1 = ' '.join(ger1)
        t2 = ' '.join(ger2)
        return text_pair.TextPair(t1, t2, textPair.label)

    def pos_tag(self, textPair):
        """
        Stanford POS Tagger, input should be splitted
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()
        t1_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t1_s)])
        t2_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t2_s)])
        if DEBUG:
            print(t1_tag, t2_tag)
        return text_pair.TextPair(t1_tag, t2_tag, textPair.label)

    def pos_tag_pairs(self, textPairs):
        """
        Stanford POS Tagger, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]
        tag1 = self.posTagger.tag_sents(sents1)
        tag2 = self.posTagger.tag_sents(sents2)
        rlist = []
        for i in range(len(tag1)):
            t1_tag = ' '.join([ele[1] for ele in tag1[i]])
            t2_tag = ' '.join([ele[1] for ele in tag2[i]])
            rlist.append(text_pair.TextPair(t1_tag, t2_tag, textPairs[i].label))
            if DEBUG:
                print(t1_tag, t2_tag)
        return rlist

    def ner_tag(self, textPair):
        """
        Stanford Named Entity Recognizer, input should be splitted
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()
        t1_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t1_s)])
        t2_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t2_s)])
        if DEBUG:
            print(t1_ner, t2_ner)
        return text_pair.TextPair(t1_ner, t2_ner, textPair.label)

    def ner_tag_pairs(self, textPairs):
        """
        Stanford Named Entity Recognizer, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]
        tag1 = self.nerTagger.tag_sents(sents1)
        tag2 = self.nerTagger.tag_sents(sents2)
        rlist = []
        for i in range(len(tag1)):
            t1_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag1[i]])
            t2_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag2[i]])
            rlist.append(text_pair.TextPair(t1_ner, t2_ner, textPairs[i].label))
            if DEBUG:
                print(t1_ner, t2_ner)
        return rlist

    def depen_parse(self, textPair):
        """
        Stanford Dependency Parser, input should be splitted
        :return: also TextPair with raw string of results
        """
        print([p.tree() for p in self.parser.raw_parse(textPair.t1)])
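# Hedged usage sketch; text_pair.TextPair and the ../Models/stanfordNLP/ model
# files are external dependencies of this class and are assumed to exist:
nlp_core = NLPCore()
pair = text_pair.TextPair('奥巴马 访问 北京', '他 到达 北京', 1)
print(nlp_core.ner_tag(pair).t1)   # tokens rendered as word#TAG pairs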
class TextPreprocesser(object): def __init__(self, decode_error='strict', strip_accents='unicode', ignore_list=[], lowercase=True, \ remove_html=True, join_urls=True, use_bigrams=True, use_ner=True, stanford_ner_path="", \ use_lemmatizer=False, max_df=0.95, min_df=1, max_features=None): self.stanford_ner_path = stanford_ner_path # path to stanford NER self.decode_error = decode_error # options: {‘strict’, ‘ignore’, ‘replace’} self.strip_accents = strip_accents # options: {‘ascii’, ‘unicode’, None} self.ignore_list = ignore_list self.lowercase = lowercase self.remove_html = remove_html self.join_urls = join_urls self.use_bigrams = use_bigrams self.use_ner = use_ner self.use_lemmatizer = use_lemmatizer # use lemmatizer instead of stemmer? self.max_df = max_df # maximum document frequency self.min_df = min_df # remove terms that occur in less than min_df documents self.max_features = max_features # keep only top-N words according to tf across corpus self.sentence_splitter = PunktSentenceTokenizer( ).tokenize # Punkt sentence splitter self.stemmer = SnowballStemmer("english").stem # Snowball stemmer self.lemmatizer = WordNetLemmatizer().lemmatize # WordNet lemmatizer self.base_tokenizer = CountVectorizer().build_tokenizer( ) # sklearn tokenizer works the best, I think... self.stop_words = stopwords.words( "english") # nltk list of 128 stopwords self.token_pattern = re.compile( r'(?u)\b(\w*[a-zA-Z_]\w+|\w+[a-zA-Z_]\w*)\b' ) # default value was r'(?u)\b\w\w+\b' self.numeric_pattern = re.compile(r'^[0-9]+$') # number regex self.url_pattern = re.compile(r'((http://)?(www\..*?\.\w+).*?)\s') self.compound_pattern = re.compile(r'\w+(\-\w+)+') if self.use_lemmatizer: self.tokenizer = CustomTokenizer(self.base_tokenizer, self.lemmatizer, self.token_pattern, self.numeric_pattern) else: self.tokenizer = CustomTokenizer(self.base_tokenizer, self.stemmer, self.token_pattern, self.numeric_pattern) def find_nbest_bigrams(self, corpus, n, metric, min_freq): print "finding top-%d bigrams using %s..." % (n, metric) alltokens = [] simplerTokenizer = CustomTokenizer(self.base_tokenizer, lambda x: x, re.compile(".*"), re.compile("^$")) for doc in corpus: for token in [t for t in simplerTokenizer(doc)]: alltokens.append(token) bigram_measures = nltk.collocations.BigramAssocMeasures() finder = BigramCollocationFinder.from_words(alltokens) finder.apply_freq_filter( min_freq) # bigrams must appear at least 5 times if metric.lower() == "pmi": best_bigrams = finder.nbest(bigram_measures.pmi, n) # doctest: +NORMALIZE_WHITESPACE elif metric.lower() == "chi_sq": best_bigrams = finder.nbest(bigram_measures.chi_sq, n) # doctest: +NORMALIZE_WHITESPACE else: raise Exception("Unknown metric for bigram finder") return best_bigrams def remove_punctuation(self, text): if not hasattr(self, 'simplerTokenizer'): self.simplerTokenizer = CustomTokenizer(self.base_tokenizer, lambda x: x, self.token_pattern, self.numeric_pattern) tokens = self.simplerTokenizer(text) return ' '.join(tokens) def tag_corpus_ner(self, corpus): if not hasattr(self, 'stanford_ner'): self.stanford_ner = StanfordNERTagger( self.stanford_ner_path + "classifiers/english.all.3class.distsim.crf.ser.gz", self.stanford_ner_path + "stanford-ner.jar") self.stanford_ner._stanford_jar = self.stanford_ner_path + "stanford-ner.jar:" + self.stanford_ner_path + "lib/*" print "splitting sentences in corpus (for NER)..." 
corpus_sentences = [] sentence_to_doc_map = {} sent_no = 0 for d in xrange(len(corpus)): for sent in self.sentence_splitter(corpus[d]): corpus_sentences.append(sent) sentence_to_doc_map[sent_no] = d sent_no += 1 tokenized_sentences = [] for sent in corpus_sentences: tokenized_sentences.append( [t for t in re.split(r'\s+', sent) if len(t) > 0]) #tokenized_sentences = [re.split(r'\s+', sent) for sent in corpus_sentences] print "tagging sentences with Stanford NER..." tagged_sentences = self.stanford_ner.tag_sents(tokenized_sentences) # process NER output tagged_corpus = [] current_doc_no = 0 current_doc = [] for i in xrange(len(tagged_sentences)): doc_no = sentence_to_doc_map[i] if doc_no == current_doc_no: current_doc += tagged_sentences[i] else: tagged_corpus.append(current_doc) current_doc = [] current_doc_no = doc_no tagged_corpus.append(current_doc) # get dictionary of named entities per document named_entities = [] for tagged_doc in tagged_corpus: tags = {} current_ne = [] for token, tag in tagged_doc: if current_ne: if tag == "O" or (tag != "O" and tag != current_ne[-1][1]): tags[' '.join([t for t, _ in current_ne ])] = current_ne[0][1] current_ne = [] if tag != "O": current_ne.append((token, tag)) if current_ne: tags[' '.join([t for t, _ in current_ne])] = current_ne[0][1] named_entities.append(tags) return tagged_corpus, named_entities def preprocess_corpus(self, corpus): print "preprocessing corpus..." print "corpus size:", len(corpus) # first pass over the corpus: prepare for NER print "first pass over the corpus...\n\tunescape characters" if self.remove_html: print "\tremove html" if self.strip_accents: print "\tstrip accents" if self.join_urls: print "\tjoin URLs" print "\tjoin compound words\n\tspace out punctuation" for d in xrange(len(corpus)): corpus[d] = HTMLParser.HTMLParser().unescape(corpus[d]) + " " if self.remove_html: corpus[d] = remove_html(corpus[d]) if self.strip_accents == 'unicode': corpus[d] = strip_accents_unicode(corpus[d]) if self.join_urls: corpus[d] = join_urls(corpus[d], self.url_pattern) corpus[d] = join_compound_words(corpus[d], self.compound_pattern) corpus[d] = space_out_punctuation(corpus[d]) if self.use_ner: tagged_corpus, named_entities = self.tag_corpus_ner(corpus) # debug NER fw = codecs.open("debug_NER.txt", "w", "utf-8") for tags in named_entities: fw.write(unicode(tags.items()) + "\n") fw.close() print "merging named entities as single tokens..." for d in xrange(len(corpus)): tags = named_entities[d] for ne in tags: corpus[d] = corpus[d].replace(ne, re.sub(r'\s+', '', ne)) # second pass over the corpus: remove punctuation and convert to lowercase # (these were useful above for NER, but now can be removed) print "second pass over the corpus..." print "\tremove punctuation" if self.lowercase: print "\tconvert to lowercase" simplerTokenizer = CustomTokenizer(self.base_tokenizer, lambda x: x, self.token_pattern, self.numeric_pattern) for d in xrange(len(corpus)): corpus[d] = self.remove_punctuation(corpus[d]) if self.lowercase: corpus[d] = corpus[d].lower() if self.use_bigrams: # find top N bigrams #best_bigrams = self.find_nbest_bigrams(corpus, 100, "pmi", 10) best_bigrams = self.find_nbest_bigrams(corpus, 100, "chi_sq", 10) # debug bigrams fw = codecs.open("debug_bigrams.txt", "w", "utf-8") for w1, w2 in best_bigrams: fw.write(w1 + " " + w2 + "\n") fw.close() print "merging bigrams as single tokens..." 
for d in xrange(len(corpus)): for w1, w2 in best_bigrams: corpus[d] = corpus[d].replace(w1 + " " + w2, w1 + w2) return corpus def convert_to_bag_of_words(self, corpus): print "converting corpus to bag-of-words format..." print "\ttokenize documents\n\tremove stopwords" print "\tapply lemmatizer" if self.use_lemmatizer else "\tapply stemmer" print "\tremove rare words\n\tremove very frequent words" vectorizer = CountVectorizer(input='content', decode_error=self.decode_error, strip_accents=self.strip_accents, tokenizer=self.tokenizer, stop_words=self.stop_words + self.ignore_list, lowercase=self.lowercase, max_df=self.max_df, min_df=self.min_df, max_features=self.max_features) dtm = vectorizer.fit_transform(corpus) # a sparse matrix vocab = vectorizer.get_feature_names() # a list print "vocabulary size:", len(vocab) # debug vocabulary fw = codecs.open("vocabulary.txt", "w", "utf-8") for word in vocab: fw.write(word + "\n") fw.close() return dtm, vocab
import os
from itertools import groupby

import numpy as np
import pymongo
from pymongo import UpdateOne
from nltk.tag import StanfordNERTagger
from nltk.tokenize import wordpunct_tokenize

client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['yahoofinance_news']
news = list(db['news'].find({}))

path = 'stanford-ner-2015-04-20/stanford-ner.jar'
os.environ['STANFORD_MODELS'] = 'stanford-ner-2015-04-20/classifiers'
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz', path, java_options='-mx2g')


def find_orgs(token_tags):
    # group consecutive tokens that share a tag and keep the ORGANIZATION spans
    nes = groupby(token_tags, key=lambda d: d[1])
    nes2 = []
    for k, v in nes:
        if k == 'ORGANIZATION':
            nes2.append(' '.join([t[0] for t in v]))
    return nes2


doc_tokens = [wordpunct_tokenize(n['content']) for n in news]
nes = map(find_orgs, st.tag_sents(doc_tokens))
nes = map(np.unique, nes)

requests = []
for n, ne in zip(news, nes):
    # list() so the numpy array can be encoded by pymongo
    requests.append(UpdateOne({'_id': n['_id']}, {"$set": {'nes': list(ne)}}))
db['news'].bulk_write(requests)
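# find_orgs operates on plain (token, tag) pairs, so it can be sanity-checked
# without running the tagger or MongoDB:
sample = [('Apple', 'ORGANIZATION'), ('Inc.', 'ORGANIZATION'), ('hired', 'O'), ('Tim', 'PERSON')]
print(find_orgs(sample))   # ['Apple Inc.']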
class NerAnalysis:

    def __init__(self):
        self.dict_of_dicts = {}
        self.tokenizer = nltk.tokenize.TweetTokenizer()
        config = configparser.ConfigParser()
        config.read("./config.ini")
        folder = config['NER']['stanford_ner_folder']
        self.stanford_tagger = StanfordNERTagger(
            folder + r'\classifiers\english.muc.7class.distsim.crf.ser.gz',
            folder + r'\stanford-ner.jar',
            encoding='utf-8')

    def tag_text(self, text):
        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [
            self.tokenizer.tokenize(sent) for sent in sentences
        ]
        classified_sentences = self.stanford_tagger.tag_sents(tokenized_sentences)
        list_to_return = []
        for i in range(len(classified_sentences)):
            classified_sent = classified_sentences[i]
            sentence = sentences[i]
            result = self.process_sent(classified_sent, sentence)
            list_to_return.append(result)
        return list_to_return

    def tokenize_text(self, text):
        sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sentences = sentence_tokenizer.tokenize(text)
        return sentences

    def process_sent(self, classified_sentence, sentence):
        sentence_with_info = []
        sentence_part = sentence
        # tokenization
        tokens = self.tokenizer.tokenize(sentence)
        num_of_tokens = len(classified_sentence)
        for index, token_with_tag in enumerate(classified_sentence):
            word = token_with_tag[0]
            ner_tag = token_with_tag[1]
            # if len(word) == 1 and not word.isalpha():
            #     pos_tag = "SYM"
            if ner_tag == 'O' or (ner_tag != 'O' and not word.isalpha()):
                if (index == num_of_tokens - 1):
                    tup = sentence_part, None
                    sentence_with_info.append(tup)
                continue
            token_position_final = None
            sentence_splitting = sentence_part
            current_len = 0
            while True:
                token_position = sentence_splitting.find(word)
                # checking after found position
                good_after = False
                if (token_position + len(word) < len(sentence_splitting)):
                    char_after_token = sentence_splitting[token_position + len(word)]
                    if (char_after_token.isalpha() == False):
                        good_after = True
                else:
                    good_after = True
                # checking before found position
                good_before = False
                if token_position > 0:
                    char_before_token = sentence_splitting[token_position - 1]
                    if (char_before_token.isalpha() == False):
                        good_before = True
                else:
                    # nothing before the token, so it is a full word
                    good_before = True
                if good_before and good_after:
                    token_position_final = token_position + current_len
                    break
                current_len = token_position + 1
                sentence_splitting = sentence_splitting[(token_position + 1):]
            # at this point we have the position
            # "token_position_final" where the token was found
            # split_sentence is a list of parts
            # which are split around the found token
            split_sentence = sentence_part.split(word, 1)
            part_before = sentence_part[0:token_position_final]
            part_after = sentence_part[(token_position_final + len(word)):]
            # pre token part
            tup = part_before, None
            sentence_with_info.append(tup)
            # token and its tag
            tup = word, ner_tag
            sentence_with_info.append(tup)
            sentence_part = part_after
        return sentence_with_info
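# Usage sketch (assumes ./config.ini has a [NER] section whose stanford_ner_folder
# points at a local Stanford NER install, and that NLTK's punkt data is present):
analysis = NerAnalysis()
for tagged_sentence in analysis.tag_text("Angela Merkel spoke in Berlin on Monday."):
    print(tagged_sentence)   # list of (text_part_or_token, tag_or_None) tuples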
# print 'l:____________', type(l)
# text = str(' '.join(l))
# print text
# sent_list = tokenizer.tokenize(text)
# print 'sentence list___________'
# print sent_list
# a = [sent.split() for sent in sent_list]
# print 'aaaaaaaaaaaa'
# print a
# tagged_sents = st.tag_sents(a)
# print tagged_sents

begin = time.time()
predicted_tagged_corpus = []
for tagged_body in tagged_bodies:
    body = word_list(tagged_body)
    sentences_list = tokenizer.tokenize(str(' '.join(body)))
    sent_list_splitted = split_sentence_list(sentences_list)
    predicted_tagged_body = st.tag_sents(sent_list_splitted)
    predicted_tagged_corpus.append(predicted_tagged_body)

print predicted_tagged_corpus[0]
print predicted_tagged_corpus[-1]
print time.time() - begin, ' seconds!'
class EnPreprocesser(preprocesser.Preprocesser): def __init__(self, strip_accents="unicode", lowercase=True, remove_html=True, join_urls=True, use_bigrams=True, use_ner=True, stanford_ner_path="", use_lemmatizer=False, use_stemmer=False): self.stanford_ner_path = stanford_ner_path # path to stanford NER self.strip_accents = strip_accents # options: {‘ascii’, ‘unicode’, None} self.lowercase = lowercase self.remove_html = remove_html self.join_urls = join_urls self.use_bigrams = use_bigrams self.use_ner = use_ner self.use_lemmatizer = use_lemmatizer # use lemmatizer instead of stemmer? self.use_stemmer = use_stemmer # self.stanford_corenlp = StanfordCoreNLP(self.stanford_corenlp_path, memory="8g") self.sentence_splitter = PunktSentenceTokenizer( ).tokenize # Punkt sentence splitter self.stemmer = SnowballStemmer("english").stem # Snowball stemmer self.lemmatizer = WordNetLemmatizer().lemmatize # WordNet lemmatizer self.base_tokenizer = CountVectorizer().build_tokenizer( ) # sklearn tokenizer works the best, I think... self.stop_words = stopwords.words( "english") # nltk list of 128 stopwords self.token_pattern = re.compile( r"(?u)\b(\w*[a-zA-Z_]\w+|\w+[a-zA-Z_]\w*)\b" ) # default value was r"(?u)\b\w\w+\b" self.numeric_pattern = re.compile(r"^[0-9]+$") # number regex self.url_pattern = re.compile(r"((http://)?(www\..*?\.\w+).*?)\s") self.compound_pattern = re.compile(r"\w+(\-\w+)+") if self.use_lemmatizer: self.tokenizer = CustomTokenizer(self.base_tokenizer, self.lemmatizer, self.token_pattern, self.numeric_pattern) elif self.use_stemmer: self.tokenizer = CustomTokenizer(self.base_tokenizer, self.stemmer, self.token_pattern, self.numeric_pattern) else: self.tokenizer = CustomTokenizer(self.base_tokenizer, lambda x: x, self.token_pattern, self.numeric_pattern) def find_nbest_bigrams(self, corpus, n, metric, min_freq): """ Find the top-N most frequently occurring bigrams within the corpus. """ print("\nfinding top-%d bigrams using %s..." % (n, metric)) alltokens = [] simplerTokenizer = CustomTokenizer(self.base_tokenizer, lambda x: x, re.compile(".*"), re.compile("^$")) for doc in corpus: for token in [t for t in simplerTokenizer(doc)]: alltokens.append(token) bigram_measures = nltk.collocations.BigramAssocMeasures() finder = BigramCollocationFinder.from_words(alltokens) finder.apply_freq_filter( min_freq) # bigrams must appear at least 5 times if metric.lower() == "pmi": best_bigrams = finder.nbest(bigram_measures.pmi, n) # doctest: +NORMALIZE_WHITESPACE elif metric.lower() == "chi_sq": best_bigrams = finder.nbest(bigram_measures.chi_sq, n) # doctest: +NORMALIZE_WHITESPACE else: raise Exception("Unknown metric for bigram finder") return best_bigrams def remove_punctuation(self, text): """ Remove punctuation. 
""" return "".join(re.findall(r"[a-zA-Z0-9\s]+", text)) # return "".join(re.findall(r"[a-zA-Z0-9,.;!:'?\s]+", tokens)) # return tokens def tag_corpus_ner(self, corpus): """ Tag named entitties in corpus with stanfordNER toolkit """ if not hasattr(self, "stanford_ner"): with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=DeprecationWarning) # import imp self.stanford_ner = StanfordNERTagger( self.stanford_ner_path + "classifiers/english.conll.4class.distsim.crf.ser.gz", self.stanford_ner_path + "stanford-ner.jar") self.stanford_ner._stanford_jar = self.stanford_ner_path + "stanford-ner.jar:" + self.stanford_ner_path + "lib/*" print("splitting sentences in corpus (for NER)...") corpus_sentences = [] sentence_to_doc_map = {} sent_no = 0 for d in tqdm(range(len(corpus))): # print("\r%s " % d, end="") for sent in self.sentence_splitter(corpus[d]): corpus_sentences.append(sent) sentence_to_doc_map[sent_no] = d sent_no += 1 tokenized_sentences = [] for sent in corpus_sentences: tokenized_sentences.append( [t for t in re.split(r"\s+", sent) if len(t) > 0]) #tokenized_sentences = [re.split(r'\s+', sent) for sent in corpus_sentences] print("tagging sentences with Stanford NER...") tagged_sentences = [] for batch in tqdm(range(self.ner_batch)): # print("\r%s/%s tagging sentences with Stanford NER..." % (batch, self.ner_batch), end="") chunk = int(len(corpus) / self.ner_batch) tagged_sentences += self.stanford_ner.tag_sents( tokenized_sentences[batch * chunk:(batch + 1) * chunk]) # process NER output tagged_corpus = [] current_doc_no = 0 current_doc = [] for i in range(len(tagged_sentences)): doc_no = sentence_to_doc_map[i] if doc_no == current_doc_no: current_doc += tagged_sentences[i] else: tagged_corpus.append(current_doc) current_doc = [] current_doc_no = doc_no tagged_corpus.append(current_doc) # get dictionary of named entities per document named_entities = [] for tagged_doc in tagged_sentences: tags = {} current_ne = [] for token, tag in tagged_doc: if current_ne: if tag == "O" or (tag != "O" and tag != current_ne[-1][1]): tags[" ".join([t for t, _ in current_ne ])] = current_ne[0][1] current_ne = [] if tag != "O": current_ne.append((token, tag)) if current_ne: tags[" ".join([t for t, _ in current_ne])] = current_ne[0][1] named_entities.append(tags) return tagged_sentences, named_entities def preprocess_corpus(self, corpus): """ Preprocess the corpus. 
""" self.ner_batch = int(math.ceil(len(corpus) / 5000)) print("preprocessing corpus...") print("corpus size: %i, ner_batch=%i" % (len(corpus), self.ner_batch)) # first pass over the corpus: prepare for NER print("first pass over the corpus...\n\tunescape characters") if self.remove_html: print("\tremove html") if self.strip_accents: print("\tstrip accents") if self.join_urls: print("\tjoin URLs") print("\tjoin compound words\n\tspace out punctuation") for d in tqdm(range(len(corpus))): corpus[d] = html.unescape(corpus[d]) + " " if self.remove_html: corpus[d] = self.remove_html_tags(corpus[d]) if self.strip_accents == "unicode": corpus[d] = self.strip_accents_unicode(corpus[d]) if self.join_urls: corpus[d] = self.join_urls_to_token(corpus[d], self.url_pattern) corpus[d] = self.join_compound_words(corpus[d], self.compound_pattern) corpus[d] = self.space_out_punctuation(corpus[d]) # print("\r\t%s" % d, end="") if self.use_ner: tagged_corpus, named_entities = self.tag_corpus_ner(corpus) # debug NER fw = codecs.open("debug_NER.txt", "w", "utf-8") for tags in named_entities: fw.write("%s\n" % list(tags.items())) fw.close() print("\nmerging named entities as single tokens...") for d in tqdm(range(len(tagged_corpus))): tags = named_entities[d] for ne in tags: corpus[d] = corpus[d].replace(ne, re.sub(r"\s+", "", ne)) # print("\r%s " % d, end="") # second pass over the corpus: remove punctuation and convert to lowercase # (these were useful above for NER, but now can be removed) print("\nsecond pass over the corpus...") if self.lowercase: print("\tconvert to lowercase") print("\tremove punctuation") for d in tqdm(range(len(corpus))): corpus[d] = self.remove_punctuation(corpus[d]) if self.lowercase: corpus[d] = corpus[d].lower() # print("\r\t%s" % d, end="") if self.use_bigrams: # find top N bigrams # best_bigrams = self.find_nbest_bigrams(corpus, 100, "pmi", 10) best_bigrams = self.find_nbest_bigrams(corpus, 100, "chi_sq", 10) # debug bigrams fw = codecs.open("debug_bigrams.txt", "w", "utf-8") for w1, w2 in best_bigrams: fw.write(w1 + " " + w2 + "\n") fw.close() print("\n") for d in range(len(corpus)): print("\r%s merging bigrams as single tokens..." % d, end="") for w1, w2 in best_bigrams: corpus[d] = corpus[d].replace(w1 + " " + w2, w1 + w2) return [sent for sent in corpus] # helper functions def strip_accents_unicode(self, text): return "".join([ c for c in unicodedata.normalize("NFKD", text) if not unicodedata.combining(c) ]) def remove_html_tags(self, text): return re.sub(r"( ?\.+ )+", " . ", re.sub(r"<[^>]*>", " . ", text)) def join_urls_to_token(self, text, url_pattern): m = re.search(url_pattern, text) while m: text = re.sub(url_pattern, m.group(3).replace("http://", "").replace(".", ""), text) m = re.search(url_pattern, text) return text def join_compound_words(self, text, compound_pattern): m = re.search(compound_pattern, text) while m: text = re.sub(m.group(0), m.group(0).replace("-", ""), text) m = re.search(compound_pattern, text) return text def space_out_punctuation(self, text): text = re.sub(r",\s", " , ", text) text = re.sub(r"\.\.\.\s", " ... ", text) text = re.sub(r"\.", " . ", text) text = re.sub(r";\s", " ; ", text) text = re.sub(r":\s", " : ", text) text = re.sub(r"\?\s", " ? ", text) text = re.sub(r"!\s", " ! 
", text) text = re.sub(r"\"", " \" ", text) text = re.sub(r"\'", " \' ", text) text = re.sub(r"\s\(", " ( ", text) text = re.sub(r"\)\s", " ) ", text) text = re.sub(r"\s\[", " [ ", text) text = re.sub(r"\]\s", " ] ", text) text = re.sub(r"-", " - ", text) text = re.sub(r"_", " _ ", text) text = re.sub(r"\n", " ", text) text = re.sub(r"\r", " ", text) text = re.sub(r"\s+", " ", text) tokens = self.tokenizer(text) tokens = " ".join(tokens) return tokens
from sys import stdin
from json import loads
from collections import Counter

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

listOfListOfWords = []

# for each line from stdin
for line in stdin:
    try:
        # load json-tweet
        tweet = loads(line)
        tweetText = tweet['text']
        # tokenize tweet-text
        listOfWords = word_tokenize(tweetText)
        listOfListOfWords.append(listOfWords)
    except:
        pass

# StanfordNER instance (the model is resolved via STANFORD_MODELS/CLASSPATH)
nerClf = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
nerPair = nerClf.tag_sents(listOfListOfWords)

# keep words tagged LOCATION that are longer than 2 characters
locations = []
for ner in nerPair:
    for word, nerType in ner:
        if nerType == 'LOCATION' and len(word) > 2:
            locations.append(word.lower())

# count how often each location occurs and print (location, frequency) pairs
for location, frequency in Counter(locations).items():
    print((location, frequency))
if originalSize < 5000000 and originalSize > 1000:
    sentences = sent_tokenize(text)
    for sent in sentences:
        if (re.search(r'([A-Z]\w+ [1-9]*(1[0-9])*(2[0-9])*(3[0,1])*, \d{4})', sent)) and is502:
            sentences_with_date.append(sent)
            words = pos_tag(word_tokenize(sent))
            for word in words:
                if word[1] == "VB":
                    sets = wn.synsets(word[0])
                    for s in sets:
                        for w in wordList:
                            if w.path_similarity(s) > 0.3:
                                relevant_sents.append(sent)
    sents = st.tag_sents([word_tokenize(sent) for sent in sentences])
    for classedSent in sents:
        for word in classedSent:
            if 'PERSON' in word[1] and not previousPerson:
                i += 1
                names.append(word[0])
                previousPerson = True
            elif 'PERSON' in word[1]:
                names[i] = names[i] + " " + word[0]
                previousPerson = True
            else:
                previousPerson = False
elif originalSize > 1000:
    chunks = (text[0 + i:10000 + i] for i in range(0, len(text), 10000))
    for chunk in chunks:
        sentences = sent_tokenize(chunk)
if ("in a " in str(sentence)):
    string = str(sentence)
    index = string.index("in a ") + len("in a ")
    string = string[index:]
    word = nltk.word_tokenize(string)[0]
    places.append(word)

for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
    if (pos == 'NNS'):
        if (word not in entities):
            entities.append(word)
    if ('VB' in pos):
        if (word not in verbs):
            verbs.append(word)

classified_text = st.tag_sents(tokenized_sents)
for item in classified_text:
    for x, y in item:
        if (y == 'PERSON'):
            if (x not in persons):
                persons.append(str(x))

print(entities)
print(len(entities))
print(persons)
print(len(persons))

outfile = open("persons", "wb")
pickle.dump(entities, outfile)   # note: this pickles `entities`, although the file is named "persons"
outfile.close()
class EmailGraph: #http://py2neo.org/2.0/intro.html#nodes-relationships #Creates a New Graph (You will Need to Update this Function for your own install) def __init__(self, user, pwrd): authenticate("localhost:7474", user, pwrd) self.graph = Graph("http://localhost:7474/db/data/") java_path = "C:\ProgramData\Oracle\Java\javapath\java.exe" os.environ['JAVAHOME'] = java_path self.st = StanfordNERTagger('C:\stanford-ner-2015-12-09\classifiers\english.conll.4class.distsim.crf.ser.gz',\ 'C:\stanford-ner-2015-12-09\stanford-ner.jar') self.stop_words = nltk.corpus.stopwords.words('english') self.legal_words = {"section","fw","re","ops","fyi","doc no","case no","subtitle","btw","usc","foia","chapter","u.s.c",\ "report","attachment","attachments","note","amended", "ebook","subject","unclassified department of state case","doc",\ "unclassified u.s. department of state","original message","project", "copyright", "pls", "you","u.s. department of state case no"} #process email: removes some of the headings before looking for keywords def process_email(self, email): processed = "" for line in email.split('\n'): s = line.lower() if s.startswith("unclassified u.s. department of state") or \ s.startswith("release in") or \ s.startswith("original message") or \ s.startswith("to:") or \ s.startswith("from:") or \ s.startswith("sent:") or \ s.startswith("cc:"): pass else: if len(line) > 0 and line[-1] == '.': processed = processed + line + ' ' else: processed = processed + line + '. ' return processed #filter_by_contents: receives a list of noun_phrases and filters out phrases contained in longer phrases elsewhere in the list def filter_by_contents(self, noun_phrases): in_others = [] for i, candidate in enumerate(noun_phrases): for j, other in enumerate(noun_phrases): if i != j: if candidate[0].lower() in other[0].lower() and candidate[ 0] != other[0]: #compare each phrase with another in_others.append(candidate) #filter out our identified 'duplicate' words and stopwords. filtered_words = [w for w in noun_phrases if w not in in_others and \ w[0].lower() not in self.legal_words and w[0].lower() not in self.stop_words] #create a Frequency Distribution unigram_fd = nltk.FreqDist(filtered_words) #get the most common phrases common_noun_phrases = unigram_fd.most_common(20) result = [] words = set([w[0][0].lower() for w in common_noun_phrases]) for w in words: best_match = None for phrase in common_noun_phrases: if phrase[0][0].lower() == w: if best_match is None: best_match = phrase else: best_match = (best_match[0], best_match[1] + phrase[1]) result.append(best_match) return sorted([w for w in result], key=lambda w: w[1], reverse=True) #filter_by_hypernym: receives a list of candidates and finds the best hypernym for each. #I started with code by Anna Swigart, ANLP 2015, and her concept of using a dictionary to store #terms from WordNet, however this code drastically departs from her algorithm. 
    def filter_by_hypernym(self, candidates):
        results = []
        for term in candidates:  # loop through the list of candidates
            synsets = wn.synsets(term[0][0], 'n')  # obtain the noun synsets for the phrase
            if len(synsets) >= 1:
                hypers = synsets[0].hypernyms() + synsets[0].instance_hypernyms()
                if len(hypers) >= 1:
                    results.append(((term[0][0], hypers[0].name().split('.')[0]), term[1]))
                else:
                    results.append(term)
            else:
                results.append(term)
        return results

    # algorithm for extracting key phrases from an email body
    def final_algorithm(self, email):
        # split the cleaned email into sentences
        sentences = nltk.sent_tokenize(self.process_email(email))
        tokenized_sentences = []
        for s in sentences:
            # get the filtered tokens for each sentence
            tokenized_sentences.append([word for word in nltk.word_tokenize(s)
                                        if not re.search('[0-9]', word)
                                        and word.lower() not in self.legal_words
                                        and len(word) > 2])

        # separate the NER-tagged entities from the rest
        def get_entities(tags):
            result = []
            curr = []
            for ent in tags:
                if ent[1] == 'O':
                    if len(curr) > 0:
                        result.append(curr)
                        curr = []
                else:
                    if len(curr) > 0:
                        if not curr[0][1] == ent[1].lower():
                            result.append(curr)
                            curr = [(ent[0], ent[1].lower())]
                        else:
                            curr = curr + [(ent[0], ent[1].lower())]
                    else:
                        curr = [(ent[0], ent[1].lower())]
            return result

        # NER tag each of the sentences
        tagged_sents = self.st.tag_sents(tokenized_sentences)
        entity_names = []
        for s in tagged_sents:
            entity_names = entity_names + get_entities(s)

        # reorganize the entities for further processing
        def compress_entities(entities):
            new_list = []
            for entity in entities:
                result = " ".join([w[0] for w in entity])
                new_list.append((result, entity[0][1]))
            return new_list

        entity_names = compress_entities(entity_names)
        # print(entity_names)  # print unique entity names
        noun_phrases = entity_names
        # candidates filtered by duplicate nouns and rescored by length
        noun_phrases = self.filter_by_contents(noun_phrases)
        # print(noun_phrases)
        # candidates with better categories/hypernyms
        noun_phrases = self.filter_by_hypernym(noun_phrases)
        # print("Email:\n" + email)
        print("Key Phrases:\n" + str(noun_phrases))
        return noun_phrases

    # clears out a graph
    def delete(self):
        self.graph.delete_all()

    # checks whether a node exists in a graph
    # http://stackoverflow.com/questions/22134649/how-to-check-if-a-node-exists-in-neo4j-with-py2neo
    def find_existing(self, label, key, value):
        mynode = list(self.graph.find(label, property_key=key, property_value=value))
        if len(mynode) > 0:  # node found
            return mynode[0]
        else:  # no node found
            return None

    # adds a new 'email' data element to the graph
    # code based on http://py2neo.org/2.0/intro.html#nodes-relationships
    def add_to_graph(self, data_element, terms):
        # ['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'Metadata From',
        #  'MetadataDateSent', 'ExtractedSubject', 'ExtractedTo',
        #  'ExtractedFrom', 'ExtractedBodyText', 'RawText', 'Label']
        email_id = data_element['DocNumber']
        email_feeling = data_element['NewLabel']
        email = self.find_existing("Email", "docid", email_id)
        if email is None:
            if str(email_feeling) == '1':
                email_feelstr = 'emotional'
                n = 'E'
            else:
                email_feelstr = 'neutral'
                n = 'N'
            email = Node("Email", name=n, docid=email_id, tone=email_feelstr,
                         subject=data_element["ExtractedSubject"],
                         date=data_element['MetadataDateSent'])
        s = email

        # add From nodes
        from_id_all = data_element['ExtractedFrom']
        if type(from_id_all) is str:
            for from_id_i in from_id_all.split(';'):
                from_id = from_id_i.strip().strip('\'')
                sender = self.find_existing("User", "address", from_id)
                if sender is None:
                    sender = Node("User", address=from_id)
                s = s | Relationship(sender, "SENT", email)

        # add To nodes
        to_id_all = data_element['ExtractedTo']
        if type(to_id_all) is str:
            for to_id_i in to_id_all.split(';'):
                to_id = to_id_i.strip().strip('\'')
                receiver = self.find_existing("User", "address", to_id)
                if receiver is None:
                    receiver = Node("User", address=to_id)
                s = s | Relationship(receiver, "RECEIVED", email)

        # add Emotion nodes
        emote_all = data_element['Emotions']
        # print(emote_all)
        if type(emote_all) is str:
            print("Emotions: " + str(emote_all))
            for emote in emote_all.split(';'):
                if len(emote) > 0:
                    emotion = self.find_existing("Emotion", "name", emote)
                    if emotion is None:
                        emotion = Node("Emotion", name=emote)
                    s = s | Relationship(email, "EMOTED", emotion)
        self.graph.create(s)

        # add keyword and category nodes for each extracted term
        for item in range(0, len(terms)):
            keyword = terms[item][0][0]
            category = terms[item][0][1]
            n = self.find_existing("Keyword", "name", keyword)
            if n is None:
                n = Node("Keyword", name=keyword)
            s = Relationship(email, "MENTIONS", n)
            c = self.find_existing("Category", "name", category)
            if c is None:
                c = Node("Category", name=category)
            s = s | Relationship(n, "IS_TYPE_OF", c)
            self.graph.create(s)

    # get_random_emails: returns a number of random emails from a given data frame
    def get_random_emails(self, data_set, number):
        random_index = np.random.permutation(data_set.index)
        # .ix is deprecated in newer pandas; .loc is the modern equivalent
        full_data_shuffled = data_set.ix[random_index,
                                         ['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo',
                                          'Metadata From', 'MetadataDateSent', 'ExtractedSubject',
                                          'ExtractedTo', 'ExtractedFrom', 'ExtractedBodyText',
                                          'RawText', 'NewLabel', 'Emotions']]
        full_data_shuffled.reset_index(drop=True, inplace=True)
        # return the first `number` rows of the shuffled frame
        return full_data_shuffled.loc[0:number - 1]

    # adds a specified number of emails from a dataset
    def add_new_emails(self, num, total_df):
        selected_emails = self.get_random_emails(total_df, num)
        selected_emails["MetadataDateSent"].fillna(value='<blank>', inplace=True)
        selected_emails["ExtractedSubject"].fillna(value='<blank>', inplace=True)
        data_list = selected_emails["RawText"].values.tolist()
        subject_list = selected_emails["ExtractedSubject"].values.tolist()
        printable = set(string.printable)
        # for each email, extract the key phrases and then add it to the graph
        for index in range(0, num):
            s = ("".join(filter(lambda x: x in printable, data_list[index])) + ' . ' +
                 "".join(filter(lambda x: x != '<blank>' and x in printable,
                                subject_list[index])))
            terms = self.final_algorithm(s)
            self.add_to_graph(selected_emails.loc[index], terms)
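# A minimal usage sketch for EmailGraph, listing the imports the class relies on.
# The Neo4j credentials, the CSV path, and how the 'NewLabel'/'Emotions' columns are
# produced are assumptions, not part of the original code; the py2neo calls follow
# the 2.x API referenced in the comments above.
import os
import re
import string
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import wordnet as wn
from nltk.tag import StanfordNERTagger
from py2neo import authenticate, Graph, Node, Relationship

emails_df = pd.read_csv("emails_with_labels.csv")  # assumed file containing the columns listed in get_random_emails
eg = EmailGraph("neo4j", "password")               # assumed local Neo4j credentials
eg.delete()                                        # start from an empty graph
eg.add_new_emails(10, emails_df)                   # extract key phrases and graph 10 random emails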