def send(web):
    """Extract the content of ``web`` and return the NER data computed from it.

    The original author describes the result as a list of noun-phrase
    sentences; the exact shape is whatever ``NER.data`` returns.
    """
    return NER.data(content_extract(web))
def main():
    """Convert ``test.doc`` to text, print its NER chunks, then run the regex extractor."""
    document_text = DocToText("test.doc")
    print(NER.chunk_NER(document_text))
    # The extractor's result is not used here; the call itself is kept.
    regex_info_extractor(document_text)
def q3(query, k=3):
    """
    Returns the top k entities associated with a query.

    Parameters
    ----------
    query: String
        The query to be passed to obtain documents.
    k: int
        The number of top entities to be returned. ``None`` falls back to 3;
        any other value is coerced with ``int``.

    Returns
    -------
    List of strings
        The result of ``NER.common_entities`` for the matching documents, or
        ``None`` when no document matches the query.
    """
    k = 3 if k is None else int(k)

    # NOTE(review): pickle.load can execute arbitrary code from the file; this
    # is only safe while data/search_engine_data.txt is a trusted local artifact.
    with open("data/search_engine_data.txt", "rb") as f:
        s_e = pickle.load(f)

    query_tokens = s_e.filtered_tokenize(query)
    ids = s_e.get_matches_OR(query_tokens)

    # No document matched: this is signalled with None, not with an exception.
    if len(ids) == 0:
        return None

    # `doc_id` rather than `id`, so the builtin is not shadowed.
    tokens = [s_e.unfiltered_tokens[doc_id] for doc_id in ids]
    return NER.common_entities(tokens, k=k)
def gen_examples():
    """
    Build train, dev and test examples from the news data.

    Only articles whose category is 'World', 'Entertainment', 'Sports' or
    'Business' are turned into examples. The examples are then filtered by
    ``NER.ne_extract(..., top=5)`` (top 5 most frequent named entities,
    according to the original description) and split with
    ``train_test_split``.
    """
    categories = ['World', 'Entertainment', 'Sports', 'Business']
    cat_label = {name: index for index, name in enumerate(categories)}
    record_tags = ('title', 'category', 'description')

    examples = []
    fields = []
    for node in get_data():
        if node.tag in record_tags:
            fields.append(node.text)
        if node.tag != 'pubdate':
            continue

        # A 'pubdate' element closes the current record. Only records that
        # collected exactly title, category and description are usable.
        if len(fields) == 3:
            title, category, description = fields
            if category in categories and title is not None and description is not None:
                combined = unescape_chars(title + " " + description)
                examples.append(Example(combined, cat_label[category]))
        fields = []

    new_examples = NER.ne_extract(examples, top=5)
    train, dev, test = train_test_split(new_examples)
    return train, dev, test
def crossValidate(n=10, multi=True, threshold=None, NEtype="location"):
    """Perform n-fold cross validation and write the per-fold results to disk.

    Used to tune the threshold parameters and maximise F1 for the models.

    Parameters
    ----------
    n: int
        Number of folds passed to ``split_data``.
    multi: bool
        Forwarded to ``NER.runTest``.
    threshold: list or None
        Threshold values forwarded to ``NER.evaluate``. ``None`` (the
        default) means ``[0.262, 0.880]``; a fresh list is built per call so
        no list is shared between calls.
    NEtype: str
        Entity type forwarded to ``NER.runTest`` / ``NER.evaluate`` and used
        in the output file name.

    Returns
    -------
    The value of ``sortDict(result)``, which is also dumped as JSON to
    ``'2017plus_' + NEtype + '.result'``.
    """
    if threshold is None:
        threshold = [0.262, 0.880]

    training_2017 = '../data/2017/wnut17train.conll'
    data_split = split_data(training_2017, n)
    result = defaultdict(defaultdict)

    # Load the _exdeflike_ and _indeflike_ data.
    # NOTE(review): pickle.load is only safe on trusted local files.
    with open('exdeflike.txt', 'rb') as infile:
        total_exdef = pickle.load(infile)
    with open('indeflike.txt', 'rb') as f:
        total_indef = pickle.load(f)

    # Put the indefs and exdefs in lists indexed by fold number.
    exdeflikes = [None] * n
    indeflikes = [None] * n
    for fold in total_exdef:
        exdeflikes[int(fold)] = total_exdef[fold]
        indeflikes[int(fold)] = total_indef[fold]

    # Each partition gets one turn as the test data. runTest and evaluate are
    # kept as two separate steps.
    for i, test in enumerate(data_split):
        print("performing " + str(i) + " fold")
        testData = NER.runTest(test, exdeflikes[i], indeflikes[i],
                               multi=multi, NEtype=NEtype)
        result[str(i)] = NER.evaluate(testData, threshold, NEtype=NEtype)

    sort_result = sortDict(result)
    with open('2017plus_' + NEtype + '.result', 'w') as outfile:
        json.dump(sort_result, outfile, sort_keys=True, indent=4)
    return sort_result
def filter_basic(candidates):
    """Select candidates shaped like "<Capitalised word(s)> <digit-led token>".

    A candidate is kept when it starts with one or more capitalised words,
    each followed by a space, and then a token beginning with a digit. The
    kept candidates are passed through ``NER.filter_part_dates`` and that
    result is returned.
    """
    import NER

    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    pattern = re.compile(r"([A-Z]\w+ )+[0-9]\w+")
    matches = [word for word in candidates if pattern.match(word)]
    return NER.filter_part_dates(matches)
def __init__(self):
    """Create the searcher, classifier, tagger and extractor components and empty state."""
    # State that starts empty; nothing in this constructor fills it.
    self.questions = None
    self.predictions = []
    # Collaborators, each built with its default constructor.
    self.para_searcher = paragraphSearch.ParaSearcher()
    self.sent_searcher = paragraphSearch.SentSearcher()
    self.classifier = questionClassifier.QuestionClassfier()
    self.ner_tagger = NER.NERTagger()
    self.extracter = answerExtracter.AnswerExtracter()
    # presumably a cache of already-processed documents -- TODO confirm usage
    self.cached_doc = []
def evaluate_speakers(fileName, speakers, notspeakers, acnames):
    """Score predicted speakers against the tagged (actual) names and print the result.

    Parameters:
        fileName    -- file being evaluated; read once and named in the printed report
        speakers    -- names selected as speakers
        notspeakers -- names that were found but not selected
        acnames     -- names actually tagged as speakers

    Returns the tuple ``(acc, precision, recall, f1)``.
    """
    # The text is not used below; the read is kept so a bad path fails here.
    text = Code.get_text(fileName)

    # Actual names that were / were not selected.
    tp = 0.0 + sum(1 for name in acnames if name in speakers)
    fn = 0.0 + sum(1 for name in acnames if name not in speakers)

    # ADJUSTED: a selected name only counts as a false positive when it is
    # neither an actual name nor part of one.
    fp = 0.0
    for name in speakers:
        if name in acnames:
            continue
        if not any(NER.contains_part_name(name, acname) for acname in acnames):
            fp = fp + 1

    tn = 0.0 + sum(1 for name in notspeakers if name not in acnames)

    total = tp + tn + fp + fn
    acc = (tp + tn) / total if total > 0 else 0
    # If both lists are empty we have 100% accuracy.
    if acnames == speakers and acnames == []:
        acc = 1

    precision = tp / len(speakers) if len(speakers) > 0 else 0
    recall = tp / len(acnames) if len(acnames) > 0 else 0

    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall / (precision + recall))

    print("evaluation of file " + fileName)
    print("accuracy: " + str(acc))
    print("precision: " + str(precision))
    print("recall: " + str(recall))
    print("f1: " + str(f1))
    return (acc, precision, recall, f1)
def loadExtra():
    """Train the supplementary internal model from the lexicon and acronym files.

    Every line of every file matching ``../data/lexicon/*.*`` becomes one
    token sequence tagged "B" for the first token and "I" for the rest.
    Every line of every file matching ``../data/acronyms/*.*.*`` contributes
    its last whitespace-separated token as a single "B" entry. All sequences
    are handed to ``NER.trainExtra``; nothing is returned.
    """
    tagged_sequences = []

    for lexicon_path in glob.glob('../data/lexicon/*.*'):
        with open(lexicon_path, 'r') as handle:
            entries = re.split("\n", handle.read().strip())
        for entry in entries:
            # The first word of an entry must have tag B, the rest tag I.
            tagged_sequences.append(
                [(token, "I" if position else "B")
                 for position, token in enumerate(entry.split())]
            )

    # Now add in the acronym data.
    for acronym_path in glob.glob('../data/acronyms/*.*.*'):
        with open(acronym_path, 'r') as handle:
            entries = re.split("\n", handle.read().strip())
        for entry in entries:
            # Only the acronym itself (last token) is wanted; always one word.
            tagged = [(entry.split()[-1], "B")] if entry else []
            tagged_sequences.append(tagged)

    # Train, then write the data to disk.
    NER.trainExtra(tagged_sequences)
def sharednams():
    """Ad-hoc demo: print the name parts shared by two hard-coded text files."""
    print("abc".upper() == "ABC")
    print(NER.filtermonths(["jan", "Jan", "January", "word"]))

    india_names = NER.print_names_file("/home/james/Desktop/India.txt")
    tintin_names = NER.print_names_file('/home/james/Desktop/Tintin.txt')

    # Break every name into its space-separated parts before comparing.
    india_parts = [part for name in india_names for part in name.split(" ")]
    tintin_parts = [part for name in tintin_names for part in name.split(" ")]

    both = set(india_parts) & set(tintin_parts)
    print("Common names:")
    for shared in both:
        print(shared)
def has_building_words(candiates):
    """Return the candidates that contain at least one building word.

    A candidate is kept (once, in input order) when any building word, or
    its variant produced by ``NER.name_capitalise``, occurs in it as a
    substring.
    """
    import NER

    # Both "center"/"centre" and "theater"/"theatre" are listed; the original
    # author notes the "centre" spelling appears in the data (email 284).
    building_words = [
        "room", "building", "hall", "auditorium", "wing", "floor", "center",
        "centre", "school", "theater", "theatre", "library", "university",
        "tower", "college", "institute", "avenue"
    ]
    # Add capitalised versions as well.
    building_words.extend(NER.name_capitalise(building_words))

    return [
        line for line in candiates
        if any(word in line for word in building_words)
    ]
def q2(ent, k=3):
    """
    Returns the top k entities associated with ent.

    Parameters
    ----------
    ent: String
        The entity to be passed as a query; it is lower-cased before lookup.
    k: int
        The number of top entities to be returned. ``None`` means 3; any
        other value is coerced with ``int``.

    Returns
    -------
    List of strings
    """
    limit = 3 if k is None else int(k)
    return NER.top_related(ent.lower(), k=limit)
def get_all_locations(text):
    """Collect location candidates from ``text`` and return them as a set of strings.

    Three sources feed the result:
      * "<Capitalised words> <digit-led token>" matches, passed through
        ``NER.filter_part_dates``;
      * multi-word candidates (second regex below) that contain a building
        word, according to ``has_building_words``;
      * the same candidates filtered by ``filter_basic``.
    """
    firstcandidates = []
    candidates = []
    final = []
    basics = []
    # Runs of capitalised words / ordinals / room words / numbers, optionally
    # continuing over following lines. Only group 0 of each match is used below.
    firstlist = re.findall(
        r'(((([A-Z]\w+)|1st|2nd|3rd|4th|5th|6th|(r|R)o(o|m)m|[0-9]+[A-Z]*)((,? (((r|R)o(o|m)m|in) )?)(([A-Z]\w+|1st|2nd|3rd|4th|5th|6th|hall|Hall|[0-9]+[A-Z]*)))+)(\n\s?((([A-Z]\w+)|1st|2nd|3rd|4th|5th|6th|(r|R)o(o|m)m|[0-9]+[A-Z]*)((,? (((r|R)o(o|m)m|in) )?)(([A-Z]\w+|1st|2nd|3rd|4th|5th|6th|hall|Hall|[0-9]+[A-Z]*)))*))*)',
        text, re.MULTILINE)
    bas = (re.findall("(([A-Z]\w+ )+[0-9]\w+)", text))
    # NOTE(review): nesting reconstructed from flattened source -- confirm that
    # the filter_part_dates call belongs inside this guard.
    if not bas == []:
        for match in bas:
            basics.append(match[0].strip())
        final.extend(NER.filter_part_dates(basics))
    for match in firstlist:
        firstcandidates.append(match[0].strip())
    # Strip a trailing one-word line off a candidate.
    for candidate in firstcandidates:
        # findall returns [''] when there is exactly one match and its optional
        # trailing-whitespace group matched nothing.
        if (re.findall(r'\n\s?\w+(\s|\n)?$', candidate) == ['']):
            # NOTE(review): this substitution has no `$` anchor, so it removes
            # every "newline + word" occurrence, not only the last -- confirm intent.
            candidates.append(re.sub(r'\n\s?\w+(\s|\n)?', "", candidate))
        else:
            candidates.append(candidate)
    for match in has_building_words(candidates):
        final.append(match.strip())
    for match in filter_basic(candidates):
        final.append(match.strip())
    #final.extend(place_field(text))
    return set(final)
def process_block(self, text, terms, block_type, term_doc_count_dict, total_length, silver_query):
    """Build one training sample per term of a text block and register it.

    For every term the word features come from ``self.process_word``. When
    ``self.useContext`` is set, the features of the previous and next term
    are concatenated around them; at the block edges a zero vector of length
    ``size`` is used instead. The label is 1 when the term occurs in the
    tokenised ``silver_query`` and 0 otherwise. Samples are stored through
    ``self.add_sample``.

    Fix: the "next" features were never reset, so the last term of a
    multi-term block received stale features (its own, computed one
    iteration earlier) instead of zero padding.
    """
    pos_tags = pos.get_pos_tags(terms)
    entity_words = set()
    if self.use_ner:
        entity_words = ner.get_entities(text)

    size = 10  # presumably the length of a process_word feature vector -- TODO confirm
    last_index = len(terms) - 1

    for i, (term, pos_tag) in enumerate(zip(terms, pos_tags)):
        features = self.process_word(i, term, block_type, pos_tag, entity_words,
                                     term_doc_count_dict, total_length)

        # Neighbour features, zero-padded at both ends of the block.
        if i > 0:
            prev_features = self.process_word(i - 1, terms[i - 1], block_type,
                                              pos_tags[i - 1], entity_words,
                                              term_doc_count_dict, total_length)
        else:
            prev_features = [0] * size
        if i < last_index:
            next_features = self.process_word(i + 1, terms[i + 1], block_type,
                                              pos_tags[i + 1], entity_words,
                                              term_doc_count_dict, total_length)
        else:
            next_features = [0] * size

        if self.useContext:
            features = prev_features + features + next_features

        is_in_doc = int(
            term in anserini.tokenizeString(silver_query, 'lucene'))
        self.add_sample(is_in_doc, features)
def clear_database():
    """Delegate to ``NER.clear_database()``; nothing is returned."""
    NER.clear_database()
print 'NER List:', NER_list""" master_person_list = [] master_org_list = [] master_loc_list = [] master_month_list = [] master_time_list = [] master_money_list = [] master_percent_list = [] master_prof_list = [] for i in range(0, len(sentences_list)): sentences_list[i] = sentences_list[i].replace(",", "").replace(".", "").replace("!", "") # sentences_list[i]=sentences_list[i].replace("'",'"') sent_person_list, sent_org_list, sent_loc_list, sent_month_list, sent_time_list, sent_money_list, sent_percent_list, sent_prof_list = NER.named_entity_recognition( sentences_list[i] ) master_person_list.append(sent_person_list) master_org_list.append(sent_org_list) master_loc_list.append(sent_loc_list) master_month_list.append(sent_month_list) # month and weekday names + season names master_time_list.append(sent_time_list) master_money_list.append(sent_money_list) master_percent_list.append(sent_percent_list) master_prof_list.append(sent_prof_list) ################### CATEGORIZING THE QUESTION AS WH0, WHAT, WHEN , WHY , WHERE OR HOW ######################## who_list, what_list, when_list, why_list, where_list, how_list = [], [], [], [], [], [] for i in range(0, len(cleansedqList)):
def addNewlines(fin, fout):
    """Copy ``fin`` to ``fout``, adding a blank line after punctuation lines.

    Every line that starts with ".", "?" or "!" gets an extra newline
    appended, so a blank line follows it in the output.

    Fix: the pattern is now applied per line (``re.MULTILINE``). Previously
    ``^`` anchored only at the very start of the text, so at most the first
    line could ever match. The ``|`` characters inside the character class
    were literal pipes, not alternation, and have been removed.

    Parameters:
        fin  -- input path (made absolute), read with ``NER.getText``
        fout -- output path (made absolute), written with ``NER.writeText``
    """
    fin = os.path.abspath(fin)
    fout = os.path.abspath(fout)
    text = NER.getText(fin)
    txt = re.sub(r"(^[.?!].*)", '\\1\n', text, flags=re.MULTILINE)
    NER.writeText(fout, txt)
def get_data():
    """Load the database via ``NER.retrieve_database``, cache it on ``NER.db`` and return it."""
    database = NER.retrieve_database()
    NER.db = database
    return database
def answering_who(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,sent_person_list,sent_prof_list):
    """Answer a "who" question: score every sentence, pick the best, extract an answer.

    Parameters:
        cleansedQuestion          -- question text; used for word matching and, with
                                     '?' removed, for POS tagging and NER
        stop_words_free_question  -- only referenced inside a disabled block below
        complete_sentence_list    -- the sentences to choose from
        sentence_list             -- per-sentence text handed to WM.stemWordMatch
                                     (stop-word-free, per the inline comment)
        sent_person_list          -- per-sentence list of person names
        sent_prof_list            -- per-sentence list of profession names

    Returns a string: names/professions from the chosen sentence, else its noun
    phrases, else the sentence itself (or the sentence minus question words).

    NOTE(review): block nesting was reconstructed from flattened source -- confirm
    against the original file, in particular rules 2-4 and the `else` of rule 6.
    NOTE(review): max() raises ValueError when complete_sentence_list is empty.
    """
    # Declaring globals to be used in this function
    sent_score_list=[]
    q_verblist=[]
    stanford_stop_words_list=['a','an','and','are','as','at','be','buy','do','for','from',
                              'has','have','he','in','is','it','its','of','on','that','the',
                              'to','was','were','will','with']
    temp_q=cleansedQuestion
    #temp_q=temp_q.replace('"','')
    #temp_q=temp_q.replace("'",'"')
    temp_q=temp_q.replace('?','')
    lmtzr=WordNetLemmatizer()
    pos_list= POS_Tagging.pos_tagging(temp_q)
    # Collect the lemmatised verbs of the question, ignoring stop words.
    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(pos_list[i][0],'v') not in stanford_stop_words_list:
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0],'v'))
    q_person_list,q_org_list,q_loc_list,q_month_list,q_time_list,q_money_list,q_percent_list,q_prof_list = NER.named_entity_recognition(temp_q)
    for i in range(0, len(complete_sentence_list)):
        score=0
        # 1. Score using word match rule. Match words in question with the words in stop free sentence
        score=score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])
        # 2. If question does not contain name but the answer contains NAME then you are confident(+6)
        if q_person_list==[]:
            #Giving more weights to sentences having more names in it
            if sent_person_list[i] !=[] or sent_prof_list[i] !=[]:
                #score=score + 6*len(sent_person_list) + 6* len(sent_prof_list)
                score=score + 6
            # 3. If question does not contain a name and answer contains the word "name" then good_clue (+4)
            lmtzr = WordNetLemmatizer()
            temp= complete_sentence_list[i].split()
            for k in range(0,len(temp)):
                if lmtzr.lemmatize(temp[k].lower())=='name':
                    score=score + 4
        # 4. Awards points to all sentences that contain a name or reference to a human
        if sent_person_list[i] !=[] or sent_prof_list[i] !=[]:
            #score=score + 4*len(sent_person_list) + 4* len(sent_prof_list)
            score=score+4
        # 5. If the answer contains the exact verb found in the question after the "Who" or in fact in the whole question
        # then it is a confident clue and we reward it more
        sent_pos_list= POS_Tagging.pos_tagging(complete_sentence_list[i])
        # Disabled alternative for rule 5, kept as a no-op string literal.
        '''for m in range(0, len(sent_pos_list)): if sent_pos_list[m][1] in ['VB','VBD','VBN','VBG','VBZ'] and sent_pos_list[m][0] in stop_words_free_question.split(): score=score + 18 #print 'Score now is :', score'''
        for k in range(0, len(sent_pos_list)):
            if sent_pos_list[k][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(sent_pos_list[k][0],'v') in q_verblist:
                # Verb in question and sentence matches
                score=score + 6
        # 6. If the question contains a profession name, the answer has to be a person and sentence would have
        #the person name and the profession
        if q_prof_list!=[]:
            for k in complete_sentence_list[i].split():
                if k.lower() in q_prof_list:
                    score=score+18
        else:
            #Question contains name so the chances of answer being a profession name are decent
            if sent_prof_list[i] !=[]:
                score=score+6
        sent_score_list.append(score)
    # Selecting the sentence that has the maximum score. If it is a tie, we choose the sentence that appears first
    candidate_list=[]
    npfinal_list=[]
    temp_list=[]   # never used below
    answer_list=[]
    max_score_value=max(sent_score_list)
    for i in range(0, len(complete_sentence_list)):
        if sent_score_list[i]==max_score_value:
            candidate_list.append((complete_sentence_list[i],i))
    #If there is only one sentence, then choose the sentence and then do the processing to display the answer
    if len(candidate_list)==1:
        temp_str= candidate_list[0][0]
        index=candidate_list[0][1]
        #Cleaning up the candidate sentence
        # Replacing double quotes with blank and single quotes with "
        #temp_str=temp_str.replace('"','')
        #temp_str=temp_str.replace("'",'"')
        #temp_str=temp_str.replace(',','').replace('?','').replace('!','')
    # If there are multiple candidates, then choose the sentence which appeared first in the story and then do the processing
    else:
        # There are more than one candidate sentences. The loop breaks after
        # its first pass, so only the first candidate is taken.
        for k in range(0, len(candidate_list)):
            #Cleaning up the candidate sentence
            temp_str=candidate_list[k][0]
            index =candidate_list[k][1]
            #temp_str=temp_str.replace('"','')
            #temp_str=temp_str.replace("'",'"')
            #temp_str=temp_str.replace(',','').replace('?','').replace('!','')
            break
    ####################### SENTENCE PROCESSING TO FIND THE ANSWER ###############################
    #Just pick out the noun-phrase or PERSON names from the sentence
    #s_plist,s_orglist,s_loclist,s_monthlist,s_timelist,s_moneylist,s_percentlist,s_proflist=NER.named_entity_recognition(temp_str)
    s_plist=sent_person_list[index]
    s_proflist=sent_prof_list[index]
    #If the question has a name of person, then the answer sentence should/would most probably
    #the name of a person but it should not be the name of the person appearing in the question.
    #If we can't find any other name in the candidate sentence then we do POS tagging and display the NOUN phrases
    result_list=[]
    q_loc_who_list=[]
    # NOTE(review): `x not in temp_q` below is a substring test if temp_q is a
    # str (it is used with .replace above) -- confirm this is intended.
    if q_person_list==[] and s_plist==[]:
        #If both the question does not have a name and the sentence does not have a name,print the whole sentence minus words which appear in question
        '''pos_np_list= POS_Tagging.pos_noun_tagging(temp_str) if pos_np_list != []: for x in pos_np_list: if x not in temp_q and x[0].isupper(): #Noun phrases or names generally start with an upper case character print 'First character caps',x result_list.append(x) return ' '.join(result_list)'''
        for k in temp_str.split():
            if k not in temp_q:
                result_list.append(k)
        return ' '.join(result_list)
    elif q_person_list !=[] and s_plist !=[]:
        #To counter situations when both question and sentence has names Ex. Who defeated who ?
        for k in s_plist:
            if k not in temp_q:
                answer_list.append(k)
    elif q_person_list==[] and s_plist !=[]:
        for i in range(0, len(s_plist)):
            if s_plist[i] not in q_person_list and s_plist[i] not in temp_q:
                #To counter situations where question has a name and NER doesn't identify it
                answer_list.append(s_plist[i])
    elif q_person_list != [] and s_proflist !=[]:
        #To counter situations for 'Who is X' type questions which could have a profession name in the answer
        for k in s_proflist:
            answer_list.append(k)
    elif q_person_list==[] and q_loc_list !=[]:
        # Who is <X> where ?
        # NOTE(review): unreachable -- every case with q_person_list==[] is
        # already taken by the first or third branch above.
        for k in temp_str.split():
            if k not in temp_q:
                q_loc_who_list.append(k)
        if q_loc_who_list !=[]:
            return ' '.join(q_loc_who_list)
    '''elif q_person_list==[] and s_proflist !=[]: for k in s_proflist: answer_list.append(k)'''
    if answer_list != [] :#and flag==1:
        #Indicating candidate sentence has a name other than that in question
        result= ' '.join(answer_list)
    else:
        #Pick out the noun phrase or nouns and then display them as answer
        np_list = POS_Tagging.pos_noun_tagging(temp_str)
        for x in np_list :
            if x not in temp_q:
                npfinal_list.append(x)
        #Removing all occurences of existing noun phrases from the question
        if npfinal_list !=[]:
            result=' '.join(npfinal_list)
        else:
            result=temp_str
            # Fall back to the whole sentence
    return result
master_money_list = [] master_percent_list = [] master_prof_list = [] #print 'Sentence list is:',sentences_list for i in range(0, len(sentences_list)): temp_str = sentences_list[i] '''sentences_list[i]=sentences_list[i].strip() if sentences_list[i].index(',') != -1: if sentences_list[i][sentences_list[i].index(',')+1]!=' ': sentences_list[i]=sentences_list[i].replace(',','').replace('!','')''' temp_str = temp_str.strip() temp_str = temp_str.replace(',', '').replace('!', '') #sentences_list[i]=sentences_list[i].replace("'",'"') sent_person_list, sent_org_list, sent_loc_list, sent_month_list, sent_time_list, sent_money_list, sent_percent_list, sent_prof_list = NER.named_entity_recognition( temp_str) master_person_list.append(sent_person_list) master_org_list.append(sent_org_list) master_loc_list.append(sent_loc_list) master_month_list.append( sent_month_list) #month and weekday names + season names master_time_list.append(sent_time_list) master_money_list.append(sent_money_list) master_percent_list.append(sent_percent_list) master_prof_list.append(sent_prof_list) ################### CATEGORIZING THE QUESTION AS WH0, WHAT, WHEN , WHY , WHERE OR HOW ######################## who_list,what_list,when_list,why_list,where_list,how_list=[],[],[],[],[],[] for i in range(0, len(cleansedqList)):
#! /usr/bin/env python
"""Command-line wrapper: run ``NER.bwtoutf8`` on an input and an output path."""
import NER
import sys
import os

if __name__ == "__main__":
    # Fail with a usage line instead of an IndexError traceback.
    if len(sys.argv) < 3:
        sys.exit("usage: " + os.path.basename(sys.argv[0]) +
                 " <input-file> <output-file>")
    fin = os.path.abspath(sys.argv[1])
    fout = os.path.abspath(sys.argv[2])
    NER.bwtoutf8(fin, fout)
def part_one(fileList, myList):
    """Tag every email in ``fileList``, score each tagger and print averaged scores.

    For each file the paragraph/sentence, time, speaker and location taggers
    are run and the tagged text is written to the matching path in
    ``myList``. Each tagger is compared with the hand-tagged file taken from
    the module-level ``testList``. The module-level ``stopping`` flag makes
    the loop wait for Enter after every file.

    Parameters:
        fileList -- paths of the untagged emails
        myList   -- output paths, parallel to ``fileList``

    Returns nothing; all results are printed or written to disk.

    Fix: the NER totals were accumulated from ``acc/prec/rec/f``, which at
    that point still held the end-time scores, instead of the values
    returned by ``Eval.evaluate_speakers``.

    NOTE(review): nesting reconstructed from flattened source -- confirm.
    """
    # Running totals per tagger: accuracy, precision, recall, f1.
    totPacc, totPprec, totPrec, totPf = 0, 0, 0, 0  # paragraph scores
    totSacc, totSprec, totSrec, totSf = 0, 0, 0, 0  # sentence scores
    totTacc, totTprec, totTrec, totTf = 0, 0, 0, 0  # time scores (start + end)
    totNacc, totNprec, totNrec, totNf = 0, 0, 0, 0  # name scores
    totLacc, totLprec, totLrec, totLf = 0, 0, 0, 0  # location scores
    docs = []
    classes = set()

    # Because NER requires training a tagger it is best done as a batch
    # process rather than repeatedly calling a function, therefore it is
    # done here and the results processed later.
    if (len(fileList) > 10):
        print("running NER on " + str(len(fileList)) + " files (might take a bit)")
    else:
        print("running NER")
    namesdict = NER.extract_names_files(fileList)

    # Histogram: number of collapsed names -> number of files. The unused
    # file reads the original did in this loop have been dropped.
    numdict = {}
    for i in range(0, len(fileList)):
        numnames = len(NER.collapse_names(namesdict[fileList[i]]))
        numdict[numnames] = numdict.get(numnames, 0) + 1
    print(numdict)

    for i in range(0, len(fileList)):
        untaggedfile = fileList[i]
        taggedfile = testList[i]
        text = get_text(untaggedfile)
        taggedtext = get_text(taggedfile)
        body = (text.split('Abstract:')[1])
        fileName = untaggedfile

        # Prepare for training by adding this email's class to the set.
        # triple: 0 = fname, 1 = class, 2 = email
        triple = create_train_triple(taggedfile)
        docs.append(triple)
        print("class: " + str(triple[1]) + "\n")
        classes.add(str(triple[1]))

        print(("\n\n information extracted from " + fileName))
        print("Topic:")
        print(Ontology.get_topic(text))

        # Sentence and paragraph tagging of the text after "Abstract:".
        print("tagged sentences + paragraphs:")
        paratagged = Tagger.output_tagged_para(body)
        senttagged = Tagger.output_tagged_sents(paratagged)
        mytext = text.split("Abstract:")[0] + "Abstract:" + senttagged

        acparas = Tagger.extract_paragraphs(taggedtext)
        myparas = Tagger.extract_paragraphs(mytext)
        acsents = Tagger.extract_sentences(taggedtext)
        mysents = Tagger.extract_sentences(mytext)

        # Calculate scores and tally totals.
        (acc, prec, rec, f) = Eval.evaluate_generic(fileName, myparas, acparas)
        totPacc, totPprec, totPrec, totPf = (totPacc + acc, totPprec + prec,
                                             totPrec + rec, totPf + f)
        (acc, prec, rec, f) = Eval.evaluate_generic(fileName, mysents, acsents)
        totSacc, totSprec, totSrec, totSf = (totSacc + acc, totSprec + prec,
                                             totSrec + rec, totSf + f)

        # Time tagging.
        print("Times found:")
        (stimes, etimes) = Tagger.output_tagged_time(mytext)
        mytext = Tagger.find_and_tag(set(stimes), "stime", mytext)
        mytext = Tagger.find_and_tag(set(etimes), "etime", mytext)
        acstimes = Tagger.extract_stimes(taggedtext)
        acetimes = Tagger.extract_etimes(taggedtext)

        # Start and end times both feed the time totals, hence the
        # 2 * len(fileList) divisor in the summary.
        (acc, prec, rec, f) = Eval.evaluate_generic(fileName, stimes, acstimes)
        totTacc, totTprec, totTrec, totTf = (totTacc + acc, totTprec + prec,
                                             totTrec + rec, totTf + f)
        (acc, prec, rec, f) = Eval.evaluate_generic(fileName, etimes, acetimes)
        totTacc, totTprec, totTrec, totTf = (totTacc + acc, totTprec + prec,
                                             totTrec + rec, totTf + f)

        # Speaker tagging.
        names = namesdict[fileName]
        print("people")
        print(names)
        print("ACSPEAKERS: ")
        acspeakers = NER.extract_tagged_names(taggedtext)
        print(acspeakers)

        speakers = []
        notspeakers = []
        if names != []:
            collapsednames = NER.collapse_names(names)
            print(collapsednames)
            collapsednames = NER.filter_sender(collapsednames, text)
            if collapsednames != {}:
                speakerdict = NER.pick_speakers(collapsednames, text)
                for num in speakerdict:
                    speakers.extend(collapsednames[num])
            print("SPEAKERS:")
            print(speakers)
            # Names found but not chosen as speakers, for the evaluation.
            for name in names:
                if not name in speakers:
                    notspeakers.append(name)
        (accuracy, precision, recall, f1) = Eval.evaluate_speakers(
            taggedfile, speakers, notspeakers, acspeakers)

        mytext = Tagger.find_and_tag(speakers, "speaker", mytext)
        # Tally the speaker scores just computed, not the stale time scores.
        totNacc, totNprec, totNrec, totNf = (totNacc + accuracy, totNprec + precision,
                                             totNrec + recall, totNf + f1)

        # Location tagging.
        locations = Locations.get_all_locations(text)
        selectedlocs = Locations.pick_locations(locations, text)
        print("Selected locations:")
        for loc in selectedlocs:
            print(" " + loc)
        mytext = Tagger.find_and_tag(selectedlocs, "location", mytext)
        print("ACLOCS:")
        aclocs = Locations.extract_tagged_locations(taggedtext)
        for acloc in aclocs:
            print(" " + acloc)

        notlocs = []
        for loc in locations:
            if not loc in selectedlocs:
                notlocs.append(loc)
        (accuracy, precision, recall, f1) = Eval.evaluate_locations(
            taggedfile, selectedlocs, notlocs, aclocs)
        # A precision above 1 should not happen; pause so it can be inspected.
        if precision > 1:
            raw_input()
        totLacc = totLacc + accuracy
        totLprec = totLprec + precision
        totLrec = totLrec + recall
        totLf = totLf + f1

        print("Topic:")
        mytext = Tagger.add_ontology_tag(mytext)
        print("final text:")
        print(mytext)
        print("writing to " + myList[i])
        # `with` closes the file even if the write fails.
        with open(myList[i], "w") as outfile:
            outfile.write(mytext)

        if stopping:
            if (i != (len(fileList) - 1)):
                print("press enter for next email:")
            else:
                print("press enter to finish")
            raw_input()

    print("END:")
    numfiles = len(fileList)
    _print_scores("Paragraph scores:", totPacc, totPprec, totPrec, totPf, numfiles)
    _print_scores("Sentence scores:", totSacc, totSprec, totSrec, totSf, numfiles)
    _print_scores("Time scores:", totTacc, totTprec, totTrec, totTf, 2 * numfiles)
    _print_scores("NER scores:", totNacc, totNprec, totNrec, totNf, numfiles)
    _print_scores("Location scores:", totLacc, totLprec, totLrec, totLf, numfiles)
    print("=========================================")
    # Six evaluations per file went into the totals (time counts twice).
    _print_scores(" Overall:",
                  totLacc + totPacc + totSacc + totTacc + totNacc,
                  totLprec + totPprec + totSprec + totTprec + totNprec,
                  totLrec + totPrec + totSrec + totTrec + totNrec,
                  totLf + totPf + totSf + totTf + totNf,
                  6 * numfiles)


def _print_scores(title, acc, prec, rec, f1, denom):
    """Print one score section: ``title`` followed by each total divided by ``denom``."""
    print(title)
    print("accuracy: " + str(acc / denom))
    print("precision: " + str(prec / denom))
    print("recall: " + str(rec / denom))
    print("f1: " + str(f1 / denom) + "\n")
master_person_list=[] master_org_list=[] master_loc_list=[] master_month_list=[] master_time_list=[] master_money_list=[] master_percent_list=[] master_prof_list=[] for i in range(0, len(sentences_list)): sentences_list[i]=sentences_list[i].replace(',','').replace('.','').replace('!','') #sentences_list[i]=sentences_list[i].replace("'",'"') sent_person_list,sent_org_list,sent_loc_list,sent_month_list,sent_time_list,sent_money_list,sent_percent_list,sent_prof_list=NER.named_entity_recognition(sentences_list[i]) master_person_list.append(sent_person_list) master_org_list.append(sent_org_list) master_loc_list.append(sent_loc_list) master_month_list.append(sent_month_list) #month and weekday names + season names master_time_list.append(sent_time_list) master_money_list.append(sent_money_list) master_percent_list.append(sent_percent_list) master_prof_list.append(sent_prof_list) ################### CATEGORIZING THE QUESTION AS WH0, WHAT, WHEN , WHY , WHERE OR HOW ######################## who_list,what_list,when_list,why_list,where_list,how_list=[],[],[],[],[],[]
#print ("Files to be processed are: ") #print (fileList) print ("----------------------------------------") docs =[] classes = set() #because NER requires training a tagger it is best done as a #batch process rather than repeatedly calling a function, #therefore it is done here and the results processed later if (len(fileList) > 10): print "running NER on " + str(len(fileList)) + " files (might take a bit)" else: print "running NER" namesdict = NER.extract_names_files(fileList) numdict = {} for i in range (0, len(fileList)): untaggedfile = fileList[i] taggedfile = fileList[i].replace("un", "") text = get_text(untaggedfile) taggedtext = get_text(taggedfile) acnames = NER.collapse_names(NER.extract_tagged_names(taggedtext)) nameset = NER.collapse_names(namesdict[fileList[i]]) numnames = len(nameset) if numnames in numdict:
__author__ = 'Anirudh'
import NER

# Smoke run: feed a few sample sentences (months, a money amount) to the
# recogniser. The return values are discarded.
sentence_list = [
    'This is January',
    'She flew in December',
    'In March he got a $50 ticket -- and decided to take it to court',
]
for sentence in sentence_list:
    NER.named_entity_recognition(sentence)
def fixCols(fin, fout):
    """Copy ``fin`` to ``fout``, prefixing lines that start with "O".

    Every newline that is directly followed by "O" is rewritten so that the
    line begins with ".", a tab, and then the "O". The first line of the
    file is not preceded by a newline and is therefore left unchanged.

    Parameters:
        fin  -- input path (made absolute), read with ``NER.getText``
        fout -- output path (made absolute), written with ``NER.writeText``
    """
    source_path = os.path.abspath(fin)
    target_path = os.path.abspath(fout)
    fixed = re.sub(r"(\nO)", '\n.\tO', NER.getText(source_path))
    NER.writeText(target_path, fixed)
#! /usr/bin/env python
"""Command-line wrapper: run ``NER.utf8tobw`` on an input and an output path."""
import NER
import sys
import os

if __name__ == "__main__":
    # Fail with a usage line instead of an IndexError traceback.
    if len(sys.argv) < 3:
        sys.exit("usage: " + os.path.basename(sys.argv[0]) +
                 " <input-file> <output-file>")
    fin = os.path.abspath(sys.argv[1])
    fout = os.path.abspath(sys.argv[2])
    NER.utf8tobw(fin, fout)
#print ("Files to be processed are: ") #print (fileList) print ("----------------------------------------") docs =[] classes = set() #because NER requires training a tagger it is best done as a #batch process rather than repeatedly calling a function, #therefore it is done here and the results processed later if (len(fileList) > 10): print "running NER on " + str(len(fileList)) + " files (might take a bit)" else: print "running NER" namesdict = NER.extract_names_files(fileList) numdict = {} for i in range (0, len(fileList)): untaggedfile = fileList[i] text = get_text(untaggedfile) nameset = NER.collapse_names(namesdict[fileList[i]]) numnames = len(nameset) if numnames in numdict: numdict[numnames] = numdict[numnames] + 1
master_percent_list=[] master_prof_list=[] #print 'Sentence list is:',sentences_list for i in range(0, len(sentences_list)): temp_str=sentences_list[i] '''sentences_list[i]=sentences_list[i].strip() if sentences_list[i].index(',') != -1: if sentences_list[i][sentences_list[i].index(',')+1]!=' ': sentences_list[i]=sentences_list[i].replace(',','').replace('!','')''' temp_str=temp_str.strip() temp_str=temp_str.replace(',','').replace('!','') #sentences_list[i]=sentences_list[i].replace("'",'"') sent_person_list,sent_org_list,sent_loc_list,sent_month_list,sent_time_list,sent_money_list,sent_percent_list,sent_prof_list=NER.named_entity_recognition(temp_str) master_person_list.append(sent_person_list) master_org_list.append(sent_org_list) master_loc_list.append(sent_loc_list) master_month_list.append(sent_month_list) #month and weekday names + season names master_time_list.append(sent_time_list) master_money_list.append(sent_money_list) master_percent_list.append(sent_percent_list) master_prof_list.append(sent_prof_list) ################### CATEGORIZING THE QUESTION AS WH0, WHAT, WHEN , WHY , WHERE OR HOW ######################## who_list,what_list,when_list,why_list,where_list,how_list=[],[],[],[],[],[]
def collect_articles():
    """
    Collects articles and writes them to the database.

    The articles come from ``news_loader.for_ner()`` (Reuters, according to
    the original author), are passed to ``NER.update`` and then persisted
    with ``NER.write_database()``. Nothing is returned.
    """
    NER.update(news_loader.for_ner())
    NER.write_database()
def _read_conll_records(path):
    """Return the blank-line-separated records of the file at *path*."""
    with open(path, 'r') as handle:
        return re.split("\n[\t]?\n", handle.read().strip())


def _split_token_tags(record):
    """Split one record into parallel (tokens, tags) tuples.

    Only lines with exactly two tab-separated columns are kept. A record
    without any such line makes the unpacking raise ValueError.
    """
    data = [
        re.split('\t', d) for d in re.split("\n", record)
        if len(re.split("\t", d)) == 2
    ]
    tokens, tags = zip(*data)
    return tokens, tags


def _pos_tag_records(records):
    """Rewrite records as ``POS<TAB>tag`` lines using the module-level tagger."""
    pos_records = []
    for record in records:
        if record:  # avoid empty strings
            tokens, tags = _split_token_tags(record)
            POSs = [t[1] for t in tagger.tag(tokens)]
            pos_records.append("\n".join(
                "\t".join([POS, tag]) for POS, tag in zip(POSs, tags)))
    return pos_records


def trainFinal(multi=True,
               plus2016=False,
               pluslexica=False,
               pluswiki=False,
               plusgmb=False,
               plusdev=False,
               plusconll2003=False,
               doInternal=True,
               doExternal=True,
               lowercase=False,
               externalPOS=False,
               outhandle=""):
    """Train on all of the selected training data and pickle the models.

    The 2017 training file is always loaded; every ``plus*`` flag appends one
    more corpus to the training records.

    Parameters
    ----------
    multi : passed through to ``NER.trainExternal``.
    plus2016, pluswiki, plusgmb, plusdev, plusconll2003 : bool
        Add the corresponding corpus to the training records.
    pluslexica : bool
        Add lexicon entries and per-token counts as extra, weighted
        single-token records for the internal model.
    doInternal, doExternal : bool
        Whether to train (and write) the internal / external model.
    lowercase : bool
        Lowercase tokens for the lexical counts and the internal model.
    externalPOS : bool
        Train the external model on POS-tag records instead of token records.
    outhandle : str
        Optional extra label inserted into both output file names.

    Writes ``finalexdef_<labels>_entitytype.pickle`` and/or
    ``finalindef_<labels>_entitytype.pickle`` to the working directory.
    Returns None.
    """
    POS_train = []

    # load the 2017 data
    train = _read_conll_records('../data/2017/wnut17train.conll')
    if externalPOS:
        POS_train.extend(_pos_tag_records(train))

    # Optional corpora: (flag, token file, pre-computed POS file).
    # FIX: the GMB branch used to extend `train` with the wiki records
    # (NameError without pluswiki, duplicated wiki data with it); every corpus
    # now contributes its own records.
    extra_corpora = [
        (plusconll2003, "conll2003_toks.conll", "conll2003_POSs.conll"),
        (plusdev, "../data/emerging.dev.conll", "dev_POSs.conll"),
        (pluswiki, "wiki_toks_amazing.conll", "wiki_POSs.conll"),
        (plusgmb, "gmb_toks.conll", "gmb_POSs.conll"),
    ]
    for enabled, tok_path, pos_path in extra_corpora:
        if enabled:
            train.extend(_read_conll_records(tok_path))
            if externalPOS:
                POS_train.extend(_read_conll_records(pos_path))

    # load the 2016 data (train + dev + test); its POS records are tagged here
    if plus2016:
        train2 = []
        for path in ('../data/2016/data/train', '../data/2016/data/dev',
                     '../data/2016/data/test'):
            train2.extend(_read_conll_records(path))
        train.extend(train2)
        if externalPOS:
            POS_train.extend(_pos_tag_records(train2))

    # load and weight the lexical data
    if pluslexica:
        weights = []
        cts = Counter()  # counts of tokens tagged "O"
        NEcts = defaultdict(Counter)  # tag -> token counts for entity tags
        numNEs = Counter()  # number of entities started, per "B-..." tag
        for record in train:
            if record:  # avoid empty strings
                tokens, tags = _split_token_tags(record)
                if lowercase:
                    tokens = [token.lower() for token in tokens]
                for i, token in enumerate(tokens):
                    if tags[i] == "O":
                        cts[token] += 1
                    else:
                        NEcts[tags[i]][token] += 1
                        if tags[i][0] == "B":
                            numNEs[tags[i]] += 1
                weights.append([1] * len(tokens))

        lexica = {
            ## "architecture.museum": "location",
            ## "automotive.make": "corporation",
            "automotive.model": "product",
            ## "broadcast.tv_channel": "corporation",
            ## "business.consumer_company": "corporation",
            "business.consumer_product": "product",
            ## "cvg.computer_videogame": "creative-work",
            ## "cvg.cvg_developer": "corporation",
            "cvg.cvg_platform": "product",
            "firstname.5k": "person",  # note to Capcase this lexicon
            "lastname.5000": "person",  # note to Capcase with B- and I-tags
            ## "location": "location",
            "location.country": "location",
            "people.family_name": "person",  # note to Capcase with B- and I-tags
            "people.person.filtered": "person",
            ## "sports.sports_team": "group",
            ## "tv.tv_network": "corporation",
            ## "tv.tv_program": "creative-work",
        }
        capitalized_lexica = ("firstname.5k", "lastname.5000",
                              "people.family_name")
        surname_lexica = ("lastname.5000", "people.family_name")

        lexNEs = Counter()  # number of lexicon entries per "B-..." tag
        lexrec = []
        lexweights = []
        lexterms = set()
        for lexicon in lexica:
            NEtype = lexica[lexicon]
            with open("/data/WNUT-NER-2017/data/lexicon/" + lexicon) as f:
                for line in f:
                    line = line.strip()
                    if lowercase:
                        line = line.lower()
                    tokens = [
                        s for s in re.split(r"([ \.\,])", line)
                        if s != " " and s
                    ]
                    if not tokens:
                        continue
                    lexNEs["B-" + NEtype] += 1
                    if not lowercase and lexicon in capitalized_lexica:
                        tokens = [s.capitalize() for s in tokens]
                    tags = ["I-" + NEtype for t in tokens]
                    tags[0] = "B-" + NEtype
                    weight = 1
                    for token, tag in zip(tokens, tags):
                        lexterms.add((token, tag, weight))
                    if lexicon in surname_lexica:
                        # a surname may also continue an entity
                        lexterms.add((tokens[0], "I-" + NEtype, weight))

        for lexterm in lexterms:
            NEtype = "B" + lexterm[1][1:]
            # NOTE(review): under Python 2 this is floor division unless the
            # file enables `from __future__ import division` - confirm.
            lexweights.append([numNEs[NEtype] / lexNEs[NEtype]])
            lexrec.append("\t".join([lexterm[0], lexterm[1]]))
        for token, count in cts.most_common():
            lexweights.append([count])
            lexrec.append("\t".join([token, "O"]))
        for NEtype in NEcts:
            # only entity types that no lexicon covers contribute their counts
            if not lexNEs["B-" + NEtype[2:]]:
                for token, count in NEcts[NEtype].most_common():
                    lexweights.append([count])
                    lexrec.append("\t".join([token, NEtype]))

    exdeflike = {}
    if doExternal:
        if externalPOS:
            exdeflike = NER.trainExternal(POS_train, multi)
        else:
            exdeflike = NER.trainExternal(train, multi)

    indeflike = {}
    if doInternal:
        if pluslexica:
            train.extend(lexrec)
            weights.extend(lexweights)
            indeflike = NER.trainInternal(train,
                                          weights=weights,
                                          lowercase=lowercase)
        else:
            indeflike = NER.trainInternal(train, lowercase=lowercase)

    # now write these dicts to disk; each active option inserts its label just
    # before "_entitytype", in the order listed below
    exout = 'finalexdef_entitytype.pickle'
    inout = 'finalindef_entitytype.pickle'
    if externalPOS:
        # only the external model depends on the POS records
        exout = re.sub("_entitytype", "_externalPOS_entitytype", exout)
    labels = [
        (outhandle, outhandle),
        (plusdev, "plusdev"),
        (plusconll2003, "plusconll2003"),
        (plus2016, "plus2016"),
        (pluslexica, "pluslexica"),
        (pluswiki, "pluswiki"),
        (plusgmb, "plusgmb"),
        (lowercase, "lowercase"),
    ]
    for enabled, label in labels:
        if enabled:
            replacement = "_" + label + "_entitytype"
            exout = re.sub("_entitytype", replacement, exout)
            inout = re.sub("_entitytype", replacement, inout)

    if doExternal:
        with open(exout, 'wb') as outfile:
            pickle.dump(exdeflike, outfile)
    if doInternal:
        with open(inout, 'wb') as outfile2:
            pickle.dump(indeflike, outfile2)
def crossTrain(n=10, multi=True):
    """Train the external and internal models once per fold and pickle them.

    The 2017 training file is split into ``n`` parts by ``split_data``. For
    every part i, the models are trained on all the other parts, and the
    results are stored under key i. Both collections are then pickled to
    'exdeflike.txt' and 'indeflike.txt' so that later runs can load them
    instead of retraining.

    n: number of folds handed to ``split_data``.
    multi: passed through to ``NER.trainExternal``.
    """
    folds = split_data('../data/2017/wnut17train.conll', n)
    total_indef = defaultdict(defaultdict)
    total_exdef = defaultdict(defaultdict)

    # Adding the 2016 train/dev/test records to every fold's training set is
    # currently disabled.
    for held_out, _ in enumerate(folds):
        # everything except the held-out fold, flattened into one record list
        train = []
        for position, fold in enumerate(folds):
            if position != held_out:
                train.extend(fold)

        total_exdef[held_out] = NER.trainExternal(train, multi)
        total_indef[held_out] = NER.trainInternal(train)

    # write these dicts to the disk
    with open('exdeflike.txt', 'wb') as outfile:
        pickle.dump(total_exdef, outfile)
    with open('indeflike.txt', 'wb') as f:
        pickle.dump(total_indef, f)
def final_analysis( exdeflikefile, indeflikefile, multi=True, lowercase=True, externalPOS=True, dev=True, thresholds={ "location": 0.292, "group": 0.09, "product": 0.131, "creative-work": 1.1, "person": 0.202, "corporation": 1.1 }): #load exdef and indef with open(exdeflikefile) as f: exdeflike = pickle.load(f) with open(indeflikefile) as f: indeflike = pickle.load(f) """train all of the given training data and then test it on the supplied test records. make predictions for NE for each token, then print them out in the format required""" """ #load exdef and indef with open('finalexdef.pickle', 'rb') as infile: exdeflike = pickle.load(infile) with open('finalindef.pickle', 'rb') as infile2: indeflike = pickle.load(infile2) """ #load the test data if dev: test_file = '../data/emerging.dev.conll' outfilename = "../data/finalpredictions/emerging_" + "_".join( re.split("_", indeflikefile)[1:3]) + ".dev" else: test_file = '../data/emerging.test' outfilename = "../data/finalpredictions/emerging_" + "_".join( re.split("_", indeflikefile)[1:3]) + ".test" with open(test_file, 'r') as f3: records = re.split("\n[\t]?\n", f3.read().strip()) #analyze the test data #on the training data, using n-fold validation # f = open(test_file + ".prediction", 'w') f = open(outfilename, "w") # thresholds = { # ## "location": [0.001, 0.157], # "location": [1.1, 0.157], # ## "group": [0.008, 0.199], # "group": [1.1, 0.199], # "product": [1.1, 0.215], # "creative-work": [1.1,0.499], # ## "person": [0.002, 0.167], # "person": [1.1, 0.167], # "corporation": [1.1, 0.218] # } for record in records: if record: #avoid empty strings if dev: data = [ re.split('\t', d) for d in re.split("\n", record) if len(re.split("\t", d)) == 2 ] tokens, tags = zip(*data) else: tokens = [ re.split('\t', d)[0] for d in re.split("\n", record) if len(re.split("\t", d)) == 1 ] uppertokens = list(tokens) if lowercase: tokens = [token.lower() for token in tokens] #keep track of the NE assignments for each token with tuples 
if lowercase: assignments = [[token, 'O'] for token in uppertokens if token] else: assignments = [[token, 'O'] for token in tokens if token] ## predictions = {} for NEtype in thresholds: exavedeflike, inavedeflike = NER.test(tokens, exdeflike, indeflike, multi, NEtype, externalPOS=externalPOS, uppertokens=uppertokens) #find the NEs using the _LFD_ function as before for indices in NER.LFD(tokens, exavedeflike, inavedeflike, [1.1, thresholds[NEtype]]): # if exavedeflike[indices] >= thresholds[NEtype][0]: # print NEtype+": ", [tokens[ix] for ix in indices] # print "external", exavedeflike[indices] innums = [ inavedeflike[ix][1] for ix in indices if ix != indices[0] ] innums.append(inavedeflike[indices[0]][0]) ## print "internal", NER.harmonic_mean(innums) predictions[(indices, NEtype)] = [ len(list(indices)), exavedeflike[indices], NER.harmonic_mean(innums) ] ## for indices, NEtype in predictions: thissize = predictions[(indices, NEtype)][0] thislike = predictions[(indices, NEtype)][2] for otherindices, otherNEtype in predictions: thatsize = predictions[(otherindices, otherNEtype)][0] thatlike = predictions[(otherindices, otherNEtype)][2] broken = True for ix in otherindices: if ix in indices: if otherindices[0] < indices[0]: print("precidence, avoided: ", [tokens[ix] for ix in indices], " over ", [tokens[ix] for ix in otherindices]) break elif otherindices[0] == indices[0]: if thatsize > thissize: print("size, avoided: ", [tokens[ix] for ix in indices], " over ", [tokens[ix] for ix in otherindices]) break elif thatlike > thislike: print("likelihood, avoided: " + NEtype, [tokens[ix] for ix in indices ], " over " + otherNEtype, [tokens[ix] for ix in otherindices]) break else: broken = False if broken: break else: print NEtype + ": ", [tokens[ix] for ix in indices] print "internal", thislike ## #assign 'B' to the first, 'I' to the rest n = 0 for index in indices: if n == 0: assignments[index][1] = 'B-' + NEtype else: assignments[index][1] = 'I-' + NEtype n += 1 #keep 
track of position in NE ## for i, assignment in enumerate(assignments): if dev: f.writelines( "\t".join([assignment[0], tags[i], assignment[1]]) + "\n") else: f.writelines("\t".join([assignment[0], assignment[1]]) + "\n") f.writelines("\n")
import NER
import pickle
import nltk
import itertools

# Train a CRF tagger and an online classifier on the GMB corpus, then report
# how both perform on held-out sentences.
corpus_root = "/Users/funktor/Downloads/gmb-2.2.0"

# FIX: the splits used to be bare islice iterators. They are one-shot, so
# after trainCRF/testCRF had consumed them, trainOnline/testOnline received
# nothing. Materialise both splits once so they can be reused. iter() makes
# the test split the 5000 sentences that follow the training split, whether
# read_gmb returns a list or a generator.
sentences = iter(NER.read_gmb(corpus_root))
train_sents = list(itertools.islice(sentences, 50000))
test_sents = list(itertools.islice(sentences, 5000))

crf_model = NER.trainCRF(train_sents)

# Round-trip the model through disk so the saved file is the one evaluated.
with open('crf_model.sav', 'wb') as model_file:
    pickle.dump(crf_model, model_file)
with open('crf_model.sav', 'rb') as model_file:
    crf_model = pickle.load(model_file)

# Renamed from `str`, which shadowed the builtin.
sample_sentence = "Christian Bale acted as the Batman and Heath Ledger as the Joker in the movie The Dark Knight"
print(NER.predictNERSentence(sample_sentence, crf_model))
print(NER.testCRF(crf_model, test_sents))

tags = [
    'O', 'B-per', 'I-per', 'B-gpe', 'I-gpe', 'B-geo', 'I-geo', 'B-org',
    'I-org', 'B-tim', 'I-tim', 'B-art', 'I-art', 'B-eve', 'I-eve', 'B-nat',
    'I-nat'
]
clf = NER.trainOnline(train_sents, tags, batch_size=500)
NER.testOnline(clf, test_sents)
def final_scan(exdeflike, indeflike, multi=True, lowercase=False, externalPOS=False, outkey="default"): """train all of the given training data and then test it on the supplied test records. make predictions for NE for each token, then print them out in the format required""" """ #load exdef and indef with open('finalexdef.pickle', 'rb') as infile: exdeflike = pickle.load(infile) with open('finalindef.pickle', 'rb') as infile2: indeflike = pickle.load(infile2) """ #load the test data test_file = '../data/emerging.dev.conll' with open(test_file, 'r') as f3: records = re.split("\n[\t]?\n", f3.read().strip()) numrecs = len(records) os.system("mkdir -p ../data/predictions/" + outkey) #analyze the test data # threshold = [0.138, 0.13] #this is the threshold we found to give the best F1 score #on the training data, using n-fold validation tstarts = { "location": 0.5, "group": 0.5, "product": 0.5, "creative-work": 0.5, "person": 0.5, "corporation": 0.5 } tdiffs = range(-4, 5) # [d for d in range(-49,50)] ## diffs = [-0.49--0.49] allresults = defaultdict(list) ## begin rounds loop here for rnd in range(1, 4): ##range(1,2): ## print "working on rnd " + str(rnd) print "here are the starting thresholds: " for NEtype in tstarts: print NEtype, tstarts[NEtype] print results = defaultdict(list) tdiffs = [tdiff / 10 for tdiff in tdiffs] # [tdiff/100 for tdiff in tdiffs] [-0.0049--0.0049] NEthreshs = { NEtype: [tstarts[NEtype] + tdiff for tdiff in tdiffs] # [t/1000 for t in range(1001)] for NEtype in tstarts } fs = {} for NEtype in NEthreshs: for t in NEthreshs[NEtype]: fkey = str(t) + "-" + NEtype threshfile = re.sub("/data/", "/data/predictions/" + outkey + "/", test_file + "-" + fkey + ".prediction") fs[fkey] = [open(threshfile, 'w'), threshfile] fs[fkey][0].close() numdone = 0 for record in records: print str( 100 * numdone / numrecs) + "% done with round " + str(rnd) numdone += 1 if record: #avoid empty strings data = [ re.split('\t', d) for d in re.split("\n", record) if 
len(re.split("\t", d)) == 2 ] tokens, tags = zip(*data) uppertokens = list(tokens) if lowercase: tokens = [token.lower() for token in tokens] for NEtype in NEthreshs: exavedeflike, inavedeflike = NER.test( tokens, exdeflike, indeflike, multi, NEtype, externalPOS=externalPOS, uppertokens=uppertokens) for t in NEthreshs[NEtype]: #keep track of the NE assignments #for each token with tuples if lowercase: assignments = [[token, 'O'] for token in uppertokens if token] else: assignments = [[token, 'O'] for token in tokens if token] fkey = str(t) + "-" + NEtype #find the NEs using the _LFD_ function as before for indices in NER.LFD(tokens, exavedeflike, inavedeflike, [1.1, t]): # print t, [tokens[ix] for ix in indices] # innums = [inavedeflike[ix][1] for ix in indices if ix != indices[0]] # innums.append(inavedeflike[indices[0]][0]) # print "internal", NER.harmonic_mean(innums) # raw_input() n = 0 for index in indices: if n == 0: assignments[index][1] = 'B-' + NEtype else: assignments[index][1] = 'I-' + NEtype n += 1 #keep track of position in NE ## write out according to file handles, here fs[fkey][0] = open(fs[fkey][1], "a") for i, assignment in enumerate(assignments): fs[fkey][0].writelines("\t".join( [assignment[0], tags[i], assignment[1]]) + "\n") fs[fkey][0].writelines("\n") fs[fkey][0].close() ## evaluate all thresholds and all NE types for the best of the round for fkey in fs: ## fs[fkey][0].close() NEtype = "-".join(re.split("-", fkey)[1:]) t = float(re.split("-", fkey)[0]) filename = fs[fkey][1] try: results[NEtype].append((map( float, re.split("\;", [ re.sub("[^0-9\.\;]+", "", re.sub("\d+$|\d\:", "", r)) for r in re.split( "\n", subprocess.check_output( "python2 ../data/wnuteval.py " + filename, shell=True)) if re.search(NEtype, r) ][0])), t)) except: results[NEtype].append(([0., 0., 0.], t)) allresults[NEtype].append(tuple(results[NEtype][-1])) ## store the best of this round as tstarts for NEtype in results: tstarts[NEtype] = max(results[NEtype], key=lambda 
x: x[0][2])[1] print "here are the end-of-round thresholds: " for NEtype in tstarts: print NEtype, tstarts[NEtype] print with open("../data/predictions/" + outkey + "/allresults.json", "w") as f: f.writelines(json.dumps([tstarts, allresults])) return tstarts, allresults
try:
    json.dump(struct, fh,indent=0)
    fh.close()
    shutil.copy(tmpFile.name, fName)
except:
    print >> sys.stderr, "Failed to write the structure in", fName
    print_exception()
tmpFile.close()


def loadJson(fName):
    """Load and return the JSON document stored in the file *fName*.

    Returns the decoded object, or -1 if the file cannot be opened or parsed.
    In that case the problem is reported on stderr via ``print_exception``.
    """
    try:
        # FIX: the handle used to be closed inside the except clause, which
        # raised UnboundLocalError (hiding the real error) whenever open()
        # itself failed. The with-block closes it on every path.
        with open(fName, 'rb') as fh:
            return json.load(fh)
    except Exception:
        print >> sys.stderr, fName, "Failed to be interpreted as a Json file because of"
        print_exception()
        return -1


if __name__== "__main__":
    fName = os.path.abspath(sys.argv[1])
    b2utf8(fName, fName+".utf8")
    NER.utf8tobw(fName+".utf8", fName+".BW")
    b2utf8(fName+".BW", fName+".BW.utf8")
    fixCols(fName+".BW.utf8")
def process_email(text):
    """Print the tagged body of an email, a separator, then its names.

    The body comes from ``ProcessedEmail(text).body`` and is handed to
    ``Tagger.output_tagged_para``; the complete, unprocessed ``text`` is
    handed to ``NER.print_names_text``.
    """
    body = ProcessedEmail(text).body
    Tagger.output_tagged_para(body)
    print ("===========================================")
    NER.print_names_text(text)
def _span_between(sentence, opener, closer):
    """Return the text between the first *opener* and the next *closer*.

    Returns None when either delimiter is missing.
    """
    start = sentence.find(opener)
    if start == -1:
        return None
    start += len(opener)
    end = sentence.find(closer, start)
    if end == -1:
        return None
    return sentence[start:end]


def answering_what(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,sent_time_list,sent_person_list):
    """Answer a "what" question from the candidate sentences.

    Every sentence is scored by a set of rules (word match, month/date clue,
    "kind" and "name" clues, shared verbs). The answer is built from the first
    best-scoring sentence by dropping the words that occur in the question.
    For short questions, a sentence containing a bracketed or dashed aside and
    at least one question word answers immediately with the aside.

    Parameters
    ----------
    cleansedQuestion : str
        The question text.
    stop_words_free_question :
        Unused; kept for interface compatibility.
    complete_sentence_list, sentence_list, sent_time_list : list
        Parallel lists, indexed per sentence: the sentence used for rule
        matching and for the answer, the sentence handed to
        ``WM.stemWordMatch``, and the time expressions of the sentence.
    sent_person_list :
        Unused; kept for interface compatibility.

    Returns
    -------
    str: the answer text; '' when there are no sentences.
    """
    sent_score_list=[]
    q_verblist=[]
    stanford_stop_words_list=['a','an','and','are','as','at','be','buy','do','for','from',
                              'has','have','he','in','is','it','its','of','on','that','the',
                              'to','was','were','will','with']
    what_month=['january','jan', 'february', 'feb', 'march', 'mar', 'april', 'apr', 'may','may', 'june', 'jun', 'july', 'jul','august','aug','september','sep','october','oct','november','nov','december','dec']
    abbreviation_list=[('Mt.','Mount')]
    verb_tags=['VB','VBD','VBZ','VBN']
    # (opening, closing) delimiters of an aside, in order of preference
    aside_delimiters=[('(',')'),('--','--'),('{','}')]

    temp_q=cleansedQuestion.replace('"','').replace('?','')
    # FIX: the old check `k in 'Mt.'` was a substring test, so tokens such as
    # "M" or "." were expanded too. Expand only when the abbreviation occurs
    # as a whole token.
    for short_form, long_form in abbreviation_list:
        if short_form in temp_q.split():
            temp_q=temp_q.replace(short_form,long_form)
    question_words=temp_q.split()

    # The entity lists of the question are not used by any rule below; the
    # call is kept in case it has side effects.
    NER.named_entity_recognition(temp_q)

    lmtzr=WordNetLemmatizer()
    # Lemmas of the verbs in the question, excluding stop words
    for entry in POS_Tagging.pos_tagging(temp_q):
        if entry[1] in verb_tags:
            lemma=lmtzr.lemmatize(entry[0],'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    for i, sentence in enumerate(complete_sentence_list):
        sentence_words=sentence.split()
        score=0

        # 1. Word match scoring function for each of the sentences
        score=score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])

        for k in question_words:
            keyword=k.lower()
            # 2. Question names a month and the sentence has a date expression
            if keyword in what_month:
                if sent_time_list[i] != []:
                    score=score + 4
            # 3. What "kind" questions: sentences containing "call" or "from"
            elif keyword =='kind':
                for m in sentence_words:
                    if lmtzr.lemmatize(m,'v') in ['call','from']:
                        score=score+6
            # 4. Question contains "name" and the sentence {name,call,known}
            elif keyword =='name':
                for m in sentence_words:
                    if lmtzr.lemmatize(m,'v') in ['name','call','known']:
                        score=score+20

        # 6. Reward sentences that share a verb with the question
        for entry in POS_Tagging.pos_tagging(sentence):
            if entry[1] in verb_tags and lmtzr.lemmatize(entry[0],'v') in q_verblist:
                score=score + 6

        # 7. Definition-type questions ("what is X"): answer with the aside.
        # Only the first delimiter kind present in the sentence is considered.
        if len(question_words) <= 6:
            for opener, closer in aside_delimiters:
                if opener in sentence:
                    if any(k in sentence_words for k in question_words):
                        # FIX: an unclosed aside used to raise ValueError and a
                        # '--' aside always produced an empty answer; an
                        # unbalanced aside now simply falls through to scoring.
                        aside=_span_between(sentence,opener,closer)
                        if aside is not None:
                            return aside
                    break

        sent_score_list.append(score)

    # FIX: max() of an empty list raised ValueError.
    if not sent_score_list:
        return ''

    # Selecting the sentences that have the maximum score.
    max_score_value=max(sent_score_list)
    final_sent_list=[complete_sentence_list[i]
                     for i, value in enumerate(sent_score_list)
                     if value==max_score_value]

    # On a tie the first best sentence is used.
    temp=final_sent_list[0].split()

    if len(final_sent_list) == 1:
        # "<number> per cent" answers a percentage question directly.
        # FIX: the old guard `k != 0 or k != len(temp)-1` was always true, so
        # the first and last word were not excluded as intended.
        for k in range(1,len(temp)-1):
            if temp[k].lower()=='per' and temp[k+1].lower()=='cent':
                return ' '.join(temp[k-1:k+2])

    # The answer is the sentence without the words of the question.
    answer_list=[word for word in temp if word not in question_words]
    return ' '.join(answer_list)
# -*- coding: utf-8 -*- import OCR import NER from flask import Flask, request import os import json # os.environ["CUDA_VISIBLE_DEVICES"] = "1" OCR_test = OCR.testOCR() # 載入OCR模組 NER_test = NER.testNER() # 載入NER模組 # Flask應用來源:https://www.cnblogs.com/lsdb/p/10488448.html app = Flask(__name__) basedir = os.path.abspath(os.path.dirname(__file__)) # route()方法用于设定路由;类似spring路由配置 #等价于在方法后写:app.add_url_rule('/', 'helloworld', hello_world) @app.route('/') def hello_world(): return 'Hello, World!' # Python flask.request raw應用來源:http://codingdict.com/sources/py/flask.request/4366.html @app.route('/cht/up_photo', methods=['post']) def up_photo_cht(): a = request.get_data() dict1 = json.loads(a) base64_data = dict1["photo"]