def parseIntoDictionary(xml, clean=True):
    """Parse a <dictmap> ElementTree into a word-sense dictionary.

    Args:
        xml: ElementTree whose root is <dictmap>; each child <lexelt> carries
            an 'item' attribute like 'begin.v' or 'network.n', and each of its
            <sense> children carries 'id', 'wordnet', 'gloss' and 'examples'
            attributes.
        clean: when True, run utilities.cleanString over the gloss and each
            example string.

    Returns:
        dict: word -> (pos, subdict), where
        subdict: sense id (int) -> (gloss, [example strings], [wordnet ints]).
    """
    dictionary = {}
    root = xml.getroot()  # <dictmap>
    for lexelt in root:
        # lexelt.attrib['item'] is something like 'begin.v' or 'network.n'
        word, pos = lexelt.attrib['item'].split('.')
        senses = {}
        for sense in lexelt:
            attr = sense.attrib
            sid = int(attr['id'])  # 1, 2, 3, etc.
            # 'wordnet' is a comma-separated list of ints, e.g. "2,3,4".
            # BUG FIX: the original used lazy map() inside try/except, so in
            # Python 3 a malformed value was never caught here and the stored
            # value was a one-shot iterator; build an eager list and catch
            # only the relevant failures (missing attribute / bad int).
            try:
                wordnet = [int(x) for x in attr['wordnet'].split(',')]
            except (KeyError, ValueError):
                wordnet = []
            gloss = attr['gloss'].strip()  # definition text
            examples = attr['examples'].split(' | ')
            if clean:
                gloss = utilities.cleanString(gloss)
                examples = [utilities.cleanString(e) for e in examples]
            senses[sid] = (gloss, examples, wordnet)
        dictionary[word] = (pos, senses)
    return dictionary
def removeTagFromEvent(cls, eventKey, userKey, tagName):
    """Detach the tag named tagName from eventKey for userKey.

    Also deletes the Tag entity itself when this was the only event using it
    and the tag is not permanent. Returns True on success, False when the tag
    does not exist, is not attached to this event, or detaching fails.
    """
    # Normalize the tag name the same way it was stored.
    tagName = utilities.cleanString(tagName)
    # Confirm the tag is attached to this specific event.
    userEventOb = cls.getUserEventForEventFromTagName(userKey, tagName, eventKey)
    if not userEventOb:
        return False
    tagOb = cls.getTagObjectFromString(userKey, tagName)
    # No tag by tagName.
    if not tagOb:
        return False
    tagKey = tagOb.key.urlsafe()
    # Check whether other events still reference the same tag.
    eventKeyList = user_event.UserEvent.getAllEventsFromTagOb(userKey, tagKey)
    logging.info(len(eventKeyList))
    # If this was the only event, remove the tag itself unless it is permanent.
    if len(eventKeyList) == 1 and not tagOb.permanent:
        tagOb.key.delete()
    if not user_event.UserEvent.removeTagObFromEvent(eventKey, userKey, tagKey):
        return False
    return True
def addTagToEvent(cls, eventKey, userKey, tagName):
    """Attach the tag named tagName to eventKey for userKey.

    Creates the Tag entity (with a randomly chosen color) when it does not
    exist yet. Returns a two-element list: [result of
    user_event.UserEvent.addTagObToEvent, tagColor], where tagColor is the
    literal "Tag Exists" when no new color had to be assigned.
    """
    # Normalize the tag name the same way it is stored.
    tagName = utilities.cleanString(tagName)
    tagOb = cls.getTagObjectFromString(userKey, tagName)
    # Sentinel returned by the API call when the tag already existed and no
    # new color was assigned.
    tagColor = "Tag Exists"
    if not tagOb:
        # Tag is new: pick a color and persist it under the user's key.
        tagColor = choice(Tag.colorArr)
        tagOb = Tag(parent=ndb.Key(urlsafe=userKey), permanent=False,
                    name=tagName, color=tagColor)
        tagOb.put()
    tagKey = tagOb.key.urlsafe()
    # Attach the tag to the user's event and report the color used.
    return [user_event.UserEvent.addTagObToEvent(eventKey, userKey, tagKey),
            tagColor]
def Lesk(self, word, pos, pre_words, post_words, softScoring=False, alpha=0.5):
    """Score the senses of `word` via Lesk-style context overlap.

    Looks up the word's senses in self.dict; unknown words are backfilled
    from WordNet (and cached in self.dict). Each sense gets
    computeOverlap(...) + alpha; with softScoring the scores are normalized
    to sum to 1.

    Returns:
        (scores, alpha): dict of sense id -> score, and the alpha used.
    """
    scores = dict()
    # TODO - need a way to store different pos of a word and retrieve them accordingly
    list_of_senses = []
    if word in self.dict:
        wordpos, list_of_senses = self.dict[word]
    else:
        # Unknown word: build sense entries from WordNet definitions.
        lst_tup = dict()
        wn_pos = None
        i = 1
        for synset in wn.synsets(word):  # may be empty!
            lst_tup[i] = (utilities.cleanString(synset.definition), [], [])
            wn_pos = synset.pos  # remember the (last) synset's pos
            i += 1
        # BUG FIX: the original read `synset.pos` after the loop, raising
        # NameError when wn.synsets(word) is empty; fall back to the caller's
        # pos in that case.
        self.dict[word] = (wn_pos if wn_pos is not None else pos, lst_tup)
        list_of_senses = self.dict[word][1]
    # sense is an int key; len(list_of_senses) is approx 5
    for sense in list_of_senses:
        overlap = self.computeOverlap(word, pos, list_of_senses[sense],
                                      pre_words, post_words)
        scores[sense] = overlap + alpha
    if softScoring:
        # Normalize scores into a distribution (no-op on an empty dict).
        total = sum(scores.values())
        for key in scores:
            scores[key] = scores[key] / float(total)
    return scores, alpha
def getUserEventForEventFromTagName(cls, userKey, tagName, eventKey):
    """Return the UserEvent linking eventKey to the tag named tagName.

    Returns False when no such tag exists or the tag is not attached to the
    event.
    """
    # Normalize the tag name the same way it was stored.
    tagName = utilities.cleanString(tagName)
    tagOb = cls.getTagObjectFromString(userKey, tagName)
    # BUG FIX: getTagObjectFromString returns False when no tag matches; the
    # original dereferenced tagOb.key unconditionally -> AttributeError.
    if not tagOb:
        return False
    # Find the existing UserEvent with this tag for the event.
    userEventOb = user_event.UserEvent.getUserEventWithTagObAndEventKey(
        userKey=userKey, tagKey=tagOb.key.urlsafe(), eventKey=eventKey)
    if not userEventOb:
        return False
    return userEventOb
def getTagObjectFromString(cls, userKey, tagName):
    """Return the Tag entity named tagName under userKey, or False if none.

    When several entities share the name, the last one fetched is returned
    (preserves the original behavior).
    """
    # Normalize the tag name the same way it was stored.
    tagName = utilities.cleanString(tagName)
    # Query all tags with this name under the user's ancestor key.
    tagObList = cls.query(ancestor=ndb.Key(urlsafe=userKey)).filter(
        cls.name == tagName).fetch()
    # BUG FIX (idiom): the original looped `for tagOb in tagObList: tagOb =
    # tagOb`, i.e. it just kept the last fetched entity; do that directly.
    if not tagObList:
        return False
    return tagObList[-1]
def getReviewList(datafile, defaultToZero=False):
    """Parse a review file into [(name, [(sentiment, sentence), ...]), ...].

    File format: a review starts with a line holding only its name (no tab),
    followed by tab-separated "<sentiment>\t<sentence>" lines, and is
    terminated by a blank line.

    Args:
        datafile: path of the file to parse.
        defaultToZero: forwarded to strToSentiment for unparseable labels.
    """
    reviews = []  # collect reviews in a list
    # Components of the review currently being accumulated.
    name, lines = None, list()
    with open(datafile) as f:
        for raw in f:
            fields = tuple(raw.strip().split('\t'))
            if len(fields) == 1 and fields[0] != '':
                # No tab character: start of a new review.
                name = fields[0]
            elif len(fields) == 1 and fields[0] == '':
                # Blank line: end of review; collect results and reset.
                reviews.append((name, lines))
                name, lines = None, list()
            else:  # len(fields) == 2
                # Remove stopwords, lemmatize, other cleans...
                sentiment = strToSentiment(fields[0], defaultToZero)
                sentence = utilities.cleanString(fields[1])
                lines.append((sentiment, sentence))
    # BUG FIX: a trailing review not followed by a blank line was silently
    # dropped; flush it at EOF.
    if name is not None:
        reviews.append((name, lines))
    return reviews
def getOverlaps(self, def_words, context_word, listwords):
    """Count overlaps between def_words and the definitions of context_word.

    Definitions come from self.dict when the word is known, otherwise from
    WordNet synsets (cleaned via utilities.cleanString).

    Returns:
        (context_overlap, overlap, consecOverlap): 1/0 flag for context_word
        itself appearing in def_words, the count of single-word overlaps, and
        the accumulated self.consecutiveOverlaps score.
    """
    in_context = 1 if context_word in def_words else 0
    single_hits = 0
    consec_hits = 0
    # Gather every definition string we will scan for overlaps.
    if context_word in self.dict:
        _pos, sense_map = self.dict[context_word]
        definitions = [entry[0] for entry in sense_map.values()]
    else:
        # Word not in our dictionary: fall back to WordNet.
        definitions = [utilities.cleanString(ss.definition)
                       for ss in wn.synsets(context_word)]
    for definition in definitions:
        tokens = definition.split(' ')
        consec_hits += self.consecutiveOverlaps(def_words, tokens)
        single_hits += sum(1 for tok in tokens if tok.strip() in def_words)
    return in_context, single_hits, consec_hits
# NOTE(review): this chunk begins mid-expression -- the call these first
# arguments belong to (apparently a plotting/threshold helper taking
# probabilities, a title and two thresholds) starts before this excerpt, so
# the fragment is left untouched.
token_test.loc[token_test.vanResponse == True, 'prob'].values, title="Van Token Flagging", lower_threshold=.4, upper_threshold=.75)
# Feature importances of the token model (tree-based, judging by the
# attribute); presumably displayed interactively -- the expression's value
# is not assigned.
pd.DataFrame(token_model.feature_importances_, index=Features)
################################
# Model for whether the response has names in it
################################
# Score tokens for each relevant voter response
labeled['names_extract'] = ""
threshold = 0.5
for i, row in labeled.iterrows():
    # Skip rows where both response columns are effectively empty.
    # NOTE(review): cleanString(...) is evaluated BEFORE the `is None` check;
    # this only works if cleanString tolerates None -- verify.
    if (cleanString(row['voterFinal']) == "" or row['voterFinal'] is None) and \
        (cleanString(row['voterPost']) == "" or row['voterPost'] is None):
        continue
    # Token candidates/features from the voter's final response.
    finalCandidates, finalFeatures = get_token_features(
        row['voterFinal'], row['tripleMessage'], van_token_vectorizer, model_token_bow,
        english_dict, census_dict, census_last_dict, token_counter)
    # Same extraction for the post-response text.
    postCandidates, postFeatures = get_token_features(row['voterPost'], row['tripleMessage'],
        van_token_vectorizer, model_token_bow, english_dict, census_dict, census_last_dict,
        token_counter, is_post_response=True)
def main(args):
    """Annotate voter responses with extracted names and flag unclear cases.

    Loads data (flat file or Civis, per args.use_civis), unpickles the
    annotation models, scores each response's tokens with the token model,
    extracts plausible name tokens, and writes two outputs: confirmed
    triplers and rows needing manual review.

    NOTE(review): the pickle.load sequence below must match the order the
    models were dumped in -- do not reorder.
    """
    # Set home directory
    home = Path(args.home_folder)
    # Read in data either from flat file or civis
    if args.use_civis:
        # Civis path overrides the home folder with a fixed project path.
        home = Path("./Projects/NLP/SMS_Annotation/")
        van = load_civis(args.input_data_filename.replace(".csv", ""), args.database_name)
    else:
        van = load_flat_file(home, args.input_data_filename)
    # Thresholds for manual review and labeling: predictions between the two
    # bounds are considered unclear and routed to manual review.
    LOWER_BOUND = .4
    UPPER_BOUND = .75
    print("Loading Models...")
    pickle_file = Path(home, "Models", "annotation_models.pkl")
    # NOTE(review): unpickling executes arbitrary code -- only load trusted
    # model files.
    with open(pickle_file, "rb") as f:
        # N-Gram Featurizers (only token_model/token_counter are used below;
        # the rest are loaded to keep the stream position correct).
        response_vectorizer = pickle.load(f)
        final_vectorizer = pickle.load(f)
        post_vectorizer = pickle.load(f)
        # Logistic Regressions
        token_model = pickle.load(f)
        model_tripler = pickle.load(f)
        model_name = pickle.load(f)
        model_opt = pickle.load(f)
        model_wrongnumber = pickle.load(f)
        token_counter = pickle.load(f)
    print("Loading Data...")
    # US Census Data: first-name frequencies, log-scaled.
    census = pd.read_csv(Path(home, "Utility_Data", "census_first_names_all.csv"))
    census_dict = {}
    for i, row in census.iterrows():
        census_dict[row['name']] = np.log(row['census_count'])
    # Last Name Data: same treatment for surnames.
    census_last = pd.read_csv(Path(home, "Utility_Data", "census_last_names_all.csv"))
    census_last_dict = {}
    for i, row in census_last.iterrows():
        census_last_dict[row['name']] = np.log(row['census_count'])
    # US Word Freq Data: raw frequency, not log-scaled.
    english = pd.read_csv(Path(home, "Utility_Data", "english.csv"))
    english_dict = {}
    for i, row in english.iterrows():
        english_dict[row['name']] = row['freq']
    # Clean NA values
    van.loc[van.notetext.isnull(), 'notetext'] = ""
    van.loc[van.contactname.isnull(), 'contactname'] = ""
    # Get Extracted Names: score every row's tokens and collect results.
    names_extract = []
    manual_review = []
    for i, row in van.iterrows():
        response = row['notetext']
        # Empty responses get no extraction and no review flag.
        if (cleanString(response) == ""):
            names_extract.append("")
            manual_review.append(False)
            continue
        # Per-token feature matrix for the token model.
        X_tokens_row = pd.DataFrame(
            get_token_features(response, row['contactname'], english_dict, census_dict, census_last_dict, token_counter)
        ).values.astype(float)
        y_pred = token_model.predict_proba(X_tokens_row)
        # Tokenize and normalize the response, dropping empty tokens.
        doc = get_doc(response)
        clean_tokens = [normalize_token(t.string) for t in doc]
        clean_tokens = [t for t in clean_tokens if not t == ""]
        # Extract any plausible tokens
        names_extract.append(extract_good_tokens(
            clean_tokens = clean_tokens,
            triple_message = row['contactname'],
            y_pred = y_pred,
            response = response,
            threshold = LOWER_BOUND
        ))
        # Send to Manual Review if there are any tokens in the unclear range
        manual_review.append(((y_pred[:,1] > LOWER_BOUND) & (y_pred[:,1] < UPPER_BOUND)).sum() > 0)
    van['names_extract'] = names_extract
    van['manual_review'] = manual_review
    # Get those with confirmed names (no review needed, non-empty extraction).
    triplers = van.loc[(van.manual_review == False) & ~(van.names_extract == "")][['vanid', 'names_extract']]
    review = van.loc[van.manual_review == True][['vanid', 'contactname', 'notetext', 'names_extract']]
    # Write out annotated files
    if args.use_civis:
        export_civis(triplers, args.output_filename.replace(".csv", ""), args.database_name)
        export_civis(review, args.manual_review_filename.replace(".csv", ""), args.database_name)
    else:
        triplers.to_csv(Path(home, "Output_Data", args.output_filename), index = False, encoding = 'latin1')
        review.to_csv(Path(home, "Output_Data", args.manual_review_filename), index = False, encoding = 'latin1')