def parseIntoDictionary(xml, clean=True):
    # dictionary of word to tuple
    # dict :  word -> ('part-of-sp', subdict)
    # where subdict : senseid -> ('def', ['exampleslist'], [wordnet ints])
    dictionary = dict()
    root = xml.getroot() # <dictmap>
    for lexelt in root: # looks like there are exactly 100
        #lexelt.attrib['item'] is something like 'begin.v' or 'network.n'
        word,pos = tuple(lexelt.attrib['item'].split('.'))
        senses = dict()
        for sense in lexelt: # accumulate senses dictionary for word
            attr = sense.attrib
            sid = int(attr['id']) # 1, 2, 3, etc
            # wordnet is a comma-separated list of ints; handle the empty/missing case
            try:
                wordnet = [int(n) for n in attr['wordnet'].split(',')]  # e.g. [2, 3, 4]
            except (KeyError, ValueError):
                wordnet = []
            gloss = attr['gloss'].strip() # definition
            examples = attr['examples'].split(' | ') # ['hi','world']
            if clean: # clean the gloss and each example
                gloss = utilities.cleanString(gloss)
                for i in range(len(examples)):
                    examples[i] = utilities.cleanString(examples[i])
            senses[sid] = (gloss, examples, wordnet)
        dictionary[word] = (pos, senses)
    return dictionary
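
A minimal usage sketch (assumed, not from the source): parse a senseval-style dictionary XML with ElementTree and look up one headword; the file name and the 'begin' entry are illustrative.

import xml.etree.ElementTree as ET

tree = ET.parse('dictionary.xml')                # hypothetical path
parsed = parseIntoDictionary(tree)
pos, senses = parsed['begin']                    # e.g. ('v', {1: (gloss, examples, wordnet_ids), ...})
gloss, examples, wordnet_ids = senses[1]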
Example #2
 def removeTagFromEvent(cls, eventKey, userKey, tagName):
     
     #clean tag name
     tagName = utilities.cleanString(tagName)
     
     #check that the tag actually exists on this specific event
     userEventOb = cls.getUserEventForEventFromTagName(userKey, tagName, eventKey)
     
     if not userEventOb:
         return False
     
     tagOb = cls.getTagObjectFromString(userKey, tagName)
     
     #no tag by tagName
     if not tagOb:
         return False
     
     tagKey = tagOb.key.urlsafe()
     
     #check whether any other events still use this tag
     eventKeyList = user_event.UserEvent.getAllEventsFromTagOb(userKey, tagKey)
     
     logging.info("events using tag %s: %d", tagName, len(eventKeyList))
     
     #if this is the only event using the tag, delete the tag unless it is permanent
     if len(eventKeyList) == 1 and not tagOb.permanent:
         tagOb.key.delete()
     
     
     if not user_event.UserEvent.removeTagObFromEvent(eventKey, userKey, tagKey):
         return False
     
     return True
Example #3
 def addTagToEvent(cls, eventKey, userKey, tagName):
     
     #clean tag name
     tagName = utilities.cleanString(tagName)
     
     
     tagOb = cls.getTagObjectFromString(userKey, tagName)
         
     #create a new tag if one doesn't exist yet
     tagColor = "Tag Exists" #returned in the API response when the tag already exists, so no new color is assigned
     if not tagOb:
         tagColor = choice(Tag.colorArr)
         tagOb = Tag(parent = ndb.Key(urlsafe=userKey), permanent = False, name=tagName, color = tagColor)
         tagOb.put()
     
     
     tagKey = tagOb.key.urlsafe()
     
     returnArray = []
     
     returnArray.append(user_event.UserEvent.addTagObToEvent(eventKey, userKey, tagKey))
     returnArray.append(tagColor)
         
     #adds tag to userEvent
     return returnArray
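
A hypothetical call site for these tag helpers (assumptions: they are classmethods on an ndb Tag model, event_key and user_key are urlsafe key strings, and the tag name is illustrative):

result, color = Tag.addTagToEvent(event_key, user_key, "volunteer")    # color is "Tag Exists" when the tag already existed
removed = Tag.removeTagFromEvent(event_key, user_key, "volunteer")     # True on success, False otherwise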
    def Lesk(self, word, pos, pre_words, post_words,softScoring=False,alpha=0.5):
        scores = dict()
        # what to do if word doesnt exist?
        # TODO - need a way to store different pos of a word and retrieve them accordingly
        list_of_senses = []
        if word in self.dict:
            wordpos, list_of_senses = self.dict[word]
        else: # we don't know pos, senses, or definition. should not happen with test data
            # fall back to WordNet; guard against words with no synsets at all
            lst_tup = dict()
            synsets = wn.synsets(word) # may be empty!
            for i, synset in enumerate(synsets, start=1):
                # clean definition - check this
                lst_tup.update({i : (utilities.cleanString(synset.definition), [], [])})
            wordpos = synsets[-1].pos if synsets else pos
            self.dict.update({word: (wordpos, lst_tup)})
            list_of_senses = self.dict[word][1]

        #print list_of_senses
        for sense in list_of_senses: # sense is int. length list_of_senses is approx 5
            #print "Senses:\n", sense, list_of_senses[sense]
            overlap = self.computeOverlap(word, pos, list_of_senses[sense], pre_words, post_words)
            #print "overlap for sense", sense, ":", overlap
            #print "!!!!", sense, "Overlap:", overlap
            scores[sense] = overlap+alpha
        #print "Best Sense is: ", best_sense
        if softScoring:
            total = float(sum(scores.values()))
            if total > 0:
                for key in scores:
                    scores[key] = scores[key] / total
        return scores, alpha
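
A minimal usage sketch (assumptions: wsd is an instance of the class that defines Lesk and self.dict; the target word and context windows are illustrative):

scores, alpha = wsd.Lesk('plant', 'n', ['the', 'river'], ['was', 'green'], softScoring=True)
best_sense = max(scores, key=scores.get)    # sense id with the highest overlap score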
Example #5
    def getUserEventForEventFromTagName(cls, userKey, tagName, eventKey):
        
        #clean tag name
        tagName = utilities.cleanString(tagName)
        
        tagOb = cls.getTagObjectFromString(userKey, tagName)
        
        #no tag by tagName
        if not tagOb:
            return False
        
        #try to find the existing UserEvent with this tag name for the event
        userEventOb = user_event.UserEvent.getUserEventWithTagObAndEventKey(
            userKey = userKey, tagKey = tagOb.key.urlsafe(), eventKey = eventKey)
        
        if not userEventOb:
            return False
        
        return userEventOb
Example #6
 def getTagObjectFromString(cls, userKey, tagName):
     
     #clean tag name
     tagName = utilities.cleanString(tagName)
     
     tagOb = None
     
     #tries finding existing tag
     tagObList = cls.query(ancestor=ndb.Key(urlsafe=userKey)).filter(cls.name == tagName).fetch()
     if tagObList:
         tagOb = tagObList[-1]
     
     #no tag by tagName
     if tagOb is None:
         return False
     
     return tagOb
def getReviewList(datafile, defaultToZero = False):
    reviews = [] # collect reviews in a list
    # review components
    name,lines = None, list()
    with open(datafile) as f:
        for line in f:
            line = tuple(line.strip().split('\t'))
            if len(line) == 1 and line[0] != '':
                # if line has no tab character, start of a new review.
                name = line[0]
            elif len(line) == 1 and line[0] == '':
                # end of review. collect results, append
                reviews.append((name, lines))
                name,lines = None, list()
            else: # len(line) == 2
                # remove stopwords, lemmatize, other cleans...
                sentiment = strToSentiment(line[0], defaultToZero)
                sentence = utilities.cleanString(line[1])
                lines.append((sentiment, sentence))
    # flush the last review if the file doesn't end with a blank line
    if name is not None:
        reviews.append((name, lines))
    return reviews
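
A minimal usage sketch (assumptions: the file name is illustrative; the format, inferred from the parser above, is a review name on its own line, tab-separated sentiment/sentence lines, and a blank line ending each review):

reviews = getReviewList('reviews.tsv', defaultToZero=True)
for name, lines in reviews:
    for sentiment, sentence in lines:
        print(name, sentiment, sentence)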
 def getOverlaps(self, def_words, context_word, listwords):
     context_overlap = 0 # if context_word in def_words
     overlap = 0 # overlap of contextword def words and def_words
     consecOverlap = 0
     if context_word in def_words:
         context_overlap += 1
     if context_word in self.dict:
         (pos, subdict) = self.dict[context_word]
         for sense in subdict:
             (worddef, examples, wnints) = subdict[sense]
             lst = worddef.split(' ')
             consecOverlap += self.consecutiveOverlaps(def_words, lst)
             for wrd in lst:
                 if wrd.strip() in def_words:
                     overlap += 1
     else: # look up in wordnet
         for synset in wn.synsets(context_word):
             defin = utilities.cleanString(synset.definition).split(' ')
             consecOverlap += self.consecutiveOverlaps(def_words, defin)
             for wrd in defin:
                 if wrd.strip() in def_words:
                     overlap += 1
     return context_overlap, overlap, consecOverlap
Example #9
                token_test.loc[token_test.vanResponse == True, 'prob'].values,
                title="Van Token Flagging",
                lower_threshold=.4,
                upper_threshold=.75)

pd.DataFrame(token_model.feature_importances_, index=Features)

################################
# Model for whether the response has names in it
################################

# Score tokens for each relevant voter response
labeled['names_extract'] = ""
threshold = 0.5
for i, row in labeled.iterrows():
    if (row['voterFinal'] is None or cleanString(row['voterFinal']) == "") and \
            (row['voterPost'] is None or cleanString(row['voterPost']) == ""):
        continue
    finalCandidates, finalFeatures = get_token_features(
        row['voterFinal'], row['tripleMessage'], van_token_vectorizer,
        model_token_bow, english_dict, census_dict, census_last_dict,
        token_counter)
    postCandidates, postFeatures = get_token_features(row['voterPost'],
                                                      row['tripleMessage'],
                                                      van_token_vectorizer,
                                                      model_token_bow,
                                                      english_dict,
                                                      census_dict,
                                                      census_last_dict,
                                                      token_counter,
                                                      is_post_response=True)
def main(args):

    # Set home directory
    home = Path(args.home_folder)
    
    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        van = load_civis(args.input_data_filename.replace(".csv", ""), args.database_name)
    else:
        van = load_flat_file(home, args.input_data_filename)
    
    # Thresholds for manual review and labeling
    LOWER_BOUND = .4 
    UPPER_BOUND = .75

    print("Loading Models...")


    pickle_file = Path(home, "Models", "annotation_models.pkl")
    with open(pickle_file, "rb") as f:
        # N-Gram Featurizers
        response_vectorizer = pickle.load(f)
        final_vectorizer = pickle.load(f)
        post_vectorizer = pickle.load(f)

        # Logistic Regressions
        token_model = pickle.load(f)
        model_tripler = pickle.load(f)
        model_name = pickle.load(f)
        model_opt = pickle.load(f)
        model_wrongnumber = pickle.load(f)
        token_counter = pickle.load(f)

    print("Loading Data...")

    # US Census Data
    census = pd.read_csv(Path(home, "Utility_Data", "census_first_names_all.csv"))
    census_dict = {}
    for i, row in census.iterrows():
        census_dict[row['name']] = np.log(row['census_count'])

    # Last Name Data
    census_last = pd.read_csv(Path(home, "Utility_Data", "census_last_names_all.csv"))
    census_last_dict = {}
    for i, row in census_last.iterrows():
        census_last_dict[row['name']] = np.log(row['census_count'])

    # US Word Freq Data
    english = pd.read_csv(Path(home, "Utility_Data", "english.csv"))
    english_dict = {}
    for i, row in english.iterrows():
        english_dict[row['name']] = row['freq']

    # Clean NA values
    van.loc[van.notetext.isnull(), 'notetext'] = ""
    van.loc[van.contactname.isnull(), 'contactname'] = ""

    # Get Extracted Names
    names_extract = []
    manual_review = []
    for i, row in van.iterrows():
        response = row['notetext']
        if (cleanString(response) == ""):
            names_extract.append("")
            manual_review.append(False)
            continue
        X_tokens_row = pd.DataFrame(
            get_token_features(response, row['contactname'], english_dict, census_dict, census_last_dict, token_counter)
            ).values.astype(float)
        y_pred = token_model.predict_proba(X_tokens_row)
        doc = get_doc(response)
        clean_tokens = [normalize_token(t.string) for t in doc] 
        clean_tokens = [t for t in clean_tokens if not t == ""]
        
        # Extract any plausible tokens
        names_extract.append(extract_good_tokens(
                clean_tokens = clean_tokens, 
                triple_message = row['contactname'],
                y_pred = y_pred, 
                response = response, 
                threshold = LOWER_BOUND
                ))
        
        # Send to Manual Review if there are any tokens in the unclear range
        manual_review.append(((y_pred[:,1] > LOWER_BOUND) & (y_pred[:,1] < UPPER_BOUND)).sum() > 0)
    van['names_extract'] = names_extract
    van['manual_review'] = manual_review

    # Get those with confirmed names
    triplers = van.loc[(van.manual_review == False) & ~(van.names_extract == "")][['vanid', 'names_extract']]
    review = van.loc[van.manual_review == True][['vanid', 'contactname', 'notetext', 'names_extract']]
    
    # Write out annotated files
    if args.use_civis:
        export_civis(triplers, args.output_filename.replace(".csv", ""), args.database_name)
        export_civis(review, args.manual_review_filename.replace(".csv", ""), args.database_name)
    else:
        triplers.to_csv(Path(home, "Output_Data", args.output_filename), index = False, encoding = 'latin1')
        review.to_csv(Path(home, "Output_Data", args.manual_review_filename), index = False, encoding = 'latin1')
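
A hypothetical driver for main() (the argument names come from the attributes referenced above; the defaults and flag style are assumptions):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--home_folder", default=".")
    parser.add_argument("--use_civis", action="store_true")
    parser.add_argument("--input_data_filename", default="van_responses.csv")
    parser.add_argument("--database_name", default=None)
    parser.add_argument("--output_filename", default="names_extracted.csv")
    parser.add_argument("--manual_review_filename", default="manual_review.csv")
    main(parser.parse_args())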