def find_user_by_id(user_id):
    '''
    (str) --> dict

    Returns a user's info given their id.

    The id is matched against the "_id" field of the users collection and
    the stored document is returned.

    Raises TypeError if user_id is not a string (str or unicode) and
    ValueError if no user has that id.
    '''
    # Validate before opening a database connection. unicode is accepted
    # as well as str, like parse_user_site() and delete_user_keywords() do.
    if not isinstance(user_id, (str, unicode)):
        raise TypeError(
            bcolors.FAIL +
            "Invalid Input. Enter a valid user id as a string to use this function"
            + bcolors.ENDC)

    db = database()
    user = db.users.find_one({"_id": user_id})
    if not user:
        raise ValueError(bcolors.FAIL + "Invalid User ID" + bcolors.ENDC)
    return user
def find_user_by_email(email):
    '''
    (str) --> dict

    Returns a user's info given their email.

    The email is matched against the "emails.address" field of the users
    collection and the first matching document is returned.

    Raises TypeError if email is not a string (str or unicode) and
    ValueError if no user has that email.
    '''
    # Validate before opening a database connection. unicode is accepted
    # as well as str, like parse_user_site() and delete_user_keywords() do.
    if not isinstance(email, (str, unicode)):
        raise TypeError(
            bcolors.FAIL +
            "Invalid Input. Enter a valid email as a string to use this function"
            + bcolors.ENDC)

    db = database()
    user = db.users.find_one({"emails.address": email})
    if not user:
        raise ValueError(bcolors.FAIL + "No user found with that email" +
                         bcolors.ENDC)
    return user
def find_user_by_name(name):
    '''
    (str) --> list

    Returns a list of users info given a name.

    The search is case-insensitive and substring based. How the words of
    name are matched depends on how many words it contains:
      * one word      - the word appears in the first OR the last name
      * two words     - the first word appears in the first name AND the
                        second word appears in the last name
      * three or more - any one of the words appears in the first or the
                        last name

    Every matching user is printed and appears exactly once in the
    returned list.

    Raises TypeError if name is not a string (str or unicode) and
    ValueError if no user matches.
    '''
    if not isinstance(name, (str, unicode)):
        raise TypeError(
            bcolors.FAIL +
            "Invalid Input. Enter a valid name as a string to use this function"
            + bcolors.ENDC)

    # separates the lower-cased string into an array of words
    words = re.findall(r'\w+', name.lower())

    db = database()
    user_list = []
    for user in db.users.find():
        # A user with no profile, or with a missing first/last name, is
        # compared as an empty name instead of aborting the whole search.
        profile = user.get("profile") or {}
        first_name = (profile.get("firstName") or "").lower()
        last_name = (profile.get("lastName") or "").lower()

        if len(words) == 1:
            # The extracted word is used (not the raw input) so stray
            # whitespace or punctuation around it does not defeat the match.
            matched = words[0] in first_name or words[0] in last_name
        elif len(words) == 2:
            matched = words[0] in first_name and words[1] in last_name
        else:
            # any() yields a single verdict per user, so a user matching
            # several of the words is still listed only once. With no
            # words at all it is False and nobody matches.
            matched = any(word in first_name or word in last_name
                          for word in words)

        if matched:
            user_list.append(user)

    if not user_list:
        raise ValueError(bcolors.FAIL + "Name not found in database" +
                         bcolors.ENDC)

    for user in user_list:
        print(user)
        print("")
    return user_list
def parse_user_site(user_id):
    '''
    (str) --> bool

    Parses a user's site given a user id.

    Looks up the url stored on the user's profile, records the page title
    (from get_title) and the time of parsing on the profile, then calls
    increment_word() for every word returned by get_html() and for every
    word returned by get_pdf() that was not already counted from the html.
    A word that fails to be counted is reported and skipped.

    Returns True once the site has been processed. Raises TypeError if
    user_id is not a string (str or unicode), and ValueError if the user
    does not exist or has no profile url.
    '''
    if not isinstance(user_id, (str, unicode)):
        raise TypeError(bcolors.FAIL + "Invalid ID type" + bcolors.ENDC)
    user_id = str(user_id)

    db = database()
    user = db.users.find_one({"_id": user_id})
    if not user:
        raise ValueError(bcolors.FAIL + "Invalid User ID" + bcolors.ENDC)

    # Fail with a clear error instead of an AttributeError/TypeError further
    # down when the user never submitted a url.
    url_temp = (user.get("profile") or {}).get("url")
    if not url_temp:
        raise ValueError(bcolors.FAIL + "User has no profile url" +
                         bcolors.ENDC)

    print(bcolors.OKGREEN + ("parsing through: " + url_temp) + bcolors.ENDC)

    title = get_title(url_temp)
    last_parsed = time.strftime("%c")
    # NOTE(review): the two trailing positionals look like the mongo shell's
    # (upsert, multi); in PyMongo's legacy update() the fourth positional
    # argument is `manipulate`, not `multi` -- confirm which was intended.
    db.users.update({"profile.url": url_temp}, {
        "$set": {
            "profile.title": title,
            "profile.lastParsed": last_parsed
        }
    }, False, True)

    # there will be 2 types of tags,
    # from website, and from pdf
    tags_temp = get_html(url_temp)  # this is all words from the html
    tagsPDF_temp = get_pdf(url_temp)  # this is all words from the pdf

    # words successfully counted from the html; a set keeps the membership
    # test below cheap
    counted = set()
    for keyword in tags_temp:
        try:
            total = increment_word(keyword)
            print(bcolors.OKGREEN +
                  ("Incremented: %s to %d" % (keyword, total)) + bcolors.ENDC)
            counted.add(keyword)
        except Exception as e:
            print(bcolors.FAIL + "Invalid Entry" + bcolors.ENDC)
            print(e)

    for keyword in tagsPDF_temp:
        try:
            if keyword not in counted:
                total = increment_word(keyword)
                print(bcolors.OKGREEN +
                      ("Incremented: %s to %d" % (keyword, total)) +
                      bcolors.ENDC)
        except Exception as e:
            print(bcolors.FAIL + "Invalid Entry" + bcolors.ENDC)
            print(e)

    return True
def count_total_words():
    """
    () --> integer

    Adds up the "total" of every word_count entry whose total is greater
    than one, and returns that sum.
    """
    repeated_words = database().word_count.find({"total": {'$gt': 1}})
    return sum(entry.get("total") for entry in repeated_words)
def count_distinct_words():
    """
    () --> integer

    Returns how many word_count entries have a "total" greater than one,
    i.e. the number of distinct words that appeared more than once.
    """
    repeated_words = database().word_count.find({"total": {'$gt': 1}})
    return sum(1 for _ in repeated_words)
def calculate_keywords(max_std_devs=0.8416):
    """
    ([number]) --> list

    Returns a list of [word, total] pairs for the keywords in the database
    whose total is at most max_std_devs standard deviations above the mean,
    ordered from the highest total to the lowest. Words with equal totals
    keep the order in which the database returned them, and each word is
    listed once.

    The default cutoff of 0.8416 is meant to keep the bottom 80% of the
    words (presumably assuming normally distributed totals).

    Entries whose total is missing or zero are left out. An empty
    word_count collection gives an empty list. The result is also printed.
    """
    db = database()
    entries = list(db.word_count.find())
    if not entries:
        # nothing to rank; also avoids taking statistics of an empty set
        return []

    avg_val = _average_count()
    std_dev = _std_count()
    # Same test as (avg - total) / std < -cutoff, rearranged so a standard
    # deviation of zero cannot cause a division by zero.
    highest_allowed = avg_val + max_std_devs * std_dev

    kept = [
        entry for entry in entries
        if entry.get("total") and entry.get("total") <= highest_allowed
    ]
    # sort() is stable, also with reverse=True, so ties keep database order
    kept.sort(key=lambda entry: entry.get("total"), reverse=True)

    sorted_list = []
    seen_words = set()
    for entry in kept:
        word = entry.get("word")
        # a word stored in more than one entry is reported once, with its
        # highest total
        if word in seen_words:
            continue
        seen_words.add(word)
        sorted_list.append([word, entry.get("total")])

    print(sorted_list)
    return sorted_list
def get_all_urls():
    """
    () --> list

    Returns a list of all the urls users have submitted.

    The url is read from each user's profile. Users that have no profile,
    or whose profile has no (or an empty) url, are left out.
    """
    db = database()
    url_list = []
    for user in db.users.find():
        # tolerate users without a profile instead of failing the whole
        # listing on the first one
        url = (user.get("profile") or {}).get("url")
        if url:
            url_list.append(url)
    return url_list
def parse_user_site(user_id):
    '''
    (str) --> bool

    Parses a user's site given a user id.

    Looks up the url stored on the user's profile, records the page title
    (from get_title) and the time of parsing on the profile, then calls
    increment_word() for every word returned by get_html() and for every
    word returned by get_pdf() that was not already counted from the html.
    A word that fails to be counted is reported and skipped.

    Returns True once the site has been processed. Raises TypeError if
    user_id is not a string (str or unicode), and ValueError if the user
    does not exist or has no profile url.
    '''
    if not isinstance(user_id, (str, unicode)):
        raise TypeError(bcolors.FAIL + "Invalid ID type" + bcolors.ENDC)
    user_id = str(user_id)

    db = database()
    user = db.users.find_one({"_id": user_id})
    if not user:
        raise ValueError(bcolors.FAIL + "Invalid User ID" + bcolors.ENDC)

    # Fail with a clear error instead of an AttributeError/TypeError further
    # down when the user never submitted a url.
    url_temp = (user.get("profile") or {}).get("url")
    if not url_temp:
        raise ValueError(bcolors.FAIL + "User has no profile url" +
                         bcolors.ENDC)

    print(bcolors.OKGREEN + ("parsing through: " + url_temp) + bcolors.ENDC)

    title = get_title(url_temp)
    last_parsed = time.strftime("%c")
    # NOTE(review): the two trailing positionals look like the mongo shell's
    # (upsert, multi); in PyMongo's legacy update() the fourth positional
    # argument is `manipulate`, not `multi` -- confirm which was intended.
    db.users.update({"profile.url": url_temp}, {
        "$set": {
            "profile.title": title,
            "profile.lastParsed": last_parsed
        }
    }, False, True)

    # there will be 2 types of tags,
    # from website, and from pdf
    tags_temp = get_html(url_temp)  # this is all words from the html
    tagsPDF_temp = get_pdf(url_temp)  # this is all words from the pdf

    # words successfully counted from the html; a set keeps the membership
    # test below cheap
    counted = set()
    for keyword in tags_temp:
        try:
            total = increment_word(keyword)
            print(bcolors.OKGREEN +
                  ("Incremented: %s to %d" % (keyword, total)) + bcolors.ENDC)
            counted.add(keyword)
        except Exception as e:
            print(bcolors.FAIL + "Invalid Entry" + bcolors.ENDC)
            print(e)

    for keyword in tagsPDF_temp:
        try:
            if keyword not in counted:
                total = increment_word(keyword)
                print(bcolors.OKGREEN +
                      ("Incremented: %s to %d" % (keyword, total)) +
                      bcolors.ENDC)
        except Exception as e:
            print(bcolors.FAIL + "Invalid Entry" + bcolors.ENDC)
            print(e)

    return True
def _std_count():
    """
    () --> number

    Computes the standard deviation (numpy.std) of the number of
    repetitions recorded for the words in the database, prints it and
    returns it. Entries whose "total" is missing or zero are not included.
    """
    totals = [
        entry.get("total") for entry in database().word_count.find()
        if entry.get("total")
    ]
    deviation = numpy.std(totals)
    print("Standard Deviation: " + str(deviation))
    return deviation
def _insert_word(word):
    """
    (string) --> boolean

    Stores word in word_count with a total of zero and returns True.
    Raises LookupError when an entry for the word already exists.
    """
    word_counts = database().word_count
    # guard: refuse to create a second entry for the same word
    if word_counts.find_one({"word": word}):
        raise LookupError("Word: %s already exists in database" % word)
    word_counts.insert({"word": word, "total": 0})
    return True
def delete_all_keywords():
    '''
    () --> bool

    Empties the keyword data for all users by dropping the keywords_coll
    and word_count collections. Returns True on success; if anything goes
    wrong the error is printed and False is returned.
    '''
    try:
        db = database()
        # drop each collection holding keyword data, in this order
        for collection_name in ("keywords_coll", "word_count"):
            getattr(db, collection_name).drop()
        print("All Entries Deleted")
    except Exception as error:
        print(error)
        return False
    return True
def parse_all_users():
    '''
    () --> bool

    Runs parse_user_site() for every user in the database. A failure while
    parsing one user is printed and the remaining users are still
    processed. Returns True when the loop completes, or False (after
    printing the error) if the users could not be read at all.
    '''
    try:
        every_user = database().users.find()
        for user in every_user:
            try:
                parse_user_site(user.get("_id"))
            except Exception as site_error:
                # report and move on to the next user's site
                print(site_error)
    except Exception as error:
        print(error)
        return False
    return True
def increment_word(word):
    """
    (string) --> integer

    Adds one to the total number of a word unless it doesn't exist in the
    database, in which case it adds it, with an initial count of 1,
    returning an integer of its count.

    word is converted with str() before it is looked up or stored.
    """
    db = database()
    word = str(word)
    # A single $inc with upsert=True both creates a missing entry (total
    # becomes 1) and increments an existing one, atomically on the server.
    # This replaces a find / insert / find / replace sequence that could
    # lose increments when two parsers counted the same word at once.
    db.word_count.update({"word": word}, {"$inc": {"total": 1}}, upsert=True)
    return db.word_count.find_one({"word": word}).get("total")
def delete_user_keywords(user_id):
    '''
    (str) --> bool

    Deletes all the keywords of a user, given an id.

    Every keywords_coll entry whose "user_id" equals the given id is
    removed, a confirmation is printed and True is returned.

    Raises TypeError if user_id is not a string (str or unicode) and
    ValueError if no user has that id.
    '''
    if not isinstance(user_id, (str, unicode)):
        raise TypeError(bcolors.FAIL + "Invalid ID type" + bcolors.ENDC)

    db = database()
    # only delete keywords for ids that belong to an existing user
    if not db.users.find_one({"_id": user_id}):
        raise ValueError(bcolors.FAIL + "Invalid User ID" + bcolors.ENDC)

    db.keywords_coll.delete_many({"user_id": user_id})
    print("User Entries Deleted")
    return True
def find_user_by_id(user_id):
    '''
    (str) --> dict

    Returns a user's info given their id.

    The id is matched against the "_id" field of the users collection and
    the stored document is returned.

    Raises TypeError if user_id is not a string (str or unicode) and
    ValueError if no user has that id.
    '''
    # Validate before opening a database connection. unicode is accepted
    # as well as str, like parse_user_site() and delete_user_keywords() do.
    if not isinstance(user_id, (str, unicode)):
        raise TypeError(
            bcolors.FAIL +
            "Invalid Input. Enter a valid user id as a string to use this function"
            + bcolors.ENDC)

    db = database()
    user = db.users.find_one({"_id": user_id})
    if not user:
        raise ValueError(bcolors.FAIL + "Invalid User ID" + bcolors.ENDC)
    return user
def delete_user_keywords(user_id):
    '''
    (str) --> bool

    Deletes all the keywords of a user, given an id.

    Every keywords_coll entry whose "user_id" equals the given id is
    removed, a confirmation is printed and True is returned.

    Raises TypeError if user_id is not a string (str or unicode) and
    ValueError if no user has that id.
    '''
    if not isinstance(user_id, (str, unicode)):
        raise TypeError(bcolors.FAIL + "Invalid ID type" + bcolors.ENDC)

    db = database()
    # only delete keywords for ids that belong to an existing user
    if not db.users.find_one({"_id": user_id}):
        raise ValueError(bcolors.FAIL + "Invalid User ID" + bcolors.ENDC)

    db.keywords_coll.delete_many({"user_id": user_id})
    print("User Entries Deleted")
    return True
def find_user_by_email(email):
    '''
    (str) --> dict

    Returns a user's info given their email.

    The email is matched against the "emails.address" field of the users
    collection and the first matching document is returned.

    Raises TypeError if email is not a string (str or unicode) and
    ValueError if no user has that email.
    '''
    # Validate before opening a database connection. unicode is accepted
    # as well as str, like parse_user_site() and delete_user_keywords() do.
    if not isinstance(email, (str, unicode)):
        raise TypeError(
            bcolors.FAIL +
            "Invalid Input. Enter a valid email as a string to use this function"
            + bcolors.ENDC)

    db = database()
    user = db.users.find_one({"emails.address": email})
    if not user:
        raise ValueError(bcolors.FAIL + "No user found with that email" +
                         bcolors.ENDC)
    return user
def _find_skills(bodyStr):
    """
    (str) --> array

    Searches bodyStr for the skills stored in the skill_coll collection and
    returns the ones that were found.

    If the word "skill" occurs in bodyStr, only the text from its first
    occurrence onwards is searched; otherwise the whole of bodyStr is.
    Each vocabulary entry has its line breaks removed and is matched in
    lower case; bodyStr itself is searched as given, so callers presumably
    pass text that is already lower-cased.

    The one-letter languages C and R are looked for as " c " and " r "
    and are reported independently of each other, after the vocabulary
    matches.

    Raises TypeError if bodyStr is not a string (str or unicode).
    """
    if not isinstance(bodyStr, (str, unicode)):
        raise TypeError(bcolors.FAIL + "bodyStr must be a string" +
                        bcolors.ENDC)

    db = database()
    # the skills vocabulary: every entry's "skill" list, flattened
    lstSkills = []
    for entry in db.skill_coll.find():
        lstSkills.extend(entry.get("skill") or [])

    # we will first check if the person has "skill" defined; if so only the
    # text after that definition is searched, otherwise the entire body.
    skill_start = bodyStr.find("skill")
    search_text = bodyStr[skill_start:] if skill_start != -1 else bodyStr

    skill_words = []
    for raw_skill in lstSkills:
        # fix the format as usual
        skill = str(raw_skill.replace('\n', "").replace('\r', ""))
        # an empty string is contained in every text, so it is skipped
        if skill and skill.lower() in search_text:
            skill_words.append(skill)

    # this is special case, when person has C and/or R skills.
    # those skills (one worded) can be found by adding spaces
    # eg. "c" -> " c "
    # "GO" is deliberately not handled this way (the original author noted
    # it caused problems).
    if " c " in search_text:
        skill_words.append("C")
    if " r " in search_text:
        skill_words.append("R")

    return skill_words
def find_user_by_name(name):
    '''
    (str) --> list

    Returns a list of users info given a name.

    The search is case-insensitive and substring based. How the words of
    name are matched depends on how many words it contains:
      * one word      - the word appears in the first OR the last name
      * two words     - the first word appears in the first name AND the
                        second word appears in the last name
      * three or more - any one of the words appears in the first or the
                        last name

    Every matching user is printed and appears exactly once in the
    returned list.

    Raises TypeError if name is not a string (str or unicode) and
    ValueError if no user matches.
    '''
    if not isinstance(name, (str, unicode)):
        raise TypeError(
            bcolors.FAIL +
            "Invalid Input. Enter a valid name as a string to use this function"
            + bcolors.ENDC)

    # separates the lower-cased string into an array of words
    words = re.findall(r'\w+', name.lower())

    db = database()
    user_list = []
    for user in db.users.find():
        # A user with no profile, or with a missing first/last name, is
        # compared as an empty name instead of aborting the whole search.
        profile = user.get("profile") or {}
        first_name = (profile.get("firstName") or "").lower()
        last_name = (profile.get("lastName") or "").lower()

        if len(words) == 1:
            # The extracted word is used (not the raw input) so stray
            # whitespace or punctuation around it does not defeat the match.
            matched = words[0] in first_name or words[0] in last_name
        elif len(words) == 2:
            matched = words[0] in first_name and words[1] in last_name
        else:
            # any() yields a single verdict per user, so a user matching
            # several of the words is still listed only once. With no
            # words at all it is False and nobody matches.
            matched = any(word in first_name or word in last_name
                          for word in words)

        if matched:
            user_list.append(user)

    if not user_list:
        raise ValueError(bcolors.FAIL + "Name not found in database" +
                         bcolors.ENDC)

    for user in user_list:
        print(user)
        print("")
    return user_list