def test_cosine_similarity_same(self):
    text1 = "happy birthday"
    text2 = "happy birthday"
    cs = CosineSimilarity.cosine_similarity_of(text1, text2)
    # Compare as formatted strings to sidestep floating-point rounding issues.
    self.assertEqual("%.2f" % cs, "1.00")
def get_recommendations(keywords):
    df = pd.read_csv('richCityData.csv')
    score_dict = {}

    for index, row in df.iterrows():
        score_dict[index] = CosineSimilarity.cosine_similarity_of(row['description'], keywords)

    # Sort cities by score and index.
    sorted_scores = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True)

    counter = 0
    # Create an empty results data frame.
    resultDF = pd.DataFrame(columns=('city', 'popularity', 'description', 'image'))

    # Take the five highest-scored cities.
    for i in sorted_scores:
        print(i[0], i[1])
        resultDF = resultDF.append({'city': df.iloc[i[0]]['city'],
                                    'popularity': df.iloc[i[0]]['popularity'],
                                    'description': df.iloc[i[0]]['description'],
                                    'image': df.iloc[i[0]]['image']}, ignore_index=True)
        counter += 1
        if counter > 4:
            break

    # Convert the data frame to JSON.
    json_result = json.dumps(resultDF.to_dict('records'))
    return json_result
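# Note: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# loop above fails on current pandas. Below is a minimal sketch of an equivalent
# top-5 selection that collects plain dicts and builds the frame once; the CSV
# name and columns come from the snippet above, while the import path of
# CosineSimilarity is an assumption.
import json
import operator

import pandas as pd

from cosine_similarity import CosineSimilarity  # assumed module path


def get_top5_recommendations(keywords):
    df = pd.read_csv('richCityData.csv')

    # Score every city description against the query keywords.
    scores = {index: CosineSimilarity.cosine_similarity_of(row['description'], keywords)
              for index, row in df.iterrows()}
    top5 = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)[:5]

    # Build the result frame in one shot instead of appending row by row.
    records = [{'city': df.iloc[idx]['city'],
                'popularity': df.iloc[idx]['popularity'],
                'description': df.iloc[idx]['description'],
                'image': df.iloc[idx]['image']}
               for idx, _score in top5]
    return json.dumps(pd.DataFrame(records).to_dict('records'))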
def test_cosine_similarity_different(self):
    text1 = "hello sir"
    text2 = "good afternoon"
    cs = CosineSimilarity.cosine_similarity_of(text1, text2)
    # Compare as formatted strings to sidestep floating-point rounding issues.
    self.assertEqual("%.2f" % cs, "0.00")
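# The CosineSimilarity implementation itself is not shown in these snippets. A
# minimal sketch of a term-frequency cosine that satisfies all three tests here:
# identical texts -> 1.00, disjoint texts -> 0.00, and one shared token out of
# three per side -> 1/3 = 0.33 (see test_cosine_similarity_some below).
# Whitespace tokenization is an assumption; the real class may normalize text further.
import math
from collections import Counter


class CosineSimilarity:
    @staticmethod
    def cosine_similarity_of(text1, text2):
        # Term-frequency vectors over whitespace tokens.
        vec1 = Counter(text1.split())
        vec2 = Counter(text2.split())

        # Dot product over the shared vocabulary.
        dot = sum(vec1[t] * vec2[t] for t in set(vec1) & set(vec2))

        norm1 = math.sqrt(sum(c * c for c in vec1.values()))
        norm2 = math.sqrt(sum(c * c for c in vec2.values()))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot / (norm1 * norm2)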
def get_recommendations_include_rating_count_threshold_positive_negative_reviews(keywords):
    df = pd.read_csv('city_data_cleared.csv')
    score_dict = {}

    for index, row in df.iterrows():
        cs_score = CosineSimilarity.cosine_similarity_of(row['description'], keywords)
        rating = row['rating']
        rating_count = row['rating_count']
        positive_review_count = row['positive_review']
        negative_review_count = row['negative_review']

        rating_contribution = RatingExtractor.get_rating_weight_with_count_and_reviews(
            rating, rating_count, positive_review_count, negative_review_count)
        final_score = RecommenderEngine.calculate_final_score(cs_score, rating_contribution)
        score_dict[index] = final_score

    # Sort cities by score and index.
    sorted_scores = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True)

    counter = 0
    # Create an empty results data frame.
    resultDF = pd.DataFrame(columns=('city', 'popularity', 'description', 'score'))

    # Take the five highest-scored cities.
    for i in sorted_scores:
        # Print index and score of the city.
        # print(i[0], i[1])
        resultDF = resultDF.append({'city': df.iloc[i[0]]['city'],
                                    'popularity': df.iloc[i[0]]['popularity'],
                                    'description': df.iloc[i[0]]['description'],
                                    'score': i[1]}, ignore_index=True)
        counter += 1
        if counter > 4:
            break

    # Convert the data frame to JSON.
    json_result = json.dumps(resultDF.to_dict('records'))
    return json_result
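# Neither RecommenderEngine.calculate_final_score nor RatingExtractor is defined
# in these snippets. A plausible sketch, assuming the final score is a
# fixed-weight linear blend of content similarity and the rating contribution;
# the 0.8/0.2 split and the whole body are illustrative, not the project's
# actual formula.
class RecommenderEngine:
    CONTENT_WEIGHT = 0.8  # hypothetical weight for text similarity
    RATING_WEIGHT = 0.2   # hypothetical weight for the rating signal

    @staticmethod
    def calculate_final_score(cs_score, rating_contribution):
        # Both inputs are assumed to be normalized to [0, 1].
        return (RecommenderEngine.CONTENT_WEIGHT * cs_score
                + RecommenderEngine.RATING_WEIGHT * rating_contribution)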
def analyse_with_cosine():
    stats = DocsStats()

    rude_comments = SiteComment.rude_comments()
    rude_docs = list()
    for comment in rude_comments:
        rude_docs.append(Document(stats, comment.id, comment.body, comment.processed_body))

    unverified_comments = SiteComment.comments_for_analysis()
    unverified_docs = list()
    for comment in unverified_comments:
        unverified_docs.append(Document(stats, comment.id, comment.body, comment.processed_body))

    stats.calculate_idfs()
    stats.vectorise_documents()

    cosine = CosineSimilarity(rude_docs)
    rude_cluster = cosine.biggest_cluster()
    for item in rude_cluster:
        print("- ", item.body, "\r\n")
def get_response(self, question, no_responses):
    responses = []
    action = ""
    similarity_before = 0.2  # Minimum similarity required to accept a match.
    cosine_similarity = CosineSimilarity()

    if not self.current_intention:
        # No active intention: search every dialog of every intention.
        for intention in self.intentions:
            for dialog in intention.get_dialogs():
                for defined_question in dialog.get_questions():
                    similarity = cosine_similarity.compare(defined_question, question)
                    if similarity > similarity_before:
                        similarity_before = similarity
                        self.current_intention = intention
                        responses = dialog.get_responses()
                        action = dialog.get_action()
    else:
        # Stay within the current intention's dialogs.
        for dialog in self.current_intention.get_dialogs():
            for defined_question in dialog.get_questions():
                similarity = cosine_similarity.compare(defined_question, question)
                if similarity > similarity_before:
                    similarity_before = similarity
                    responses = dialog.get_responses()
                    action = dialog.get_action()

    if action:
        exec(action.replace("INPUT", "\"" + question + "\""))

    if len(responses) > 0:
        if type(responses) == str:
            return responses
        return responses[randint(0, len(responses) - 1)]
    else:
        if not self.current_intention:
            return no_responses[randint(0, len(no_responses) - 1)]
        else:
            # Reset the stale intention and retry against all intentions.
            self.current_intention = None
            return self.get_response(question, no_responses)
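# Splicing user text into a code string and exec'ing it is an injection risk: a
# question containing a quote character can break out of the string and run
# arbitrary code. A sketch of one safer design, assuming each dialog could store
# an action *name* instead of source code; the registry shape is an assumption,
# not the bot's actual API.
ACTIONS = {
    # Hypothetical named action: user input arrives as an argument, never exec'd.
    "echo": lambda user_input: print(user_input),
}


def run_action(action_name, question):
    # Look up the handler by name and call it with the raw question text.
    handler = ACTIONS.get(action_name)
    if handler is not None:
        handler(question)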
def get_recommendations(resume, jobs_df):
    score_dict = {}
    for index, row in jobs_df.iterrows():
        score_dict[index] = CosineSimilarity.cosine_similarity_of(
            row['description_cleaned'], resume)

    # Sort descriptions by score and index.
    sorted_scores = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True)

    counter = 0
    # Create the results data frame.
    resultDF = pd.DataFrame(columns=['Job Index', 'Company', 'Title', 'Location',
                                     'Description', 'Job Description'])  # , 'score'])

    # Take the ten jobs with the highest similarity scores.
    for i in sorted_scores:
        # Print index & score of the job description.
        resultDF = resultDF.append({'Description': jobs_df.iloc[i[0]]['job_description'],
                                    'Title': jobs_df.iloc[i[0]]['title'],
                                    'Company': jobs_df.iloc[i[0]]['company_name'],
                                    'Location': jobs_df.iloc[i[0]]['location'],
                                    'Job Index': jobs_df.iloc[i[0]]['Unnamed: 0']},
                                   ignore_index=True)  # 'score': i[1]}, ignore_index=True)
        counter += 1
        if counter >= 10:  # Stop once ten jobs have been collected.
            break

    resultDF.fillna('', inplace=True)
    return resultDF
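# For corpus-level matching like this, a common alternative is to vectorize all
# job descriptions once with scikit-learn's TfidfVectorizer and score the resume
# against the whole matrix in a single call, instead of recomputing a pairwise
# similarity per row. This is a different technique from the per-pair
# CosineSimilarity helper above; the column name description_cleaned is carried
# over from that snippet.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def top_matches(resume, jobs_df, k=10):
    # Fit TF-IDF on the job descriptions, then project the resume into the
    # same vector space.
    vectorizer = TfidfVectorizer(stop_words='english')
    job_matrix = vectorizer.fit_transform(jobs_df['description_cleaned'])
    resume_vec = vectorizer.transform([resume])

    # One matrix product scores every job at once.
    scores = cosine_similarity(resume_vec, job_matrix).ravel()
    top_idx = scores.argsort()[::-1][:k]
    return jobs_df.iloc[top_idx].assign(score=scores[top_idx])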
def get_rating_recommendations(keywords):
    df = pd.read_csv('ratingRichCityData.csv')
    score_dict = {}

    for index, row in df.iterrows():
        cs = CosineSimilarity.cosine_similarity_of(row['description'], keywords)
        rating = row['rating']
        rating_count = row['rating_count']
        positive_review_count = row['positive_review']
        negative_review_count = row['negative_review']

        rat_value = RatingExtractor.get_rating_with_count_and_reviews(
            rating, rating_count, positive_review_count, negative_review_count)
        score = RecommenderEngine.calculate_score_from(cs, rat_value)
        score_dict[index] = score

    sorted_scores = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True)

    counter = 0
    resultDF = pd.DataFrame(columns=('city', 'popularity', 'description', 'image'))

    # Take the five highest-scored cities.
    for i in sorted_scores:
        print(i[0], i[1])
        resultDF = resultDF.append({'city': df.iloc[i[0]]['city'],
                                    'popularity': df.iloc[i[0]]['popularity'],
                                    'description': df.iloc[i[0]]['description'],
                                    'image': df.iloc[i[0]]['image']}, ignore_index=True)
        counter += 1
        if counter > 4:
            break

    # Convert the data frame to JSON.
    json_result = json.dumps(resultDF.to_dict('records'))
    return json_result
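# RatingExtractor.get_rating_with_count_and_reviews is not shown either. One
# plausible sketch, assuming it damps ratings backed by few votes toward a
# global prior (an IMDb-style Bayesian average) and nudges the result by net
# review sentiment; every constant below is illustrative, not a project value.
class RatingExtractor:
    @staticmethod
    def get_rating_with_count_and_reviews(rating, rating_count,
                                          positive_reviews, negative_reviews):
        prior_mean = 3.0   # assumed global average rating on a 5-star scale
        prior_count = 50   # assumed vote count at which ratings become trustworthy

        # Bayesian average: cities with few ratings are pulled toward the prior.
        damped = ((rating_count * rating + prior_count * prior_mean)
                  / (rating_count + prior_count))

        # Net review sentiment in [-1, 1].
        total = positive_reviews + negative_reviews
        sentiment = (positive_reviews - negative_reviews) / total if total else 0.0

        # Normalize to [0, 1] (assuming a 5-star scale) plus a small sentiment nudge.
        return max(0.0, min(1.0, damped / 5.0 + 0.1 * sentiment))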
from collections import defaultdict
import time

from flask import Flask, render_template, request

from Corpus import NGramGenerator
from cosine_similarity import CosineSimilarity

app = Flask(__name__)

myGenerator = NGramGenerator()
myGenerator.generateIndex("cleaned_files")

# for k, v in myGenerator.one_gram_corpus.items():
#     print k
#     for ik, iv in v.docTermFreqDict.items():
#         print " %d => %d" % (ik, iv)

cs = CosineSimilarity()
cs.createMatix(myGenerator.one_gram_corpus, myGenerator.bi_gram_index,
               myGenerator.tri_gram_index)


@app.route("/")
def main():
    return render_template('index.html')


@app.route('/search', methods=['POST'])
def search():
    resultArray = []
    now = time.time()

    query_word_and_tf = defaultdict(int)
from collections import defaultdict

from Generator import NGramGenerator
from QueryListGenerator import QueryProcessor
from cosine_similarity import CosineSimilarity
from evaluation import Effectiveness

myGenerator = NGramGenerator()
myGenerator.generate_cleaned_files(
    "/Users/ashishbulchandani/PycharmProjects/final-project/cacm",
    "/Users/ashishbulchandani/PycharmProjects/final-project/cleaned_files")
myGenerator.generateUnigramCorpus(
    "/Users/ashishbulchandani/PycharmProjects/final-project/cleaned_files")

comparer = CosineSimilarity(myGenerator.one_gram_corpus, myGenerator.total_docs,
                            "/task1_cosine_similarity_run.txt")

queryProcessor = QueryProcessor()
querie_dict = queryProcessor.get_query_list(
    '/Users/ashishbulchandani/PycharmProjects/final-project/cacm.query')

# Input: parse the query and generate a weight for each term.
query_word_and_tf = defaultdict(int)

evaluator = Effectiveness()
evaluator.setFilePaths(
    "/Users/ashishbulchandani/PycharmProjects/final-project/run_task1/evalution_cosine_similarity/Map.txt",
    "/Users/ashishbulchandani/PycharmProjects/final-project/run_task1/evalution_cosine_similarity/Mrr.txt",
    "/Users/ashishbulchandani/PycharmProjects/final-project/run_task1/evalution_cosine_similarity/p_at_k.txt",
    "/Users/ashishbulchandani/PycharmProjects/final-project/run_task1/evalution_cosine_similarity/table_precision_recal.txt",
    "/Users/ashishbulchandani/PycharmProjects/final-project/cacm.rel.txt")
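# The Effectiveness class is not shown; judging by the output files it writes
# MAP, MRR, P@K, and precision/recall tables. A minimal sketch of the standard
# definitions of those metrics over one ranked result list; the function names
# are illustrative, not the class's real methods.
def precision_at_k(ranked_ids, relevant_ids, k):
    # Fraction of the top-k results that are relevant.
    return sum(1 for d in ranked_ids[:k] if d in relevant_ids) / k


def reciprocal_rank(ranked_ids, relevant_ids):
    # 1 / rank of the first relevant result; 0 if none is retrieved.
    for rank, d in enumerate(ranked_ids, start=1):
        if d in relevant_ids:
            return 1.0 / rank
    return 0.0


def average_precision(ranked_ids, relevant_ids):
    # Mean of precision@k over the ranks where a relevant document appears.
    hits, total = 0, 0.0
    for rank, d in enumerate(ranked_ids, start=1):
        if d in relevant_ids:
            hits += 1
            total += hits / rank
    return total / len(relevant_ids) if relevant_ids else 0.0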
        # print(data[i], x, label[i])
        if x != label[i]:
            errCount += 1
            print(data[i], x, label[i])
    return (errCount / len(bagOfWords)) * 100


def classification(test_data, test_bagOfWords, original_data, original_labels,
                   original_bagOfWords, k=3):
    """kNN model-based classifier for the test (actual) data."""
    for i in range(len(test_bagOfWords)):
        x = classify(np.array(test_bagOfWords[i]), np.array(original_bagOfWords),
                     original_labels, k)
        print(test_data[i], x)


if __name__ == '__main__':
    c = CosineSimilarity()
    t = TextProcessing()
    f = FileProcessor('experience_classification', 'train')
    data, label = f.cleanFile()

    feature_in_category = t.get_feature_in_category(data, label)
    local_neighbour = t.get_local_neighbours(feature_in_category)
    global_neighbour = t.get_global_neighbours(feature_in_category)
    global_words = [global_neighbour[i][0] for i in global_neighbour]
    sorted_local_neighbours = t.get_local_neighbours_sorted(feature_in_category)
    # for i in sorted_local_neighbours:
    #     print(sorted_local_neighbours[i])

    # Training data: 4-6%
    revised_data = model_construction(data, global_words, sorted_local_neighbours)
    # for i in revised_data:
    #     print(i)
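# classify is called above but never defined in this snippet. A minimal sketch
# of a cosine-similarity kNN classifier matching the call signature
# classify(in_vec, data_matrix, labels, k); majority-vote tie-breaking is an
# assumption.
from collections import Counter

import numpy as np


def classify(in_vec, data_matrix, labels, k=3):
    # Cosine similarity between the input vector and every training row.
    norms = np.linalg.norm(data_matrix, axis=1) * np.linalg.norm(in_vec)
    norms[norms == 0] = 1e-12  # guard against all-zero bag-of-words rows
    sims = data_matrix @ in_vec / norms

    # Majority vote among the k most similar training examples.
    top_k = np.argsort(sims)[::-1][:k]
    votes = Counter(labels[i] for i in top_k)
    return votes.most_common(1)[0][0]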
def test_recommender():
    """ """
    pass


def recommend():
    """ """
    pass


if __name__ == '__main__':
    t = TextProcessing()
    c = CosineSimilarity()
    byte_file_path = '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_people_description'
    byte_file = FileProcessor(byte_file_path, 'train')
    items = byte_file.readByteFile()
    items = [item for item in items if item != '\n' and item is not None]

    # big_list = []  # education + experience list
    # for i in items:
    #     print(i)
    #     edu = get_education(i)
    #     exp = get_experience(i)
    #     big_list.append(edu)
    #     big_list.append(exp)
    #
    # vocabSet = c.vocabSet(big_list)
    # wordVectors = [c.bag_of_words(vocabSet, i) for i in big_list]
def test_cosine_similarity_some(self):
    text1 = "apple banana orange"
    text2 = "orange berry ananas"
    cs = CosineSimilarity.cosine_similarity_of(text1, text2)
    # One shared token ("orange") out of three per side: cos = 1/3.
    self.assertEqual("%.2f" % cs, "0.33")
    names, count = [], []
    for name, cnt in data_count.items():
        # Only chart entries that occur more than once.
        if cnt > 1:
            names.append(name)
            count.append(cnt)

    fig1, ax1 = plt.subplots()
    ax1.pie(count, labels=names, autopct='%1.1f%%', shadow=True, startangle=90)
    ax1.axis('equal')  # Equal aspect ratio ensures the pie is drawn as a circle.
    # plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    original_data = read_byteFile(
        '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_people_description')
    original_data = [i for i in original_data if i != '\n' and i is not None]

    edu, degree = education_data(original_data)
    edu_dict = count(edu)
    pie_graph(edu_dict)

    c = CosineSimilarity()
    # experience, company = experience_data(original_data, 'experience_unclassified')
    # classification(experience, 'experience_classification', 'experience_unclassified')

    ######################################################
    # TODO: move all of this file handling into the linkedin module
    ######################################################
    # write_to_file('experience_unclassified', experience)
def __init__(self, measure, measure_const=MeasureConstants()):
    self.measure = None
    if measure == "cosine":
        self.measure = CosineSimilarity()
    elif measure == "bag_of_words":
        self.measure = BOWSimilarity()
from collections import defaultdict

myGenerator = NGramGenerator()
myGenerator.generate_stopped_cleaned_files(
    "/Users/ashishbulchandani/PycharmProjects/SE_Web/wiki_webpages",
    "/Users/ashishbulchandani/PycharmProjects/SE_Web/cleaned_files",
    "/Users/ashishbulchandani/PycharmProjects/SE_Web/common_words.txt")
myGenerator.generateIndex(
    "/Users/ashishbulchandani/PycharmProjects/SE_Web/cleaned_files")

# for k, v in myGenerator.one_gram_corpus.items():
#     print k
#     for ik, iv in v.docTermFreqDict.items():
#         print " %d => %d" % (ik, iv)

cs = CosineSimilarity()
cs.createMatix(myGenerator.one_gram_corpus, myGenerator.bi_gram_index,
               myGenerator.tri_gram_index)

# Input: parse the query and generate a weight for each term.
query_word_and_tf = defaultdict(int)
query_number = 1
var = ""


def getFormatedDockey(docKey):
    # Pad the document key with spaces so output columns line up.
    space = " "
    for i in range(60 - len(docKey)):
        space += " "
    docKey += space
    return docKey