Example #1
    def test_cosine_similarity_same(self):
        text1 = "happy birthday"
        text2 = "happy birthday"
        cs = CosineSimilarity.cosine_similarity_of(text1, text2)

        # Compare formatted strings to avoid floating-point rounding issues.
        self.assertEqual("%.2f" % cs, "1.00")
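`CosineSimilarity.cosine_similarity_of` itself is not shown in these examples. A minimal standalone sketch of what it plausibly computes, assuming a simple term-frequency bag-of-words model (the project's real implementation may differ):

import math
from collections import Counter

def cosine_similarity_of(text1, text2):
    # Term-frequency vectors over whitespace-split, lowercased tokens.
    a, b = Counter(text1.lower().split()), Counter(text2.lower().split())
    # Dot product over the shared vocabulary.
    dot = sum(a[t] * b[t] for t in a.keys() & b.keys())
    # Product of the two vector magnitudes.
    norm = math.sqrt(sum(v * v for v in a.values()))
    norm *= math.sqrt(sum(v * v for v in b.values()))
    return dot / norm if norm else 0.0

This reproduces the expected values in the tests shown here: identical texts give 1.00, disjoint texts 0.00.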
Example #2
    def get_recommendations(keywords):
        df = pd.read_csv('richCityData.csv')

        score_dict = {}

        for index, row in df.iterrows():
            score_dict[index] = CosineSimilarity.cosine_similarity_of(row['description'], keywords)

        #sort cities by score and index.
        sorted_scores = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True)

        counter = 0

        #create an empty results data frame.
        resultDF = pd.DataFrame(columns=('city', 'popularity', 'description', 'image'))

        #get highest scored 5 cities.
        for i in sorted_scores:
            print(i[0], i[1])
            resultDF = resultDF.append(
                {
                    'city': df.iloc[i[0]]['city'],
                    'popularity': df.iloc[i[0]]['popularity'],
                    'description': df.iloc[i[0]]['description'],
                    'image': df.iloc[i[0]]['image']
                },
                ignore_index=True)
            counter += 1

            if counter > 4:
                break

        #convert DF to json.
        json_result = json.dumps(resultDF.to_dict('records'))
        return json_result
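Note: `DataFrame.append`, used throughout these examples, was deprecated in pandas 1.4 and removed in pandas 2.0. On current pandas, the same top-5 selection can collect plain dicts and build the frame once, e.g.:

rows = []
for index, score in sorted_scores[:5]:
    row = df.iloc[index]
    rows.append({'city': row['city'], 'popularity': row['popularity'],
                 'description': row['description'], 'image': row['image']})
resultDF = pd.DataFrame(rows)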
Example #3
    def test_cosine_similarity_different(self):
        text1 = "hello sir"
        text2 = "good afternoon"
        cs = CosineSimilarity.cosine_similarity_of(text1, text2)

        # Compare formatted strings to avoid floating-point rounding issues.
        self.assertEqual("%.2f" % cs, "0.00")
Example #4
    def get_recommendations_include_rating_count_threshold_positive_negative_reviews(
            keywords):
        df = pd.read_csv('city_data_cleared.csv')

        score_dict = {}

        for index, row in df.iterrows():
            cs_score = CosineSimilarity.cosine_similarity_of(
                row['description'], keywords)

            rating = row['rating']
            rating_count = row['rating_count']
            positive_review_count = row['positive_review']
            negative_review_count = row['negative_review']
            rating_contribution = RatingExtractor.get_rating_weight_with_count_and_reviews(
                rating, rating_count, positive_review_count,
                negative_review_count)

            final_score = RecommenderEngine.calculate_final_score(
                cs_score, rating_contribution)

            score_dict[index] = final_score

        #sort cities by score and index.
        sorted_scores = sorted(score_dict.items(),
                               key=operator.itemgetter(1),
                               reverse=True)

        counter = 0

        #create an empty results data frame.
        resultDF = pd.DataFrame(columns=('city', 'popularity', 'description',
                                         'score'))

        #get highest scored 5 cities.
        for i in sorted_scores:
            #print index and score of the city.
            #print(i[0], i[1])
            resultDF = resultDF.append(
                {
                    'city': df.iloc[i[0]]['city'],
                    'popularity': df.iloc[i[0]]['popularity'],
                    'description': df.iloc[i[0]]['description'],
                    'score': i[1]
                },
                ignore_index=True)
            counter += 1

            if counter > 4:
                break

        #convert DF to json.
        json_result = json.dumps(resultDF.to_dict('records'))
        return json_result
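`RecommenderEngine.calculate_final_score` is not shown in these examples. A plausible minimal sketch, assuming a simple weighted blend of content match and rating signal (the weight is illustrative, not the project's actual value):

def calculate_final_score(cs_score, rating_contribution, content_weight=0.8):
    # Hypothetical blend: the real RecommenderEngine may combine these
    # signals differently.
    return content_weight * cs_score + (1 - content_weight) * rating_contribution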
Example #5
def analyse_with_cosine():
    stats = DocsStats()

    # Wrap the known-rude comments as Document objects.
    rude_comments = SiteComment.rude_comments()
    rude_docs = list()
    for comment in rude_comments:
        rude_docs.append(
            Document(stats, comment.id, comment.body, comment.processed_body))

    # Do the same for comments still awaiting review.
    unverified_comments = SiteComment.comments_for_analysis()
    unverified_docs = list()
    for comment in unverified_comments:
        unverified_docs.append(
            Document(stats, comment.id, comment.body, comment.processed_body))

    # Compute IDF weights and vectorise all registered documents.
    stats.calculate_idfs()
    stats.vectorise_documents()

    # Cluster the rude comments and print the largest cluster found.
    cosine = CosineSimilarity(rude_docs)
    rude_cluster = cosine.biggest_cluster()
    for item in rude_cluster:
        print("- ", item.body, "\r\n")
Example #6
    def get_response(self, question, no_responses):

        responses = []
        action = ""
        # Minimum similarity a dialog question must beat to count as a match.
        similarity_before = 0.2

        cosine_similarity = CosineSimilarity()
        
        if not self.current_intention:
            for intention in self.intentions:
                for dialog in intention.get_dialogs():
                    for defined_question in dialog.get_questions():
                        similarity = cosine_similarity.compare(defined_question, question)
                        if similarity > similarity_before:
                            similarity_before = similarity
                            self.current_intention = intention
                            responses = dialog.get_responses()
                            action = dialog.get_action()
        else:
            for dialog in self.current_intention.get_dialogs():
                for defined_question in dialog.get_questions():
                    similarity = cosine_similarity.compare(defined_question, question)
                    if similarity > similarity_before:
                        similarity_before = similarity
                        responses = dialog.get_responses()
                        action = dialog.get_action()
                    
        if action:
            exec(action.replace("INPUT", "\"" + question + "\""))
        if len(responses) > 0:
            if isinstance(responses, str):
                return responses
            return responses[randint(0, len(responses) - 1)]
        else:
            if not self.current_intention:
                return no_responses[randint(0, len(no_responses) - 1)]
            else:
                # No dialog in the current intention matched; reset it and
                # retry the question against all intentions.
                self.current_intention = None
                return self.get_response(question, no_responses)
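Side note: building `action` by string replacement and running it through `exec` breaks if the question contains a double quote, and it executes whatever text the dialog definitions hold. A safer pattern is to register actions as callables and dispatch by name; a minimal sketch with hypothetical names:

# Hypothetical registry mapping action names to handler functions.
ACTION_HANDLERS = {
    "log_question": lambda question: print("question received:", question),
}

def run_action(action_name, question):
    # Look up the handler instead of exec'ing a code string.
    handler = ACTION_HANDLERS.get(action_name)
    if handler is not None:
        handler(question)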
Example #7
def get_recommendations(resume, jobs_df):
    score_dict = {}

    for index, row in jobs_df.iterrows():
        score_dict[index] = CosineSimilarity.cosine_similarity_of(
            row['description_cleaned'], resume)

    # Sort descriptions by score and index
    sorted_scores = sorted(score_dict.items(),
                           key=operator.itemgetter(1),
                           reverse=True)
    counter = 0

    # Create results data frame
    resultDF = pd.DataFrame(columns=[
        'Job Index', 'Company', 'Title', 'Location', 'Description',
        'Job Description'
    ])  # , 'score'])

    # Get the 10 jobs with the highest similarity scores
    for i in sorted_scores:
        # print index & score of the job description
        resultDF = resultDF.append(
            {
                'Description': jobs_df.iloc[i[0]]['job_description'],
                'Title': jobs_df.iloc[i[0]]['title'],
                'Company': jobs_df.iloc[i[0]]['company_name'],
                'Location': jobs_df.iloc[i[0]]['location'],
                'Job Index': jobs_df.iloc[i[0]]['Unnamed: 0']
            },
            ignore_index=True)
        # 'score': i[1]}, ignore_index=True)
        counter += 1

        if counter > 9:
            break

    # Replace NaNs and return the results frame itself (not JSON, unlike
    # the other examples).
    resultDF.fillna('', inplace=True)
    return resultDF
Example #8
    def get_rating_recommendations(keywords):
        df = pd.read_csv('ratingRichCityData.csv')

        score_dict = {}

        for index, row in df.iterrows():
            cs = CosineSimilarity.cosine_similarity_of(row['description'], keywords)
            rating = row['rating']
            rating_count = row['rating_count']
            positive_review_count = row['positive_review']
            negative_review_count = row['negative_review']

            rat_value = RatingExtractor.get_rating_with_count_and_reviews(
                rating, rating_count, positive_review_count,
                negative_review_count)

            score = RecommenderEngine.calculate_score_from(cs,rat_value)

            score_dict[index] = score

        sorted_scores = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True)

        counter = 0

        resultDF = pd.DataFrame(columns=('city', 'popularity', 'description', 'image'))

        #get highest scored 5 cities.
        for i in sorted_scores:
            print(i[0], i[1])
            resultDF = resultDF.append(
                {
                    'city': df.iloc[i[0]]['city'],
                    'popularity': df.iloc[i[0]]['popularity'],
                    'description': df.iloc[i[0]]['description'],
                    'image': df.iloc[i[0]]['image']
                },
                ignore_index=True)
            counter += 1

            if counter > 4:
                break

        #convert DF to json.
        json_result = json.dumps(resultDF.to_dict('records'))
        return json_result
Example #9
import time
from collections import defaultdict

from flask import Flask, render_template, request
from Corpus import NGramGenerator
from cosine_similarity import CosineSimilarity

app = Flask(__name__)

myGenerator = NGramGenerator()
myGenerator.generateIndex("cleaned_files")

# for k,v in myGenerator.one_gram_corpus.items():
#     print k
#     for ik, iv in v.docTermFreqDict.items():
#         print " %d => %d"%(ik,iv)

cs = CosineSimilarity()

cs.createMatix(myGenerator.one_gram_corpus, myGenerator.bi_gram_index,
               myGenerator.tri_gram_index)


@app.route("/")
def main():
    return render_template('index.html')


@app.route('/search', methods=['POST'])
def search():
    resultArray = []
    now = time.time()
    query_word_and_tf = defaultdict(int)
Example #10
from collections import defaultdict

from Generator import NGramGenerator
from QueryListGenerator import QueryProcessor
from cosine_similarity import CosineSimilarity
from evaluation import Effectiveness

myGenerator = NGramGenerator()

myGenerator.generate_cleaned_files(
    "/Users/ashishbulchandani/PycharmProjects/final-project/cacm",
    "/Users/ashishbulchandani/PycharmProjects/final-project/cleaned_files")
myGenerator.generateUnigramCorpus(
    "/Users/ashishbulchandani/PycharmProjects/final-project/cleaned_files")

comparer = CosineSimilarity(myGenerator.one_gram_corpus,
                            myGenerator.total_docs,
                            "/task1_cosine_similarity_run.txt")
queryProcessor = QueryProcessor()
querie_dict = queryProcessor.get_query_list(
    '/Users/ashishbulchandani/PycharmProjects/final-project/cacm.query')

# Input, parse the query and generate weight for each term
query_word_and_tf = defaultdict(int)

# Named "evaluator" to avoid shadowing the built-in eval.
evaluator = Effectiveness()
evaluator.setFilePaths(
    "/Users/ashishbulchandani/PycharmProjects/final-project/run_task1/evalution_cosine_similarity/Map.txt",
    "/Users/ashishbulchandani/PycharmProjects/final-project/run_task1/evalution_cosine_similarity/Mrr.txt",
    "/Users/ashishbulchandani/PycharmProjects/final-project/run_task1/evalution_cosine_similarity/p_at_k.txt",
    "/Users/ashishbulchandani/PycharmProjects/final-project/run_task1/evalution_cosine_similarity/table_precision_recal.txt",
    "/Users/ashishbulchandani/PycharmProjects/final-project/cacm.rel.txt")
Example #11
        # print(data[i], x, label[i])
        if x != label[i]:
            errCount += 1
            print(data[i], x, label[i])
    return (errCount / len(bagOfWords)) * 100

def classification(test_data, test_bagOfWords, original_data, original_labels, original_bagOfWords, k=3):
    """
    kNN Model Based Classifier for test data (actual data)
    """
    for i in range(len(test_bagOfWords)):
        x = classify(np.array(test_bagOfWords[i]), np.array(original_bagOfWords), original_labels, k)
        print(test_data[i], x)

if __name__ == '__main__':
    c = CosineSimilarity()
    t = TextProcessing()
    f = FileProcessor('experience_classification', 'train')
    data, label = f.cleanFile()
    feature_in_category = t.get_feature_in_category(data, label)
    local_neighbour = t.get_local_neighbours(feature_in_category)
    global_neighbour = t.get_global_neighbours(feature_in_category)
    global_words = [global_neighbour[i][0] for i in global_neighbour]
    sorted_local_neighbours = t.get_local_neighbours_sorted(feature_in_category)
    # for i in sorted_local_neighbours:
    #     print(sorted_local_neighbours[i])

    ## Training Data: 4-6%
    revised_data = model_construction(data, global_words, sorted_local_neighbours)
    # for i in revised_data:
    #     print(i)
Example #12
def test_recommender():
    """Placeholder for recommender tests (not yet implemented)."""
    pass


def recommend():
    """Placeholder for the recommend entry point (not yet implemented)."""
    pass


if __name__ == '__main__':
    t = TextProcessing()
    c = CosineSimilarity()
    byte_file_path = '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_people_description'
    byte_file = FileProcessor(byte_file_path, 'train')
    items = byte_file.readByteFile()
    items = [item for item in items if item != '\n' and item is not None]

    # big_list = [] # education + experience list
    # for i in items:
    #     print(i)
    #     edu = get_education(i)
    #     exp = get_experience(i)
    #     big_list.append(edu)
    #     big_list.append(exp)
    #
    # vocabSet = c.vocabSet(big_list)
    # wordVectors = [c.bag_of_words(vocabSet, i) for i in big_list]
Example #13
    def test_cosine_similarity_some(self):
        text1 = "apple banana orange"
        text2 = "orange berry ananas"
        cs = CosineSimilarity.cosine_similarity_of(text1, text2)

        self.assertEqual("%.2f" % cs, "0.33")
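The expected value follows directly from a term-frequency bag-of-words model (see the sketch under Example #1): the texts share exactly one token ("orange") and each has three distinct tokens, so cos = 1 / (sqrt(3) * sqrt(3)) = 1/3, which is 0.33 to two places. A quick check:

import math

dot = 1                              # one shared token: "orange"
norm = math.sqrt(3) * math.sqrt(3)   # each text has three unit-count tokens
print("%.2f" % (dot / norm))         # -> 0.33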
Example #14
    names, count = [], []
    # data_count maps each name to its occurrence count; keep repeats only.
    for name, cnt in data_count.items():
        if cnt > 1:
            names.append(name)
            count.append(cnt)

    fig1, ax1 = plt.subplots()
    ax1.pie(count, labels=names, autopct='%1.1f%%', shadow=True, startangle=90)
    ax1.axis('equal')  # Equal aspect ratio ensures the pie is drawn as a circle.
    # plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    original_data = read_byteFile(
        '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_people_description'
    )
    original_data = [i for i in original_data if i != '\n' and i is not None]
    edu, degree = education_data(original_data)
    edu_dict = count(edu)
    pie_graph(edu_dict)
    c = CosineSimilarity()
    # experience, company = experience_data(original_data, 'experience_unclassified')
    # classification(experience, 'experience_classification', 'experience_unclassified')

    ######################################################
    # TODO: move all of this file handling into the linkedin module.
    ######################################################
    # write_to_file('experience_unclassified', experience)
Example #15
    def __init__(self, measure, measure_const=MeasureConstants()):
        # Note: the default MeasureConstants() is created once, at definition
        # time, and shared across calls (standard Python default-arg caveat).
        self.measure = None
        if measure == "cosine":
            self.measure = CosineSimilarity()
        elif measure == "bag_of_words":
            self.measure = BOWSimilarity()
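A usage sketch for this selector; the enclosing class is not shown in the snippet, so the name `SimilarityMeasure` here is hypothetical:

# Hypothetical class name; only __init__ appears above.
selector = SimilarityMeasure("cosine")
assert isinstance(selector.measure, CosineSimilarity)

# An unrecognised measure name leaves self.measure as None,
# so callers should check it before use.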
Example #16
myGenerator = NGramGenerator()

myGenerator.generate_stopped_cleaned_files(
    "/Users/ashishbulchandani/PycharmProjects/SE_Web/wiki_webpages",
    "/Users/ashishbulchandani/PycharmProjects/SE_Web/cleaned_files",
    "/Users/ashishbulchandani/PycharmProjects/SE_Web/common_words.txt")

myGenerator.generateIndex(
    "/Users/ashishbulchandani/PycharmProjects/SE_Web/cleaned_files")

# for k,v in myGenerator.one_gram_corpus.items():
#     print k
#     for ik, iv in v.docTermFreqDict.items():
#         print " %d => %d"%(ik,iv)

cs = CosineSimilarity()

cs.createMatix(myGenerator.one_gram_corpus, myGenerator.bi_gram_index,
               myGenerator.tri_gram_index)
# Input, parse the query and generate weight for each term
query_word_and_tf = defaultdict(int)
query_number = 1
var = ""


def getFormatedDockey(docKey):
    # Right-pad the key with spaces to a fixed width so output columns align.
    return docKey.ljust(61)