def rating_prompt(original_data, filename='linkedin_rating_backend'):
    """
    Interactively collect education/experience ratings for each parsed
    profile dictionary and persist the augmented dictionaries.

    (Couldn't use the original rating prompt because dealing with dictionaries)
    (Really only used once in the beginning...)

    Parameters:
        -original_data: list of parsed and clean dictionaries
        -filename: the file where the dictionaries are located

    Each dictionary gains a 'rating' key of the form [edu, exp, total].
    """
    rated_items = []
    for item in original_data:
        print(item)
        edu = int(
            input('Enter a rating of education (based on title, 1 to 5): '))
        exp = int(
            input('Enter a rating of experience (based on title, 1 to 5): '))
        item['rating'] = [edu, exp, edu + exp]
        rated_items.append(item)  # dictionaries now carry a rating

    # Wipe the backing file, then re-write every rated dictionary.
    processor = FileProcessor(filename, 'train')
    processor.eraseFile()
    with open(filename, 'ab') as out:
        for item in rated_items:
            pickle.dump(item, out)
            pickle.dump('\n', out)  # record separator expected by the readers

    print('Done!')
def get_ratings(filename):
    """
    Return a list of ratings stored in a rated-items byte file.

    Parameters:
        -filename: name of file or path where the rated items are stored

    Returns:
        list: the 'rating' entry ([edu, exp, total]) of each stored
        dictionary; None for dictionaries lacking a 'rating' key.
    """
    f = FileProcessor(filename, 'train')
    lines = f.readByteFile()
    # Drop the '\n' separator records and None placeholders; compare to
    # None with identity per PEP 8.
    records = [line for line in lines if line != '\n' and line is not None]
    return [record.get('rating') for record in records]
def description_duplicate_url_checker(url):
    """
    Check whether *url* already appears in the linkedin_people_description
    byte file.

    Parameters:
        -url: profile url string to look for

    Returns:
        bool: True if the url equals the last 'header' entry of any stored
        description dictionary, False otherwise.
    """
    f = FileProcessor(
        '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_people_description',
        'train')
    items = f.readByteFile()
    # Skip separator records and None placeholders ('is not None' per PEP 8).
    records = [item for item in items if item != '\n' and item is not None]
    # The stored url is the last element of each record's 'header' list;
    # any() short-circuits exactly like the original early-return loop.
    return any(record.get('header')[-1] == url for record in records)
def get_more_urls(org_file, dest_file, num=0):
    """
    Transfers all urls from the raw_url file (same as the js file) to a new
    file with 10+ viewed people, deleting the transferred urls from the
    original file.
    Parameters: -org_file: initial raw url file
                -dest_file: new file the urls should be transferred to
                -num: default 0 -> take all the urls in the original raw file
    """

    # Urls already present in the destination file (txt comparison).
    li = duplicate(
        '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_raw_url',
        '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_dest_url',
        'txt')

    f = FileProcessor(org_file, 'train')
    lines = f.readFile()
    # Keep only urls not already transferred, with trailing newlines removed.
    lines = [line.replace('\n', '') for line in lines if line not in li]
    f.eraseFile()
    # NOTE(review): the function appears truncated here -- 'lines' is built
    # and the source file erased, but nothing is written to dest_file and
    # 'num' is never used. Confirm against the full original source.
Beispiel #5
0
            errCount += 1
            print(data[i], x, label[i])
    return (errCount / len(bagOfWords)) * 100

def classification(test_data, test_bagOfWords, original_data, original_labels, original_bagOfWords, k=3):
    """
    kNN model-based classifier for the test data (actual data).

    For every test bag-of-words vector, predicts a label via the
    k-nearest-neighbour classifier built from the original data and
    prints the test item next to its prediction.
    """
    for idx, word_vector in enumerate(test_bagOfWords):
        prediction = classify(np.array(word_vector), np.array(original_bagOfWords), original_labels, k)
        print(test_data[idx], prediction)

if __name__ == '__main__':
    # Build the pipeline: cosine-similarity helper, text utilities, and the
    # labelled experience-classification training file.
    c = CosineSimilarity()
    t = TextProcessing()
    f = FileProcessor('experience_classification', 'train')
    data, label = f.cleanFile()
    # Per-category feature statistics and neighbour structures used to
    # revise the raw data before vectorisation.
    feature_in_category = t.get_feature_in_category(data, label)
    # NOTE(review): local_neighbour is computed but never used below.
    local_neighbour = t.get_local_neighbours(feature_in_category)
    global_neighbour = t.get_global_neighbours(feature_in_category)
    global_words = [global_neighbour[i][0] for i in global_neighbour]
    sorted_local_neighbours = t.get_local_neighbours_sorted(feature_in_category)
    # for i in sorted_local_neighbours:
    #     print(sorted_local_neighbours[i])

    ## Training Data: 4-6%
    revised_data = model_construction(data, global_words, sorted_local_neighbours)
    # for i in revised_data:
    #     print(i)
    # Vectorise the revised data into bag-of-words form.
    vocabSet = c.vocabSet(revised_data)
    bagOfWords = [c.bag_of_words(vocabSet, i) for i in revised_data]
def duplicate(filename1, filename2, type_file):
    """
    Return the items of *filename2* that also appear in *filename1*.

    Parameters:
        -filename1: reference file
        -filename2: file whose items are checked against filename1
        -type_file: 'byte' for pickled files, 'txt' for plain text files

    Returns:
        list: items of filename2 found in filename1, or the string
        'Only byte or txt' for an unrecognised type_file (kept as a
        string return for backward compatibility with existing callers).
    """
    # Guard clause replaces the old duplicated if/elif branches and the
    # unreachable code that followed the returns.
    if type_file not in ('byte', 'txt'):
        return 'Only byte or txt'

    # Both branches differed only in the reader method used.
    reader = 'readByteFile' if type_file == 'byte' else 'readFile'
    items_file1 = getattr(FileProcessor(filename1, 'train'), reader)()
    items_file2 = getattr(FileProcessor(filename2, 'train'), reader)()
    # NOTE: list membership is O(n*m), but byte-file items can be
    # unhashable dicts, so a set cannot be used safely here.
    return [item for item in items_file2 if item in items_file1]


def recommend():
    """
    Placeholder for the recommender entry point.

    TODO: not yet implemented.
    """
    pass


if __name__ == '__main__':
    t = TextProcessing()
    c = CosineSimilarity()
    # Load every stored people-description dictionary, skipping the '\n'
    # separator records and None placeholders.
    byte_file_path = '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_people_description'
    byte_file = FileProcessor(byte_file_path, 'train')
    items = byte_file.readByteFile()
    items = [item for item in items if item != '\n' and item != None]

    # big_list = [] # education + experience list
    # for i in items:
    #     print(i)
    #     edu = get_education(i)
    #     exp = get_experience(i)
    #     big_list.append(edu)
    #     big_list.append(exp)
    #
    # vocabSet = c.vocabSet(big_list)
    # wordVectors = [c.bag_of_words(vocabSet, i) for i in big_list]

    ratings_file_path = '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_rating_backend'
    # NOTE(review): the script appears truncated here -- ratings_file_path
    # is assigned but never used within this chunk.
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    # plt.tight_layout()
    plt.show()


def multiplePieGraph():
    """
    Show all the pie graphs at once.

    TODO: not yet implemented.
    """
    pass


if __name__ == '__main__':
    t = TextProcessing()
    # Load every stored people-description dictionary, skipping the '\n'
    # separator records and None placeholders.
    byte_file = FileProcessor(
        '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_people_description',
        'train')
    items = byte_file.readByteFile()
    items = [item for item in items if item != '\n' and item != None]


    # uni_name, degree_name = get_education_data(items)
    # experience, company_name = get_experience_data(items)
    # uni_name, degree_name = clean_data(uni_name), clean_data(degree_name)
    # experience, company_name = clean_data(experience), clean_data(company_name)

    # uni_name_count, degree_name_count, experience_count, company_name_count = count(uni_name), count(
    #     degree_name), count(experience), count(company_name)
    #
    # for i in degree_name: # split the dual degree
    #     if 'and' in i:
    # NOTE(review): this script appears truncated here -- the commented-out
    # pipeline above is cut mid-statement in this chunk.
Beispiel #9
0
def run():
    """
    Entry point: read the condition config file(s), accumulate interaction
    counts per interaction type / condition / name, then emit grouped
    tables (and optional unified tables) under output/.

    Returns:
        int: 0 on success, 1 on a configuration error.

    NOTE(review): this is Python 2 code ('print ex' statement, csv files
    opened in 'wb') -- must run under a Python 2 interpreter.
    """

    settings, args = process_command_line(None)

    try:
        config_rows = read_config_file_as_table(settings.config_file)

        if settings.unified_config_file:
            unified_config_rows = read_config_file_as_table(
                settings.unified_config_file)

    except ConfigException as ex:
        # Python 2 print statement (see docstring note).
        print ex
        return 1

    interaction_type_to_condition_files_dictionary = get_condition_files(
        config_rows)

    signif_dictionary = generate_signif_per_conditions_dictionary(config_rows)
    # max_lib_dictionary = generate_max_lib_per_condition_dictionary(config_rows)

    # Mark whether option was set
    categories_lists = {}

    # categories_lists = {DIV_OPTION_SIGNIF: signif_dictionary}
    if settings.signif:
        categories_lists[DIV_OPTION_SIGNIF] = signif_dictionary

    if settings.known_targets_file:
        known_pair_list = generate_known_pairs_list(
            settings.known_targets_file)
        categories_lists[DIV_OPTION_KNOWN_TARGETS] = known_pair_list

    if settings.no_binding_file:
        binding_pair_list = generate_binding_pairs_list(
            settings.no_binding_file)
        categories_lists[DIV_OPTION_BINDING] = binding_pair_list

    if settings.questionable_file:
        questionable_pair_list = generate_questionable_pairs_list(
            settings.questionable_file)
        categories_lists[DIV_OPTION_QUESTIONABLE] = questionable_pair_list

    # Nested dicts keyed [interaction_type][condition_name][name].
    interaction_to_condition_totals = {}
    interaction_to_condition_libs = {}
    interaction_to_condition_signif_libs = {}

    # Go over the files and make the total count of interactions per interaction type
    for interaction_type, condition_files_dictionary in interaction_type_to_condition_files_dictionary.items(
    ):

        interaction_to_condition_totals[interaction_type] = {}
        interaction_to_condition_libs[interaction_type] = {}
        interaction_to_condition_signif_libs[interaction_type] = {}

        for condition_name, file_list in condition_files_dictionary.items():

            interaction_to_condition_totals[interaction_type][
                condition_name] = {}
            interaction_to_condition_libs[interaction_type][
                condition_name] = {}
            interaction_to_condition_signif_libs[interaction_type][
                condition_name] = {}

            for file_path in file_list:

                # One pass per file: CountHandler counts reads per name,
                # MaxLibHandler tracks the maximal significant lib count.
                fp = FileProcessor(
                    file_path, {
                        "read_count": CountHandler(),
                        "signif_lib_count": MaxLibHandler()
                    })

                fp.process()

                interactions_dct = fp.row_handlers["read_count"].dct
                max_signif_lib_dct = fp.row_handlers["signif_lib_count"].dct

                # Handle total count
                for name, count in interactions_dct.items():

                    # Add the name for the first time
                    if name not in interaction_to_condition_totals[
                            interaction_type][condition_name]:
                        interaction_to_condition_totals[interaction_type][
                            condition_name][name] = 0

                    # Increase the count
                    interaction_to_condition_totals[interaction_type][
                        condition_name][name] += count

                # Handle signif libs
                for name, lib_count in max_signif_lib_dct.items():

                    # Add the name for the first time
                    if name not in interaction_to_condition_signif_libs[
                            interaction_type][condition_name]:
                        interaction_to_condition_signif_libs[interaction_type][
                            condition_name][name] = 0

                    # Set the maximal lib
                    interaction_to_condition_signif_libs[interaction_type][condition_name][name] = \
                     max(interaction_to_condition_signif_libs[interaction_type][condition_name][name], lib_count)

                # Handle total libs
                for name, lib_count in interactions_dct.items():

                    # Add the name for the first time
                    if name not in interaction_to_condition_libs[
                            interaction_type][condition_name]:
                        interaction_to_condition_libs[interaction_type][
                            condition_name][name] = 0

                    # Set the maximal lib
                    # (counts libs: +1 per file the name appears in)
                    interaction_to_condition_libs[interaction_type][
                        condition_name][name] += 1

    # Only if unified config file was inserted
    if settings.unified_config_file:

        unified_interaction_type_to_condition_files_dictionary = get_condition_files(
            unified_config_rows)

        unified_interaction_to_condition_totals = {}
        unified_interaction_to_condition_libs = {}

        for interaction_type, condition_files_dictionary in unified_interaction_type_to_condition_files_dictionary.items(
        ):

            unified_interaction_to_condition_totals[interaction_type] = {}
            unified_interaction_to_condition_libs[interaction_type] = {}

            for condition_name, file_list in condition_files_dictionary.items(
            ):

                unified_interaction_to_condition_totals[interaction_type][
                    condition_name] = {}

                for file_path in file_list:

                    # Seed the handler with the per-condition lib counts
                    # collected in the first pass above.
                    fp = FileProcessor(
                        file_path, {
                            "lib_count":
                            UnifiedLibHandler(
                                interaction_to_condition_libs[interaction_type]
                                [condition_name])
                        })

                    fp.process()

                    unified_interaction_to_condition_libs[interaction_type][condition_name] = \
                     fp.row_handlers["lib_count"].dct

    # Generate group tables per condition
    for interaction_type, condition_totals in interaction_to_condition_totals.items(
    ):

        perms = []

        # Generate all permutations for the flags
        for i in range(len(categories_lists) + 1):
            values = [True] * i + [False] * (len(categories_lists) - i)
            perms += list(set(itertools.permutations(values)))

        # Generate the different groups according to t
        for permutation in perms:

            categories_results = {
                category: permutation[i]
                for i, category in enumerate(categories_lists.keys())
            }

            generate_table(
                interaction_type, condition_totals,
                interaction_to_condition_libs[interaction_type],
                interaction_to_condition_signif_libs[interaction_type],
                categories_results, categories_lists)

    # Generate unified tables
    if settings.unified_config_file:

        for interaction_type, condition_to_libs in unified_interaction_to_condition_libs.items(
        ):

            for condition_name, names_to_libs in condition_to_libs.items():

                with open(
                        "output/unified_%s_%s.table" %
                    (interaction_type, condition_name), "wb") as fl:

                    writer = csv.writer(fl,
                                        delimiter='\t',
                                        quotechar='|',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow(["libs", "known", "unknown"])

                    libs_max = max(
                        set([int(val) for key, val in names_to_libs.items()]))

                    # One row per lib count: how many known vs unknown names
                    # have exactly that count.
                    for i in range(libs_max + 1):
                        writer.writerow([
                            i,
                            sum([
                                1 for name, val in names_to_libs.items()
                                if val == i and is_known(
                                    name,
                                    categories_lists[DIV_OPTION_KNOWN_TARGETS])
                            ]),
                            sum([
                                1 for name, val in names_to_libs.items()
                                if val == i and not is_known(
                                    name,
                                    categories_lists[DIV_OPTION_KNOWN_TARGETS])
                            ])
                        ])

    return 0
Beispiel #10
0
def run():
	"""
	Entry point: read the condition config file(s), accumulate interaction
	counts per interaction type / condition / name, then emit grouped
	tables (and optional unified tables) under output/.

	Returns:
		int: 0 on success, 1 on a configuration error.

	NOTE(review): this is Python 2 code ('print ex' statement, csv files
	opened in 'wb') -- must run under a Python 2 interpreter.
	"""

	settings, args = process_command_line(None)

	try:
		config_rows = read_config_file_as_table(settings.config_file)

		if settings.unified_config_file:
			unified_config_rows = read_config_file_as_table(settings.unified_config_file)

	except ConfigException as ex:
		# Python 2 print statement (see docstring note).
		print ex
		return 1

	interaction_type_to_condition_files_dictionary = get_condition_files(config_rows)

	signif_dictionary = generate_signif_per_conditions_dictionary(config_rows)
	# max_lib_dictionary = generate_max_lib_per_condition_dictionary(config_rows)

	# Mark whether option was set
	categories_lists = {}

	# categories_lists = {DIV_OPTION_SIGNIF: signif_dictionary}
	if settings.signif:
		categories_lists[DIV_OPTION_SIGNIF] = signif_dictionary

	if settings.known_targets_file:
		known_pair_list = generate_known_pairs_list(settings.known_targets_file)
		categories_lists[DIV_OPTION_KNOWN_TARGETS] = known_pair_list

	if settings.no_binding_file:
		binding_pair_list = generate_binding_pairs_list(settings.no_binding_file)
		categories_lists[DIV_OPTION_BINDING] = binding_pair_list

	if settings.questionable_file:
		questionable_pair_list = generate_questionable_pairs_list(settings.questionable_file)
		categories_lists[DIV_OPTION_QUESTIONABLE] = questionable_pair_list

	# Nested dicts keyed [interaction_type][condition_name][name].
	interaction_to_condition_totals = {}
	interaction_to_condition_libs = {}
	interaction_to_condition_signif_libs = {}

	# Go over the files and make the total count of interactions per interaction type
	for interaction_type, condition_files_dictionary in interaction_type_to_condition_files_dictionary.items():

		interaction_to_condition_totals[interaction_type] = {}
		interaction_to_condition_libs[interaction_type] = {}
		interaction_to_condition_signif_libs[interaction_type] = {}

		for condition_name, file_list in condition_files_dictionary.items():

			interaction_to_condition_totals[interaction_type][condition_name] = {}
			interaction_to_condition_libs[interaction_type][condition_name] = {}
			interaction_to_condition_signif_libs[interaction_type][condition_name] = {}

			for file_path in file_list:

				# One pass per file: CountHandler counts reads per name,
				# MaxLibHandler tracks the maximal significant lib count.
				fp = FileProcessor(file_path, {"read_count": CountHandler(),
											   "signif_lib_count": MaxLibHandler()})

				fp.process()

				interactions_dct = fp.row_handlers["read_count"].dct
				max_signif_lib_dct = fp.row_handlers["signif_lib_count"].dct

				# Handle total count
				for name, count in interactions_dct.items():

					# Add the name for the first time
					if name not in interaction_to_condition_totals[interaction_type][condition_name]:
						interaction_to_condition_totals[interaction_type][condition_name][name] = 0

					# Increase the count
					interaction_to_condition_totals[interaction_type][condition_name][name] += count

				# Handle signif libs
				for name, lib_count in max_signif_lib_dct.items():

					# Add the name for the first time
					if name not in interaction_to_condition_signif_libs[interaction_type][condition_name]:
						interaction_to_condition_signif_libs[interaction_type][condition_name][name] = 0

					# Set the maximal lib
					interaction_to_condition_signif_libs[interaction_type][condition_name][name] = \
						max(interaction_to_condition_signif_libs[interaction_type][condition_name][name], lib_count)

				# Handle total libs
				for name, lib_count in interactions_dct.items():

					# Add the name for the first time
					if name not in interaction_to_condition_libs[interaction_type][condition_name]:
						interaction_to_condition_libs[interaction_type][condition_name][name] = 0

					# Set the maximal lib
					# (counts libs: +1 per file the name appears in)
					interaction_to_condition_libs[interaction_type][condition_name][name] += 1


	# Only if unified config file was inserted
	if settings.unified_config_file:

		unified_interaction_type_to_condition_files_dictionary = get_condition_files(unified_config_rows)

		unified_interaction_to_condition_totals = {}
		unified_interaction_to_condition_libs = {}

		for interaction_type, condition_files_dictionary in unified_interaction_type_to_condition_files_dictionary.items():

			unified_interaction_to_condition_totals[interaction_type] = {}
			unified_interaction_to_condition_libs[interaction_type] = {}

			for condition_name, file_list in condition_files_dictionary.items():

				unified_interaction_to_condition_totals[interaction_type][condition_name] = {}

				for file_path in file_list:

					# Seed the handler with the per-condition lib counts
					# collected in the first pass above.
					fp = FileProcessor(file_path, {"lib_count": UnifiedLibHandler(interaction_to_condition_libs[interaction_type][condition_name])})

					fp.process()

					unified_interaction_to_condition_libs[interaction_type][condition_name] = \
						fp.row_handlers["lib_count"].dct

	# Generate group tables per condition
	for interaction_type, condition_totals in interaction_to_condition_totals.items():

		perms = []

		# Generate all permutations for the flags
		for i in range(len(categories_lists) + 1):
			values = [True] * i + [False] * (len(categories_lists) - i)
			perms += list(set(itertools.permutations(values)))

		# Generate the different groups according to t
		for permutation in perms:

			categories_results = {category: permutation[i] for i, category in enumerate(categories_lists.keys())}

			generate_table(interaction_type,
						   condition_totals,
						   interaction_to_condition_libs[interaction_type],
						   interaction_to_condition_signif_libs[interaction_type],
						   categories_results,
						   categories_lists)

	# Generate unified tables
	if settings.unified_config_file:

		for interaction_type, condition_to_libs in unified_interaction_to_condition_libs.items():

			for condition_name, names_to_libs in condition_to_libs.items():

				with open("output/unified_%s_%s.table" % (interaction_type, condition_name), "wb") as fl:

					writer = csv.writer(fl, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
					writer.writerow(["libs", "known", "unknown"])

					libs_max = max(set([int(val) for key, val in names_to_libs.items()]))

					# One row per lib count: how many known vs unknown names
					# have exactly that count.
					for i in range(libs_max + 1):
						writer.writerow([i,
										 sum([1 for name, val in names_to_libs.items() if val == i and is_known(name, categories_lists[DIV_OPTION_KNOWN_TARGETS])]),
										 sum([1 for name, val in names_to_libs.items() if val == i and not is_known(name, categories_lists[DIV_OPTION_KNOWN_TARGETS])])])

	return 0