def main():
    """Produce a simplified copy of every SpatialML corpus file.

    Every file in ``config.SPATIALML_CORPUS_DIR`` whose name ends with
    ``config.SPATIALML_FILE_SUFFIX`` is parsed as XML. Its LINK, RLINK and
    SIGNAL tags, and its nominal and predicative PLACE tags, are replaced by
    their contents. The result is written under the same file name in
    ``config.SPATIALML_SIMPLE_DIR``.
    """
    wanted_names = (
        name for name in os.listdir(config.SPATIALML_CORPUS_DIR)
        if name.endswith(config.SPATIALML_FILE_SUFFIX)
    )

    # PLACE filters, applied one after the other:
    #   form="NOM"         -> nominal references, e.g. 'city'
    #   predicative="true" -> e.g. 'Japanese' rather than 'Japan'
    place_filters = ({'form': 'NOM'}, {'predicative': 'true'})

    for filename in wanted_names:
        raw_xml = utilities.read_from_file(config.SPATIALML_CORPUS_DIR + filename)
        soup = BeautifulSoup(raw_xml, 'xml')

        # Collect every unneeded tag first, then replace each with its contents.
        unneeded = [
            tag
            for tag_name in ('LINK', 'RLINK', 'SIGNAL')
            for tag in soup.find_all(tag_name)
        ]
        for tag in unneeded:
            tag.unwrap()

        # The second search runs on the tree left behind by the first one.
        for place_attrs in place_filters:
            for tag in soup.find_all('PLACE', attrs=place_attrs):
                tag.unwrap()

        simplified = str(soup)
        utilities.write_to_file(config.SPATIALML_SIMPLE_DIR + filename, simplified)
def main():
    """Extract the plain text of every SpatialML corpus file.

    Every file in ``config.SPATIALML_CORPUS_DIR`` whose name ends with
    ``config.SPATIALML_FILE_SUFFIX`` is parsed as XML. Its text content is
    written under the same file name in ``config.SPATIALML_RAW_DIR``.
    """
    wanted_names = (
        name for name in os.listdir(config.SPATIALML_CORPUS_DIR)
        if name.endswith(config.SPATIALML_FILE_SUFFIX)
    )

    for filename in wanted_names:
        raw_xml = utilities.read_from_file(config.SPATIALML_CORPUS_DIR + filename)
        # get_text() drops all markup and keeps only the character data.
        plain_text = BeautifulSoup(raw_xml, "xml").get_text()
        utilities.write_to_file(config.SPATIALML_RAW_DIR + filename, plain_text)
def test_write_to_file():
    """Check that write_to_file creates the file and returns its path.

    The returned path must exist after the call and must equal the file name
    joined onto the current working directory. The file is removed again as
    part of the test.
    """
    name = 'test_write_to_file.wav'
    payload = b'test_write_to_file.wav'

    written_path = write_to_file(name, payload)
    created = os.path.exists(written_path)
    assert created

    # Clean up, and verify that the clean-up itself worked.
    os.remove(written_path)
    still_there = os.path.exists(written_path)
    assert not still_there

    expected_path = os.path.join(os.getcwd(), name)
    assert written_path == expected_path
Example #4
0
def create_train2id_100000e():
    """Load ./FB15K/test2id.txt through read_data2id_partly and save the result.

    The number of loaded entries and the first entry are printed. The loaded
    data is then written to ./FB15K/test2id_10000.txt.

    NOTE(review): the function name mentions train2id / 100000 while both
    paths say test2id / 10000 -- confirm which one is intended.
    """
    source_path = "./FB15K/test2id.txt"
    subset = read_data2id_partly(source_path)

    # Quick sanity output: size of the subset and its first entry.
    print(len(subset))
    first_entry = subset[0]
    print(first_entry)

    write_to_file('./FB15K/test2id_10000.txt', subset)
Example #5
0
def get_word_bag(train2id, relation2id, entity_description_obj):
    """Collect the vocabulary of all triple descriptions and write it to disk.

    Important: do not delete.

    For every triple the words of the head entity description, of a generated
    relation description and of the tail entity description are added to one
    vocabulary. The vocabulary always contains the placeholder ``"NULL"``.

    Args:
        train2id: sequence of triples; positions 0, 1 and 2 of each triple are
            read as head entity index, tail entity index and relation index.
        relation2id: indexable by relation index; element ``[0]`` of an entry
            is used as the relation text.
        entity_description_obj: indexable by entity index; each entry must
            provide ``symb``, ``get_random_neighbour()`` and
            ``get_entity_description()``.

    Returns:
        None. The vocabulary is written to ``./FB15K/word_bag_2.txt`` through
        ``write_to_file``. Words are unique; their order is the iteration
        order of a set and therefore unspecified.
    """
    print("get_triples_description - begin ... \n")

    # A set is grown in place instead of rebuilding list(set(...)) per triple.
    vocabulary = {"NULL"}

    for i, triple in enumerate(train2id):
        print(i)

        head_index = int(triple[0])
        tail_index = int(triple[1])
        relation_index = int(triple[2])

        head_obj = entity_description_obj[head_index]
        tail_obj = entity_description_obj[tail_index]
        rela_des = relation2id[relation_index][0]

        # Relation text: the relation itself, the two entity symbols it
        # connects, and one random neighbour of each entity.
        relation_description = (
            str(rela_des) + ', ' + 'which is between ' + head_obj.symb
            + ' and ' + tail_obj.symb + ';'
            + head_obj.get_random_neighbour() + ';'
            + tail_obj.get_random_neighbour()
        )

        # Obtain the entity and relation descriptions as word lists.
        head_description_word_list = head_obj.get_entity_description()
        relation_description_word_list = relation_text_process(
            relation_description)
        tail_description_word_list = tail_obj.get_entity_description()

        # Build the word bag. Next step (outside this function, per the
        # original note): the three word lists are turned into vectors with a
        # word model (GloVe / word2vec, see get_word2vector.get_word2vec) and
        # then encoded with an LSTM.
        vocabulary.update(head_description_word_list)
        vocabulary.update(relation_description_word_list)
        vocabulary.update(tail_description_word_list)

    write_to_file("./FB15K/word_bag_2.txt", list(vocabulary))
Example #6
0
def naive_implementation_shelve(input_file: str,
                                output_file: str = None) -> str:
    """Naive single-process implementation backed by a ``shelve`` store.

    Reads the whole input file, accumulates per-driver (distance, time)
    totals in a shelve created inside ``RunningConstants.TMP_DIR`` and hands
    the totals to ``write_to_file``.

    Parameters
    ----------
    input_file*: Path of the input file; must exist.
    output_file: Path of the report. When falsy, it is derived from the input
        path with ``generate_output_file_path(input_file, 'naive_shelve')``.

    (* - Required parameters)

    Returns
    -------
    output file location.

    Raises
    ------
    FileNotFoundError: if ``input_file`` does not exist.

    Notes
    -----
    Errors raised while parsing are printed, parsing stops, and whatever was
    accumulated up to that point is still written (best effort). The tmp
    directory is wiped before use and removed afterwards.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(input_file)

    if not output_file:
        output_file = generate_output_file_path(input_file, 'naive_shelve')
    tmp_dir = RunningConstants.TMP_DIR.value
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)
    dict_file_path = os.path.join(tmp_dir, 'shelve_naive_dict')
    activities_map = shelve.open(dict_file_path, writeback=True)

    try:
        task_start_time = time.time()

        with open(input_file, mode='r', encoding='utf-8') as file:
            with mmap.mmap(file.fileno(), length=0,
                           access=mmap.ACCESS_READ) as mmap_obj:
                try:
                    # Splitting on '\n' yields the same lines as terminating
                    # the text with '\n' and cutting at every newline.
                    for line in mmap_obj.read().decode().split('\n'):
                        if 'Driver' in line:
                            # Drop the leading "Driver " command word.
                            activities_map[line[7:]] = (0, 0)
                        elif 'Trip' in line:
                            # Drop the leading "Trip " command word.
                            (driver_name, start_time, end_time,
                             distance) = line[5:].split()
                            time_spent = get_time_spent(start_time, end_time)
                            distance = float(distance)
                            # NOTE(review): the * 3600 suggests time_spent is
                            # in seconds and the speed per hour -- confirm in
                            # get_time_spent.
                            current_trip_speed = (distance * 3600) / time_spent
                            # Trips outside the 5..100 speed band are ignored.
                            if 5 <= current_trip_speed <= 100:
                                if driver_name not in activities_map:
                                    activities_map[driver_name] = (0, 0)
                                prev_distance, prev_time = activities_map[
                                    driver_name]
                                activities_map[driver_name] = (
                                    prev_distance + distance,
                                    prev_time + time_spent)
                # Each handler reports the exception it actually caught; the
                # original referenced an unbound name here.
                except ZeroDivisionError as zde:
                    print('Time-Spent maybe zero: ' + str(time_spent))
                    print("Error: ", zde)
                except KeyError as ke:
                    print('Driver Name may not be in the central dictionary: ')
                    print("Error: ", ke)
                except IOError as ioe:
                    print('I/O Error({0}): {1}'.format(ioe.errno, ioe.strerror))
                    print("Error: ", ioe)
                except Exception as ex:
                    print("Error: ", ex)

        task_total_time = time.time() - task_start_time
        # Preparing the final dataset and completing the output file
        write_to_file(output_file, activities_map)
        log_appender({
            'input_file': input_file,
            'output_file': output_file,
            'time_taken': task_total_time
        })
    finally:
        # clean up of shelve, tmp directories -- also when reporting fails
        activities_map.close()
        shutil.rmtree(tmp_dir)
    return output_file
Example #7
0
def multiprocessing_implementation_shelve(input_file: str, output_file: str=None,
        tmp_dir:str = None, chunk_size: int = None) -> str:
    """Implementation using python's multiprocessing library and shelve

    This module is distinguished from the naive implementation in that it
    exploits the available CPU cores to read the file chunks in parallel in an
    attempt to speed up the data-wrangling tasks. And, it's distinguished from the
    multiprocessing_implementation.py file in that it uses Python's shelve over
    Counter as persistent, dynamic key-value store. For detailed analysis refer to
    the Readme.md file.

    For usage pattern and motivating examples, refer to the test folder.

    Parameters
    ----------
    input_file*: Support both relative, absolute references to the input file location.
    output_file: Support both relative, absolute references to the output file location.
        When falsy, it is derived with ``generate_output_file_path(input_file, 'mproc')``.
    tmp_dir: Working directory for the shelve. It is wiped before use and removed
        afterwards. Defaults to ``RunningConstants.TMP_DIR`` when omitted.
    chunk_size: Hyperparameter (supporting tradeoff) controlling block size read by
        each core; passed through to ``extract_chunk_info``.

    (* - Required parameters)

    Returns
    -------
    output file location (for sharing with downstream tasks, if needed).

    Raises
    ------
    FileNotFoundError: if ``input_file`` does not exist.
    """
    # Returning error should the supplied file be missing
    if not os.path.exists(input_file):
        raise FileNotFoundError(input_file)

    if not output_file:
        output_file = generate_output_file_path(input_file, 'mproc')

    # The declared default (None) used to crash in os.path.join below.
    if tmp_dir is None:
        tmp_dir = RunningConstants.TMP_DIR.value

    dict_file_path = os.path.join(tmp_dir, 'shelve_dict')
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)
    # Registering the shelve in the working directory
    chunk_activities = shelve.open(dict_file_path)
    end_flag = RunningConstants.QUEUE_END_FLAG.value

    try:
        # Making use of the existing cpu cores. Both context managers make
        # sure the helper processes go away even when a step below fails.
        with mp.Manager() as manager, mp.Pool(mp.cpu_count()) as pool:
            task_queues = manager.Queue()
            task_start_time = time.time()

            # Job Splitter: Processing chunked file-splits
            task_splits = [
                pool.apply_async(
                    master_node,
                    (chunk_start, chunk_len, input_file, task_queues))
                for chunk_start, chunk_len
                in extract_chunk_info(input_file, chunk_size)
            ]

            # Job Merge: Catching up with the yet-to-finish processes.
            # get() also re-raises any exception raised inside a worker.
            for task in task_splits:
                task.get()
            task_queues.put(end_flag)

            # Processing the returned key-values across splits
            for split_dict in iter(task_queues.get, end_flag):
                if len(split_dict):
                    for driver_name, (distance, time_spent) in split_dict.items():
                        if driver_name not in chunk_activities:
                            chunk_activities[driver_name] = (distance, time_spent)
                        else:
                            prev_distance, prev_time = chunk_activities[driver_name]
                            chunk_activities[driver_name] = (
                                prev_distance + distance,
                                prev_time + time_spent)
            task_total_time = time.time() - task_start_time

            # All work has been collected; let the workers exit cleanly.
            pool.close()
            pool.join()

        # Preparing the final dataset and completing the output file
        write_to_file(output_file, chunk_activities)
        log_appender({'input_file': input_file, 'output_file': output_file,
            'time_taken': task_total_time})
    finally:
        # clean up of shelve, tmp directories
        chunk_activities.close()
        shutil.rmtree(tmp_dir)
    return output_file
			innerf2.append(f1_score(y_v,ipred))
		#compute the average
		RFinnerscore.append(sum(innerf2)/len(innerf2))

	best_n_estimator = n_estimatorsvalues[np.argmax(RFinnerscore)]
	
	#predict the labels for the test set using the best n_estimators value
	labels_predictions =random_forest_gini(features_train,labels_train,features_test,best_n_estimator)   
	
	#calculating the performance
	rf_accuracy = accuracy_score(labels_test,labels_predictions)
	rf_f1 = f1_score(labels_test,labels_predictions)
	rf_auc_roc = roc_auc_score(labels_test,labels_predictions)
	rf_ave_preci = average_precision_score(labels_test,labels_predictions)
  
	#appending the results to the lists used for computing the average performance of the random forest
	accuracy_RF.append(rf_accuracy)
	f1_RF.append(rf_f1)
	auc_roc_RF.append(rf_auc_roc)
	
#Record per fold performance results in a file
write_to_file("rf",accuracy_RF,f1_RF,auc_roc_RF)

#compute average performance measures for the random forest (note: the banner printed below still says SVM)
print "\n##Average performance measure for SVM with a RBF kernel##\n"
print "\tAverage Accuracy: = "+ average_performance_metric(accuracy_RF)
print "\tAverage F1 Score: = "+ average_performance_metric(f1_RF)
print "\tAverage AUC ROC:  = "+ average_performance_metric(auc_roc_RF)
print

def naive_implementation(input_file: str, output_file: str = None) -> str:
    """Naive implementation
    This module implements naively the task of generating the report comprised
    of the total driven miles and the average speed. All numerical data reported
    have been rounded to the nearest integer.

    For usage pattern and motivating examples, refer to the test folder.

    Parameters
    ----------
    input_file*: Support both relative, absolute references to the input file location.
    output_file: Support both relative, absolute references to the output file location.
        When falsy, it is derived with ``generate_output_file_path(input_file, 'naive')``.

    (* - Required parameters)

    Returns
    -------
    output file location (for sharing with downstream tasks, if needed).

    Raises
    ------
    FileNotFoundError: if ``input_file`` does not exist.

    Notes
    -----
    Errors raised while parsing are printed, parsing stops, and whatever was
    accumulated up to that point is still written (best effort).
    """
    # Returning error should the supplied file be missing
    if not os.path.exists(input_file):
        raise FileNotFoundError(input_file)

    if not output_file:
        output_file = generate_output_file_path(input_file, 'naive')

    # Maps driver name -> (total distance, total time).
    activities_map = Counter()
    task_start_time = time.time()

    with open(input_file, mode='r', encoding='utf-8') as file:
        with mmap.mmap(file.fileno(), length=0,
                       access=mmap.ACCESS_READ) as mmap_obj:
            try:
                # Splitting on '\n' yields the same lines as terminating the
                # text with '\n' and cutting at every newline.
                for line in mmap_obj.read().decode().split('\n'):
                    if 'Driver' in line:
                        # Drop the leading "Driver " command word.
                        activities_map[line[7:]] = (0, 0)
                    elif 'Trip' in line:
                        # Drop the leading "Trip " command word.
                        (driver_name, start_time, end_time,
                         distance) = line[5:].split()
                        time_spent = get_time_spent(start_time, end_time)
                        distance = float(distance)
                        # NOTE(review): the * 3600 suggests time_spent is in
                        # seconds and the speed per hour -- confirm in
                        # get_time_spent.
                        current_trip_speed = (distance * 3600) / time_spent
                        # Trips outside the 5..100 speed band are ignored.
                        if 5 <= current_trip_speed <= 100:
                            if driver_name not in activities_map:
                                activities_map[driver_name] = (0, 0)
                            # Processing the key-values
                            prev_distance, prev_time = activities_map[
                                driver_name]
                            activities_map[driver_name] = (
                                prev_distance + distance,
                                prev_time + time_spent)
            # Each handler reports the exception it actually caught; the
            # original referenced an unbound name here.
            except ZeroDivisionError as zde:
                print('Time-Spent maybe zero: ' + str(time_spent))
                print("Error: ", zde)
            except KeyError as ke:
                print('Driver Name may not be in the central dictionary: ')
                print("Error: ", ke)
            except IOError as ioe:
                print('I/O Error({0}): {1}'.format(ioe.errno, ioe.strerror))
                print("Error: ", ioe)
            except Exception as ex:
                print("Error: ", ex)

    task_total_time = time.time() - task_start_time

    # Preparing the final dataset and completing the output file
    write_to_file(output_file, activities_map)
    log_appender({
        'input_file': input_file,
        'output_file': output_file,
        'time_taken': task_total_time
    })
    return output_file
	features_train = dataset[0][train_index]
	labels_train = dataset[1][train_index]
	
	features_test = dataset[0][test_index]	
	labels_test =  dataset[1][test_index]

	# Training the model, using the Gaussian Naive Bayes classifier
	labels_predictions = gaussian_naive_bayes(features_train,labels_train,features_test)   

	 # Evaluate the performance of the learned model
	gnb_Accuracy= accuracy_score(labels_test, labels_predictions)
	gnb_f1 = f1_score(labels_test,labels_predictions)
	gnb_auc_roc = roc_auc_score(labels_test,labels_predictions)
	
		
	#Store the result for averaged computation
	accuracy_GNB.append(gnb_Accuracy)
	f1_GNB.append(gnb_f1)
	auc_roc_GNB.append(gnb_auc_roc)

#Record per fold performance results in a file
write_to_file("gnb",accuracy_GNB,f1_GNB,auc_roc_GNB)

#compute Average performance measure For GaussianNaiveBayes
print "\n##Average performance measure For GaussianNaiveBayes##\n"
print "\tAverage Accuracy: = "+ average_performance_metric(accuracy_GNB)
print "\tAverage F1 Score: = "+ average_performance_metric(f1_GNB)
print "\tAverage AUC ROC:  = "+ average_performance_metric(auc_roc_GNB)
print

Example #11
0
from node import Node
import utilities
import time
import depth_first
# Driver script: read a puzzle from standard input, solve it with depth-first
# search and write the solution path to a file.

# Take puzzle in as input from user
# Split input by spaces and convert all elements to int
puzzle = [int(x) for x in input().split()]

# Root of the search, built from the puzzle the user entered.
# NOTE(review): the meaning of the remaining Node arguments (None, '0', 0) is
# defined in node.py -- presumably parent, move label and depth/cost; confirm.
root_node = Node(puzzle, None, '0', 0)

#DFS
# The reported time covers the search itself, the reconstruction of the
# solution path and writing it to puzzleDFS.txt.
start_time = time.time()
goal_node = depth_first.search(root_node)
solution_path = utilities.find_solution_path(goal_node)
utilities.write_to_file(solution_path, 'puzzleDFS.txt')
print("Time to finish DFS: {}".format(time.time() - start_time))
Example #12
0
# Sample puzzles (one per line, as typed on standard input) with notes on how
# they behaved:
# 0 2 3 4 1 6 7 8 5 9 10 11 <- solved instantaneously
# 0 2 3 4 1 5 7 8 6 9 10 11 <- solved instantaneously
# 9 0 6 1 3 10 8 11 2 5 7 4 <- solved instantaneously
# 1 0 3 7 5 2 6 4 9 10 11 8 <- puzzle from handout

# Disabled debugging output for the root node:
#print ("Is puzzle valid? "+str(utilities.valid(root_node)))
#print ("Is puzzle in goal state? "+str(utilities.goal(root_node)))
#print ("Manhattan distance: "+str(heuristics.manhattan_distance(root_node)))
#print ("SPI: "+str(heuristics.sum_of_permutation_inversion(root_node)))
#print ("Hamming distance: "+str(heuristics.hamming_distance(root_node)))
#BFS (here: best-first search via the best_first module)
# Each timed section below covers the search, the reconstruction of the
# solution path and writing it to its output file.
# NOTE(review): root_node and the best_first import are not defined in this
# part of the script -- they must be set up before this point.
#HD (Hamming-distance heuristic)
start_time = time.time()
goal_node = best_first.searchHD(root_node)
solution_path = utilities.find_solution_path(goal_node)
utilities.write_to_file(solution_path, 'puzzleBFS-h1.txt')
print("Time to finish BFS hamming distance: {}".format(time.time() -
                                                       start_time))
#MD (Manhattan-distance heuristic)
start_time = time.time()
goal_node = best_first.searchMD(root_node)
solution_path = utilities.find_solution_path(goal_node)
utilities.write_to_file(solution_path, 'puzzleBFS-h2.txt')
print("Time to finish BFS Manhattan distance: {}".format(time.time() -
                                                         start_time))
#A*
# Disabled: A* run with the SPI heuristic (see the commented-out SPI debug
# print above).
#SPI
#start_time = time.time()
#goal_node = a_star.searchSPI(root_node)
#solution_path = utilities.find_solution_path(goal_node)
#utilities.write_to_file(solution_path, 'puzzleAs-h1.txt')
		#compute the average
		innerscore.append(sum(innerf1)/len(innerf1))
	
	#pick C that give the best f1	
	bestC=C[np.argmax(innerscore)]
	print ""
	#predict the labels for the test set using best c parameter
	labels_predictions =svm_rbf(features_train,labels_train,features_test,bestC)   
	
	#calculating the performance
	svm_accuracy = accuracy_score(labels_test,labels_predictions)
	svm_f1 = f1_score(labels_test,labels_predictions)
	svm_auc_roc = roc_auc_score(labels_test,labels_predictions)
	svm_ave_preci = average_precision_score(labels_test,labels_predictions)
  
	#appending the results to the list for computing the average performance of SVM
	accuracy_SVM.append(svm_accuracy)
	f1_SVM.append(svm_f1)
	auc_roc_SVM.append(svm_auc_roc)
	
#Record per fold performance results in a file
write_to_file("svm",accuracy_SVM,f1_SVM,auc_roc_SVM)

#compute average performance measures for the SVM with an RBF kernel
print "\n##Average performance measure for SVM with a RBF kernel##\n"
print "\tAverage Accuracy: = "+ average_performance_metric(accuracy_SVM)
print "\tAverage F1 Score: = "+ average_performance_metric(f1_SVM)
print "\tAverage AUC ROC:  = "+ average_performance_metric(auc_roc_SVM)
print

def _highlight_locations(content, identified_locations):
    """Return content as HTML: identified locations in bold, newlines as <p>."""
    chars = list(content)

    # One stop position per start position; a later location with the same
    # start replaces an earlier one.
    stop_by_start = {loc.start: loc.stop for loc in identified_locations}

    # Work from the end of the text backwards so that inserting a tag never
    # shifts a position that still has to be processed.
    for start_pos in sorted(stop_by_start, reverse=True):
        # NOTE(review): the -1 offsets look like a conversion from 1-based
        # positions; a start of 0 would insert before the last character --
        # confirm the indexing used by the identification step.
        chars.insert(stop_by_start[start_pos] - 1, '</b>')
        chars.insert(start_pos - 1, '<b>')

    # replace newlines with paragraphs
    return ''.join('<p>' if ch == '\n' else ch for ch in chars)


def map_locations(url=None, file=None, display_map=False):
    """ Main logic of program, perform entire pipeline on the text indicated by the command line arguments given,
        writing each stage of the pipeline to files in the results directory.

        Args:
            url: address of an article to fetch through the Readability API; used only when file is None.
            file: path of a text file to process; takes precedence over url.
            display_map: when true, open the generated map page in a new browser tab.

        Raises:
            SystemExit: with code 1 when neither url nor file is given.
            ConnectionRefusedError: when the Stanford CoreNLP server cannot be reached.
    """

    # exit if neither url nor file given
    if url is None and file is None:
        print("A url or file must be given to read content to process from, see help (-h or --help option) for more "
              "information.")
        # Same exception the site-provided exit() raises, without depending on the site module.
        raise SystemExit(1)

    # starting message
    loc = url if file is None else file
    print("Starting map_locations for {}...".format(loc))

    # obtain the content to process
    if file is not None:
        # read content from file
        print("Reading article from file...")
        title = file
        content = utilities.read_from_file(file)
    else:
        # make request to Readability API for url
        print("Obtaining article from url...")
        readability_response = readability_interface.readability_request(url)
        title = readability_response['title']
        html_content = readability_response['content']
        content = BeautifulSoup(html_content).get_text()

    # form results directory for article
    # NOTE(review): file names are appended with plain '+', so results_dir is
    # expected to end with a path separator -- confirm in make_results_dir.
    print("Forming results directory for article...")
    results_dir = make_results_dir(title)

    # store content of article
    print("Writing article content to file...")
    content_file = results_dir + '01_content.txt'
    utilities.write_to_file(content_file, content)

    # tag file using Stanford CoreNLP server
    print("Tagging named entities in article...")
    try:
        corenlp_tagged_text = corenlp_interface.corenlp_tag_text(content)
    except ConnectionRefusedError:
        # print (most likely) reason for error, then let the error propagate
        # with its traceback; nothing below can run without the tagged text
        print("Stanford CoreNLP server must be run to tag named entities! (settings in config.py)")
        raise

    # store tagged article
    print("Writing tagged article to file...")
    corenlp_tagged_file = results_dir + '02_corenlp_tagged.xml'
    utilities.write_to_file(corenlp_tagged_file, corenlp_tagged_text)

    # disambiguate identified locations to find most likely candidate (candidates written to files in disambiguate())
    print("Disambiguating identified locations...")
    identified_locations = identification.identify(corenlp_tagged_text, results_dir)

    # form kml for identified locations
    print("Creating kml for article locations...")
    kml = kml_generation.create_kml(identified_locations)

    print("Writing kml to file...")
    relative_kml_file = '04_kml.kml'
    kml_file = results_dir + relative_kml_file
    utilities.write_to_file(kml_file, kml)

    print("Creating html files for map...")

    # map html file
    with open(config.CONTEXT_DIR + config.MAP_VIEW_TEMPLATE) as template_file:
        template = string.Template(template_file.read())
    html = template.substitute(kml_file=relative_kml_file, title=title)
    map_html_file = results_dir + '05_map_view.html'
    utilities.write_to_file(map_html_file, html)

    # article html file
    with open(config.CONTEXT_DIR + config.ARTICLE_TEMPLATE) as template_file:
        template = string.Template(template_file.read())

    # Form article content html, adding bold tags around identified locations.
    # NOTE(review): title and content are substituted into the templates
    # without HTML escaping -- consider escaping text fetched from a url.
    content_html = _highlight_locations(content, identified_locations)

    # create and save the html
    html = template.substitute(article_title=title, article_content=content_html)
    article_html_file = results_dir + '06_identified_locs.html'
    utilities.write_to_file(article_html_file, html)

    if display_map:
        print("Opening map...")
        webbrowser.open_new_tab(map_html_file)

    print("Map: file://" + map_html_file)

    print("map_locations successfully completed for {}.\n".format(loc))